avx2 specialization for gather32

This wraps up my recent interpreter optimization push. I've been informally measuring my progress by running viewer --slide GM_color_cube_rt --skvm --nojit --stats The frame cost has dropped from 13-14ms to just under 10ms; the JIT's right around 3ms. Change-Id: Ieff977ac7777dbbf15c8c3b7e40a0031fb95fe7d Reviewed-on: https://skia-review.googlesource.com/c/skia/+/317718 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2020-09-18 08:02:46 -05:00 · 2020-09-18 08:02:46 -05:00 · 2e69a13d83
commit 2e69a13d83
parent 6798e5d0d1
1 changed files with 17 additions and 1 deletions
--- a/src/opts/SkVM_opts.h
+++ b/src/opts/SkVM_opts.h
@ -37,6 +37,22 @@ static inline skvx::Vec<N,int16_t> mul_q14(const skvx::Vec<N,int16_t>& x,
                                skvx::cast<int>(y) + 0x4000)>>15 ) <<1;
 }
 template <int N>
 static inline skvx::Vec<N,int> gather32(const int* ptr, const skvx::Vec<N,int>& ix) {
 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    if constexpr (N == 8) {
        return skvx::bit_pun<skvx::Vec<N,int>>(
                _mm256_i32gather_epi32(ptr, skvx::bit_pun<__m256i>(ix), 4));
    }
 #endif
    // Try to recurse on specializations, falling back on standard scalar map()-based impl.
    if constexpr (N > 8) {
        return join(gather32(ptr, ix.lo),
                    gather32(ptr, ix.hi));
    }
    return map(ix, [&](int i) { return ptr[i]; });
 }
 namespace SK_OPTS_NS {
    inline void interpret_skvm(const skvm::InterpreterInstruction insts[], const int ninsts,
@ -180,7 +196,7 @@ namespace SK_OPTS_NS {
                    STRIDE_K(Op::gather32): {
                        const int* ptr;
                        memcpy(&ptr, (const uint8_t*)args[immy] + immz, sizeof(ptr));
-                        r[d].i32 = map(r[x].i32, [&](int ix) { return ptr[ix]; });
+                        r[d].i32 = gather32(ptr, r[x].i32);
                    } break;
                #undef STRIDE_1