avx2 specialization for gather32

This wraps up my recent interpreter optimization push.
I've been informally measuring my progress by running

    viewer --slide GM_color_cube_rt --skvm --nojit --stats

The frame cost has dropped from 13-14ms to just under 10ms;
the JIT's right around 3ms.

Change-Id: Ieff977ac7777dbbf15c8c3b7e40a0031fb95fe7d
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/317718
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
This commit is contained in:
Mike Klein 2020-09-18 08:02:46 -05:00 committed by Skia Commit-Bot
parent 6798e5d0d1
commit 2e69a13d83

View File

@ -37,6 +37,22 @@ static inline skvx::Vec<N,int16_t> mul_q14(const skvx::Vec<N,int16_t>& x,
skvx::cast<int>(y) + 0x4000)>>15 ) <<1; skvx::cast<int>(y) + 0x4000)>>15 ) <<1;
} }
// Gathers one 32-bit value per lane: lane k of the result is ptr[ix[k]].
//
//   ptr  base of the int array being indexed
//   ix   N per-lane indices into ptr, in units of int elements
//
// Strategy, chosen at compile time:
//   - N == 8 with AVX2 enabled: a single hardware gather instruction;
//   - N  > 8: split the index vector in half and recurse, so wide vectors
//             can still reach the 8-lane specialization;
//   - otherwise: a plain per-lane scalar lookup through map().
template <int N>
static inline skvx::Vec<N,int> gather32(const int* ptr, const skvx::Vec<N,int>& ix) {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    if constexpr (N == 8) {
        // Reinterpret the eight indices as a 256-bit register, gather with a
        // scale of 4 (the literal passed as the intrinsic's scale argument),
        // then reinterpret the result back as our vector type.
        const __m256i indices  = skvx::bit_pun<__m256i>(ix);
        const __m256i gathered = _mm256_i32gather_epi32(ptr, indices, 4);
        return skvx::bit_pun<skvx::Vec<N,int>>(gathered);
    }
#endif
    if constexpr (N > 8) {
        // Recurse on each half; the halves may hit a specialization above.
        const auto lower = gather32(ptr, ix.lo);
        const auto upper = gather32(ptr, ix.hi);
        return join(lower, upper);
    } else {
        // Standard scalar fallback: look each index up one lane at a time.
        return map(ix, [ptr](int index) { return ptr[index]; });
    }
}
namespace SK_OPTS_NS { namespace SK_OPTS_NS {
inline void interpret_skvm(const skvm::InterpreterInstruction insts[], const int ninsts, inline void interpret_skvm(const skvm::InterpreterInstruction insts[], const int ninsts,
@ -180,7 +196,7 @@ namespace SK_OPTS_NS {
STRIDE_K(Op::gather32): { STRIDE_K(Op::gather32): {
const int* ptr; const int* ptr;
memcpy(&ptr, (const uint8_t*)args[immy] + immz, sizeof(ptr)); memcpy(&ptr, (const uint8_t*)args[immy] + immz, sizeof(ptr));
r[d].i32 = map(r[x].i32, [&](int ix) { return ptr[ix]; }); r[d].i32 = gather32(ptr, r[x].i32);
} break; } break;
#undef STRIDE_1 #undef STRIDE_1