diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index 6dd3fcf796..24deb783e3 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp
@@ -43,7 +43,6 @@
 #include "src/opts/SkRasterPipeline_opts.h"
 #include "src/opts/SkSwizzler_opts.h"
 #include "src/opts/SkUtils_opts.h"
-#include "src/opts/SkVM_opts.h"
 #include "src/opts/SkXfermode_opts.h"
 
 namespace SkOpts {
@@ -81,8 +80,6 @@ namespace SkOpts {
     DEFINE_DEFAULT(hash_fn);
 
     DEFINE_DEFAULT(S32_alpha_D32_filter_DX);
-
-    DEFINE_DEFAULT(eval);
 #undef DEFINE_DEFAULT
 
 #define M(st) (StageFn)SK_OPTS_NS::st,
diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h
index 129541b8bd..3a4d9504d2 100644
--- a/src/core/SkOpts.h
+++ b/src/core/SkOpts.h
@@ -14,8 +14,6 @@
 
 struct SkBitmapProcState;
 
-namespace skvm { struct ProgramInstruction; }
-
 namespace SkOpts {
     // Call to replace pointers to portable functions with pointers to CPU-specific functions.
     // Thread-safe and idempotent.
@@ -76,9 +74,6 @@ namespace SkOpts {
     extern void (*start_pipeline_lowp )(size_t,size_t,size_t,size_t, void**);
 #undef M
 
-    extern void (*eval)(const skvm::ProgramInstruction[], int ninsts, int nregs, int loop,
-                        int n, void* args[], size_t strides[], int nargs);
-
 }
 
 #endif//SkOpts_DEFINED
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index 23c8e05813..ee58fa4416 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -10,7 +10,6 @@
 #include "include/private/SkThreadID.h"
 #include "include/private/SkVx.h"
 #include "src/core/SkCpu.h"
-#include "src/core/SkOpts.h"
 #include "src/core/SkVM.h"
 #include <string.h>
 #if defined(SKVM_JIT)
@@ -1240,7 +1239,16 @@ namespace skvm {
         static SkSpinlock dump_lock;
         SkAutoSpinlock lock(dump_lock);
 
-        uint32_t hash = SkOpts::hash(fJIT.buf, fJIT.size);
+        auto fnv1a = [](const void* vbuf, size_t n) {
+            uint32_t hash = 2166136261;
+            for (auto buf = (const uint8_t*)vbuf; n --> 0; buf++) {
+                hash ^= *buf;
+                hash *= 16777619;
+            }
+            return hash;
+        };
+
+        uint32_t hash = fnv1a(fJIT.buf, fJIT.size);
         SkString name = SkStringPrintf("skvm-jit-%u", hash);
 
         // Create a jit-<pid>.dump file that we can `perf inject -j` into a
@@ -1342,9 +1350,157 @@ namespace skvm {
                 SkASSERT(arg == args + nargs);
             }
         }
+
         if (n) {
-            SkOpts::eval(fInstructions.data(), (int)fInstructions.size(), fRegs, fLoop,
-                         n, args, strides, nargs);
+            // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
+            constexpr int K = 16;
+            using I32 = skvx::Vec<K, int>;
+            using F32 = skvx::Vec<K, float>;
+            using U32 = skvx::Vec<K, uint32_t>;
+            using U8  = skvx::Vec<K, uint8_t>;
+
+            using I16x2 = skvx::Vec<2*K, int16_t>;
+            using U16x2 = skvx::Vec<2*K, uint16_t>;
+
+            union Slot {
+                I32 i32;
+                U32 u32;
+                F32 f32;
+            };
+
+            Slot                     few_regs[16];
+            std::unique_ptr<char[]>  many_regs;
+
+            Slot* regs = few_regs;
+
+            if (fRegs > (int)SK_ARRAY_COUNT(few_regs)) {
+                // Annoyingly we can't trust that malloc() or new will work with Slot because
+                // the skvx::Vec types may have alignment greater than what they provide.
+                // We'll overallocate one extra register so we can align manually.
+                many_regs.reset(new char[ sizeof(Slot) * (fRegs + 1) ]);
+
+                uintptr_t addr = (uintptr_t)many_regs.get();
+                addr += alignof(Slot) -
+                         (addr & (alignof(Slot) - 1));
+                SkASSERT((addr & (alignof(Slot) - 1)) == 0);
+                regs = (Slot*)addr;
+            }
+
+
+            auto r = [&](Reg id) -> Slot& {
+                SkASSERT(0 <= id && id < fRegs);
+                return regs[id];
+            };
+            auto arg = [&](int ix) {
+                SkASSERT(0 <= ix && ix < nargs);
+                return args[ix];
+            };
+
+            // Step each argument pointer ahead by its stride a number of times.
+            auto step_args = [&](int times) {
+                // Looping by marching pointers until *arg == nullptr helps the
+                // compiler to keep this loop scalar.  Otherwise it'd create a
+                // rather large and useless autovectorized version.
+                void**        arg    = args;
+                const size_t* stride = strides;
+                for (; *arg; arg++, stride++) {
+                    *arg = (void*)( (char*)*arg + times * *stride );
+                }
+                SkASSERT(arg == args + nargs);
+            };
+
+            int start = 0,
+                stride;
+            for ( ; n > 0; start = fLoop, n -= stride, step_args(stride)) {
+                stride = n >= K ? K : 1;
+
+                for (int i = start; i < (int)fInstructions.size(); i++) {
+                    Instruction inst = fInstructions[i];
+
+                    // d = op(x,y,z/imm)
+                    Reg   d = inst.d,
+                          x = inst.x,
+                          y = inst.y,
+                          z = inst.z;
+                    int imm = inst.imm;
+
+                    // Ops that interact with memory need to know whether we're stride=1 or K,
+                    // but all non-memory ops can run the same code no matter the stride.
+                    switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
+
+                    #define STRIDE_1(op) case 2*(int)op
+                    #define STRIDE_K(op) case 2*(int)op + 1
+                        STRIDE_1(Op::store8 ): memcpy(arg(imm), &r(x).i32, 1); break;
+                        STRIDE_1(Op::store32): memcpy(arg(imm), &r(x).i32, 4); break;
+
+                        STRIDE_K(Op::store8 ): skvx::cast<uint8_t>(r(x).i32).store(arg(imm)); break;
+                        STRIDE_K(Op::store32):                    (r(x).i32).store(arg(imm)); break;
+
+                        STRIDE_1(Op::load8 ): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 1); break;
+                        STRIDE_1(Op::load32): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 4); break;
+
+                        STRIDE_K(Op::load8 ): r(d).i32 = skvx::cast<int>(U8 ::Load(arg(imm))); break;
+                        STRIDE_K(Op::load32): r(d).i32 =                 I32::Load(arg(imm)) ; break;
+                    #undef STRIDE_1
+                    #undef STRIDE_K
+
+                    // Ops that don't interact with memory should never care about the stride.
+                    #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1
+                        CASE(Op::splat): r(d).i32 = imm; break;
+
+                        CASE(Op::add_f32): r(d).f32 = r(x).f32 + r(y).f32; break;
+                        CASE(Op::sub_f32): r(d).f32 = r(x).f32 - r(y).f32; break;
+                        CASE(Op::mul_f32): r(d).f32 = r(x).f32 * r(y).f32; break;
+                        CASE(Op::div_f32): r(d).f32 = r(x).f32 / r(y).f32; break;
+
+                        CASE(Op::mad_f32): r(d).f32 = r(x).f32 * r(y).f32 + r(z).f32; break;
+
+                        CASE(Op::add_i32): r(d).i32 = r(x).i32 + r(y).i32; break;
+                        CASE(Op::sub_i32): r(d).i32 = r(x).i32 - r(y).i32; break;
+                        CASE(Op::mul_i32): r(d).i32 = r(x).i32 * r(y).i32; break;
+
+                        CASE(Op::sub_i16x2):
+                            r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<I16x2>(r(x).i32) -
+                                                          skvx::bit_pun<I16x2>(r(y).i32) ); break;
+                        CASE(Op::mul_i16x2):
+                            r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<I16x2>(r(x).i32) *
+                                                          skvx::bit_pun<I16x2>(r(y).i32) ); break;
+                        CASE(Op::shr_i16x2):
+                            r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<U16x2>(r(x).i32) >> imm);
+                            break;
+
+                        CASE(Op::bit_and):   r(d).i32 = r(x).i32 &  r(y).i32; break;
+                        CASE(Op::bit_or ):   r(d).i32 = r(x).i32 |  r(y).i32; break;
+                        CASE(Op::bit_xor):   r(d).i32 = r(x).i32 ^  r(y).i32; break;
+                        CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break;
+
+                        CASE(Op::shl): r(d).i32 = r(x).i32 << imm; break;
+                        CASE(Op::sra): r(d).i32 = r(x).i32 >> imm; break;
+                        CASE(Op::shr): r(d).u32 = r(x).u32 >> imm; break;
+
+                        CASE(Op::extract): r(d).u32 = (r(x).u32 >> imm) & r(y).u32; break;
+                        CASE(Op::pack):    r(d).u32 = r(x).u32 | (r(y).u32 << imm); break;
+
+                        CASE(Op::bytes): {
+                            const U32 table[] = {
+                                0,
+                                (r(x).u32      ) & 0xff,
+                                (r(x).u32 >>  8) & 0xff,
+                                (r(x).u32 >> 16) & 0xff,
+                                (r(x).u32 >> 24) & 0xff,
+                            };
+                            r(d).u32 = table[(imm >>  0) & 0xf] <<  0
+                                     | table[(imm >>  4) & 0xf] <<  8
+                                     | table[(imm >>  8) & 0xf] << 16
+                                     | table[(imm >> 12) & 0xf] << 24;
+                        } break;
+
+                        CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
+                        CASE(Op::to_i32): r(d).i32 = skvx::cast<int>  (r(x).f32); break;
+                    #undef CASE
+                    }
+                }
+            }
         }
     }
 }
diff --git a/src/core/SkVM.h b/src/core/SkVM.h
index 203c1dfb58..8e436253e6 100644
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@@ -192,16 +192,13 @@ namespace skvm {
 
     using Reg = int;
 
-    struct ProgramInstruction {   // d = op(x, y, z/imm)
-        Op op;
-        Reg d,x,y;
-        union { Reg z; int imm; };
-    };
-
     class Program {
     public:
-        // Moved outside Program so it can be forward-declared.
-        using Instruction = ProgramInstruction;
+        struct Instruction {   // d = op(x, y, z/imm)
+            Op op;
+            Reg d,x,y;
+            union { Reg z; int imm; };
+        };
 
         Program(std::vector<Instruction>, int regs, int loop);
         Program() : Program({}, 0, 0) {}
diff --git a/src/opts/SkOpts_hsw.cpp b/src/opts/SkOpts_hsw.cpp
index 2a2d814861..fae5c858cd 100644
--- a/src/opts/SkOpts_hsw.cpp
+++ b/src/opts/SkOpts_hsw.cpp
@@ -11,7 +11,6 @@
 #include "src/opts/SkBlitRow_opts.h"
 #include "src/opts/SkRasterPipeline_opts.h"
 #include "src/opts/SkUtils_opts.h"
-#include "src/opts/SkVM_opts.h"
 
 namespace SkOpts {
     void Init_hsw() {
@@ -29,7 +28,5 @@ namespace SkOpts {
         just_return_lowp    = (StageFn)SK_OPTS_NS::lowp::just_return;
         start_pipeline_lowp = SK_OPTS_NS::lowp::start_pipeline;
 #undef M
-
-        eval = hsw::eval;
     }
 }
diff --git a/src/opts/SkVM_opts.h b/src/opts/SkVM_opts.h
deleted file mode 100644
index 0655960a2c..0000000000
--- a/src/opts/SkVM_opts.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Copyright 2019 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#ifndef SkVM_opts_DEFINED
-#define SkVM_opts_DEFINED
-
-#include "src/core/SkVM.h"
-
-#include "include/private/SkVx.h"
-
-namespace SK_OPTS_NS {
-
-    inline void eval(const skvm::Program::Instruction insts[], const int ninsts,
-                     const int nregs, const int loop,
-                     int n, void* args[], size_t strides[], const int nargs) {
-        using namespace skvm;
-
-        // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
-    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
-        constexpr int K = 32;
-    #else
-        constexpr int K = 16;
-    #endif
-        using I32 = skvx::Vec<K, int>;
-        using F32 = skvx::Vec<K, float>;
-        using U32 = skvx::Vec<K, uint32_t>;
-        using U8  = skvx::Vec<K, uint8_t>;
-
-        using I16x2 = skvx::Vec<2*K, int16_t>;
-        using U16x2 = skvx::Vec<2*K, uint16_t>;
-
-        union Slot {
-            I32 i32;
-            U32 u32;
-            F32 f32;
-        };
-
-        Slot                     few_regs[16];
-        std::unique_ptr<char[]>  many_regs;
-
-        Slot* regs = few_regs;
-
-        if (nregs > (int)SK_ARRAY_COUNT(few_regs)) {
-            // Annoyingly we can't trust that malloc() or new will work with Slot because
-            // the skvx::Vec types may have alignment greater than what they provide.
-            // We'll overallocate one extra register so we can align manually.
-            many_regs.reset(new char[ sizeof(Slot) * (nregs + 1) ]);
-
-            uintptr_t addr = (uintptr_t)many_regs.get();
-            addr += alignof(Slot) -
-                     (addr & (alignof(Slot) - 1));
-            SkASSERT((addr & (alignof(Slot) - 1)) == 0);
-            regs = (Slot*)addr;
-        }
-
-
-        auto r = [&](Reg id) -> Slot& {
-            SkASSERT(0 <= id && id < nregs);
-            return regs[id];
-        };
-        auto arg = [&](int ix) {
-            SkASSERT(0 <= ix && ix < nargs);
-            return args[ix];
-        };
-
-        // Step each argument pointer ahead by its stride a number of times.
-        auto step_args = [&](int times) {
-            // Looping by marching pointers until *arg == nullptr helps the
-            // compiler to keep this loop scalar.  Otherwise it'd create a
-            // rather large and useless autovectorized version.
-            void**        arg    = args;
-            const size_t* stride = strides;
-            for (; *arg; arg++, stride++) {
-                *arg = (void*)( (char*)*arg + times * *stride );
-            }
-            SkASSERT(arg == args + nargs);
-        };
-
-        int start = 0,
-            stride;
-        for ( ; n > 0; start = loop, n -= stride, step_args(stride)) {
-            stride = n >= K ? K : 1;
-
-            for (int i = start; i < ninsts; i++) {
-                skvm::Program::Instruction inst = insts[i];
-
-                // d = op(x,y,z/imm)
-                Reg   d = inst.d,
-                      x = inst.x,
-                      y = inst.y,
-                      z = inst.z;
-                int imm = inst.imm;
-
-                // Ops that interact with memory need to know whether we're stride=1 or stride=K,
-                // but all non-memory ops can run the same code no matter the stride.
-                switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
-
-                #define STRIDE_1(op) case 2*(int)op
-                #define STRIDE_K(op) case 2*(int)op + 1
-                    STRIDE_1(Op::store8 ): memcpy(arg(imm), &r(x).i32, 1); break;
-                    STRIDE_1(Op::store32): memcpy(arg(imm), &r(x).i32, 4); break;
-
-                    STRIDE_K(Op::store8 ): skvx::cast<uint8_t>(r(x).i32).store(arg(imm)); break;
-                    STRIDE_K(Op::store32):                    (r(x).i32).store(arg(imm)); break;
-
-                    STRIDE_1(Op::load8 ): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 1); break;
-                    STRIDE_1(Op::load32): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 4); break;
-
-                    STRIDE_K(Op::load8 ): r(d).i32 = skvx::cast<int>(U8 ::Load(arg(imm))); break;
-                    STRIDE_K(Op::load32): r(d).i32 =                 I32::Load(arg(imm)) ; break;
-                #undef STRIDE_1
-                #undef STRIDE_K
-
-                // Ops that don't interact with memory should never care about the stride.
-                #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1
-                    CASE(Op::splat): r(d).i32 = imm; break;
-
-                    CASE(Op::add_f32): r(d).f32 = r(x).f32 + r(y).f32; break;
-                    CASE(Op::sub_f32): r(d).f32 = r(x).f32 - r(y).f32; break;
-                    CASE(Op::mul_f32): r(d).f32 = r(x).f32 * r(y).f32; break;
-                    CASE(Op::div_f32): r(d).f32 = r(x).f32 / r(y).f32; break;
-
-                    CASE(Op::mad_f32): r(d).f32 = r(x).f32 * r(y).f32 + r(z).f32; break;
-
-                    CASE(Op::add_i32): r(d).i32 = r(x).i32 + r(y).i32; break;
-                    CASE(Op::sub_i32): r(d).i32 = r(x).i32 - r(y).i32; break;
-                    CASE(Op::mul_i32): r(d).i32 = r(x).i32 * r(y).i32; break;
-
-                    CASE(Op::sub_i16x2):
-                        r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<I16x2>(r(x).i32) -
-                                                      skvx::bit_pun<I16x2>(r(y).i32) ); break;
-                    CASE(Op::mul_i16x2):
-                        r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<I16x2>(r(x).i32) *
-                                                      skvx::bit_pun<I16x2>(r(y).i32) ); break;
-                    CASE(Op::shr_i16x2):
-                        r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<U16x2>(r(x).i32) >> imm);
-                        break;
-
-                    CASE(Op::bit_and):   r(d).i32 = r(x).i32 &  r(y).i32; break;
-                    CASE(Op::bit_or ):   r(d).i32 = r(x).i32 |  r(y).i32; break;
-                    CASE(Op::bit_xor):   r(d).i32 = r(x).i32 ^  r(y).i32; break;
-                    CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break;
-
-                    CASE(Op::shl): r(d).i32 = r(x).i32 << imm; break;
-                    CASE(Op::sra): r(d).i32 = r(x).i32 >> imm; break;
-                    CASE(Op::shr): r(d).u32 = r(x).u32 >> imm; break;
-
-                    CASE(Op::extract): r(d).u32 = (r(x).u32 >> imm) & r(y).u32; break;
-                    CASE(Op::pack):    r(d).u32 = r(x).u32 | (r(y).u32 << imm); break;
-
-                    CASE(Op::bytes): {
-                        const U32 table[] = {
-                            0,
-                            (r(x).u32      ) & 0xff,
-                            (r(x).u32 >>  8) & 0xff,
-                            (r(x).u32 >> 16) & 0xff,
-                            (r(x).u32 >> 24) & 0xff,
-                        };
-                        r(d).u32 = table[(imm >>  0) & 0xf] <<  0
-                                 | table[(imm >>  4) & 0xf] <<  8
-                                 | table[(imm >>  8) & 0xf] << 16
-                                 | table[(imm >> 12) & 0xf] << 24;
-                    } break;
-
-                    CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
-                    CASE(Op::to_i32): r(d).i32 = skvx::cast<int>  (r(x).f32); break;
-                #undef CASE
-                }
-            }
-        }
-    }
-
-} // namespace SK_OPTS_NS
-
-#endif//SkVM_opts_DEFINED
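
Two self-contained C++ sketches follow for readers of the patch; identifiers like Op, dispatch, and main here are illustrative stand-ins, not the Skia API.

First, the interpreter keys its switch on 2*(int)inst.op + (stride == K ? 1 : 0), so every opcode owns two adjacent case labels: the even label is the scalar (stride=1) tail, the odd label is the K-wide body, and non-memory ops claim both labels with a single body. A minimal model of that encoding:

    // Toy model of the case-label fusion: even label = scalar tail, odd = K-wide.
    #include <cstdio>

    enum class Op { load, add };   // stand-in for skvm::Op
    constexpr int K = 16;          // vector width, as in the interpreter

    void dispatch(Op op, int stride) {
        switch (2*(int)op + (stride == K ? 1 : 0)) {
        #define STRIDE_1(op) case 2*(int)op
        #define STRIDE_K(op) case 2*(int)op + 1
            STRIDE_1(Op::load): printf("load: scalar, one lane\n");    break;
            STRIDE_K(Op::load): printf("load: vector, %d lanes\n", K); break;
        #undef STRIDE_1
        #undef STRIDE_K

        // Non-memory ops run the same body at either stride.
        #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1
            CASE(Op::add): printf("add: stride-independent\n"); break;
        #undef CASE
        }
    }

    int main() {
        dispatch(Op::load, K);  // hits STRIDE_K(Op::load)
        dispatch(Op::load, 1);  // hits STRIDE_1(Op::load)
        dispatch(Op::add,  1);  // hits the shared CASE(Op::add)
    }

Second, the fnv1a lambda added to SkVM.cpp uses the standard 32-bit FNV-1a constants, offset basis 2166136261 and prime 16777619, so the jit-<pid>.dump symbol names stay stable without depending on SkOpts::hash. A quick check against the published FNV-1a test vector for "a":

    // Standalone copy of the patch's fnv1a lambda plus a known-answer test.
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    static uint32_t fnv1a(const void* vbuf, size_t n) {
        uint32_t hash = 2166136261;
        for (auto buf = (const uint8_t*)vbuf; n --> 0; buf++) {
            hash ^= *buf;      // xor in one byte...
            hash *= 16777619;  // ...then multiply by the FNV prime.
        }
        return hash;
    }

    int main() {
        printf("%08x\n", (unsigned)fnv1a("a", 1));  // prints e40c292c
    }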