move SkVM interpreter back to SkVM.cpp

No real pressing reason to have an AVX2-specialized interpreter now that
we've got an AVX2 JIT.  Keeping things centralized makes it easier to
keep track of and helps reduce dependencies on the rest of Skia, which
in turn makes it easier to develop SkVM on wimpy machines like an RPi.

Change-Id: Ic4729603ec6c9141929b3e2a56ba380240ea5e0d
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/224822
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
Mike Klein 2019-07-01 11:45:13 -05:00 committed by Skia Commit-Bot
parent 1d43530aa2
commit 1e87a5410e
6 changed files with 165 additions and 202 deletions

View File

@@ -43,7 +43,6 @@
#include "src/opts/SkRasterPipeline_opts.h"
#include "src/opts/SkSwizzler_opts.h"
#include "src/opts/SkUtils_opts.h"
#include "src/opts/SkVM_opts.h"
#include "src/opts/SkXfermode_opts.h"
namespace SkOpts {
@@ -81,8 +80,6 @@ namespace SkOpts {
DEFINE_DEFAULT(hash_fn);
DEFINE_DEFAULT(S32_alpha_D32_filter_DX);
DEFINE_DEFAULT(eval);
#undef DEFINE_DEFAULT
#define M(st) (StageFn)SK_OPTS_NS::st,
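
This hunk and the Init_hsw() change further down are two halves of the same CPU-dispatch pattern, which eval no longer needs to participate in. A minimal sketch of that pattern, with made-up names rather than Skia's actual declarations:

    namespace sketch {
        namespace portable { inline int eval(int x) { return x + 1; } }  // baseline build
        namespace hsw      { inline int eval(int x) { return x + 1; } }  // same source, compiled for AVX2

        // DEFINE_DEFAULT-style: the pointer starts out at the portable build...
        int (*eval)(int) = portable::eval;

        // ...and Init_hsw() rebinds it at runtime when the CPU supports AVX2.
        void Init_hsw() { eval = hsw::eval; }
    }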

View File

@@ -14,8 +14,6 @@
struct SkBitmapProcState;
namespace skvm { struct ProgramInstruction; }
namespace SkOpts {
// Call to replace pointers to portable functions with pointers to CPU-specific functions.
// Thread-safe and idempotent.
@@ -76,9 +74,6 @@ namespace SkOpts {
extern void (*start_pipeline_lowp )(size_t,size_t,size_t,size_t, void**);
#undef M
extern void (*eval)(const skvm::ProgramInstruction[], int ninsts, int nregs, int loop,
int n, void* args[], size_t strides[], int nargs);
}
#endif//SkOpts_DEFINED
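
The removed eval hook's tail parameters (n, args, strides, nargs) describe a program's varying arguments: one data pointer and one byte-stride per argument. Judging from the interpreter's step_args loop later in this change, which marches until it hits a null pointer, args is also expected to end with a nullptr sentinel. A hypothetical caller, with made-up buffers, might look like:

    uint8_t  src[1024];                         // argument 0: 1 byte per lane (load8/store8)
    uint32_t dst[1024];                         // argument 1: 4 bytes per lane (load32/store32)
    void*  args[]    = { src, dst, nullptr };   // nullptr-terminated, as step_args() expects
    size_t strides[] = { 1, 4 };                // bytes to advance each pointer per lane
    int    nargs     = 2,
           n         = 1024;                    // number of lanes to process
    // eval(insts, ninsts, nregs, loop, n, args, strides, nargs);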

View File

@@ -10,7 +10,6 @@
#include "include/private/SkThreadID.h"
#include "include/private/SkVx.h"
#include "src/core/SkCpu.h"
#include "src/core/SkOpts.h"
#include "src/core/SkVM.h"
#include <string.h>
#if defined(SKVM_JIT)
@@ -1240,7 +1239,16 @@ namespace skvm {
static SkSpinlock dump_lock;
SkAutoSpinlock lock(dump_lock);
uint32_t hash = SkOpts::hash(fJIT.buf, fJIT.size);
auto fnv1a = [](const void* vbuf, size_t n) {
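// FNV-1a, 32-bit flavor: 2166136261 is the offset basis, 16777619 the FNV prime.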
uint32_t hash = 2166136261;
for (auto buf = (const uint8_t*)vbuf; n --> 0; buf++) {
hash ^= *buf;
hash *= 16777619;
}
return hash;
};
uint32_t hash = fnv1a(fJIT.buf, fJIT.size);
SkString name = SkStringPrintf("skvm-jit-%u", hash);
// Create a jit-<pid>.dump file that we can `perf inject -j` into a
@@ -1342,9 +1350,157 @@ namespace skvm {
SkASSERT(arg == args + nargs);
}
}
if (n) {
SkOpts::eval(fInstructions.data(), (int)fInstructions.size(), fRegs, fLoop,
n, args, strides, nargs);
// We'll operate in SIMT style, knocking off K-size chunks from n while possible.
constexpr int K = 16;
using I32 = skvx::Vec<K, int>;
using F32 = skvx::Vec<K, float>;
using U32 = skvx::Vec<K, uint32_t>;
using U8 = skvx::Vec<K, uint8_t>;
using I16x2 = skvx::Vec<2*K, int16_t>;
using U16x2 = skvx::Vec<2*K, uint16_t>;
union Slot {
I32 i32;
U32 u32;
F32 f32;
};
Slot few_regs[16];
std::unique_ptr<char[]> many_regs;
Slot* regs = few_regs;
if (fRegs > (int)SK_ARRAY_COUNT(few_regs)) {
// Annoyingly we can't trust that malloc() or new will work with Slot because
// the skvx::Vec types may have alignment greater than what they provide.
// We'll overallocate one extra register so we can align manually.
many_regs.reset(new char[ sizeof(Slot) * (fRegs + 1) ]);
uintptr_t addr = (uintptr_t)many_regs.get();
addr += alignof(Slot) -
(addr & (alignof(Slot) - 1));
SkASSERT((addr & (alignof(Slot) - 1)) == 0);
regs = (Slot*)addr;
}
auto r = [&](Reg id) -> Slot& {
SkASSERT(0 <= id && id < fRegs);
return regs[id];
};
auto arg = [&](int ix) {
SkASSERT(0 <= ix && ix < nargs);
return args[ix];
};
// Step each argument pointer ahead by its stride a number of times.
auto step_args = [&](int times) {
// Looping by marching pointers until *arg == nullptr helps the
// compiler to keep this loop scalar. Otherwise it'd create a
// rather large and useless autovectorized version.
void** arg = args;
const size_t* stride = strides;
for (; *arg; arg++, stride++) {
*arg = (void*)( (char*)*arg + times * *stride );
}
SkASSERT(arg == args + nargs);
};
int start = 0,
stride;
for ( ; n > 0; start = fLoop, n -= stride, step_args(stride)) {
stride = n >= K ? K : 1;
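// e.g. n=37, K=16: two 16-lane passes, then five 1-lane passes. After the first
// pass start jumps to fLoop, so instructions before fLoop run only once.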
for (int i = start; i < (int)fInstructions.size(); i++) {
Instruction inst = fInstructions[i];
// d = op(x,y,z/imm)
Reg d = inst.d,
x = inst.x,
y = inst.y,
z = inst.z;
int imm = inst.imm;
// Ops that interact with memory need to know whether we're stride=1 or K,
// but all non-memory ops can run the same code no matter the stride.
switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
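// Each op owns two consecutive case labels: 2*op for stride==1, 2*op+1 for stride==K.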
#define STRIDE_1(op) case 2*(int)op
#define STRIDE_K(op) case 2*(int)op + 1
STRIDE_1(Op::store8 ): memcpy(arg(imm), &r(x).i32, 1); break;
STRIDE_1(Op::store32): memcpy(arg(imm), &r(x).i32, 4); break;
STRIDE_K(Op::store8 ): skvx::cast<uint8_t>(r(x).i32).store(arg(imm)); break;
STRIDE_K(Op::store32): (r(x).i32).store(arg(imm)); break;
STRIDE_1(Op::load8 ): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 1); break;
STRIDE_1(Op::load32): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 4); break;
STRIDE_K(Op::load8 ): r(d).i32 = skvx::cast<int>(U8 ::Load(arg(imm))); break;
STRIDE_K(Op::load32): r(d).i32 = I32::Load(arg(imm)) ; break;
#undef STRIDE_1
#undef STRIDE_K
// Ops that don't interact with memory should never care about the stride.
#define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1
CASE(Op::splat): r(d).i32 = imm; break;
CASE(Op::add_f32): r(d).f32 = r(x).f32 + r(y).f32; break;
CASE(Op::sub_f32): r(d).f32 = r(x).f32 - r(y).f32; break;
CASE(Op::mul_f32): r(d).f32 = r(x).f32 * r(y).f32; break;
CASE(Op::div_f32): r(d).f32 = r(x).f32 / r(y).f32; break;
CASE(Op::mad_f32): r(d).f32 = r(x).f32 * r(y).f32 + r(z).f32; break;
CASE(Op::add_i32): r(d).i32 = r(x).i32 + r(y).i32; break;
CASE(Op::sub_i32): r(d).i32 = r(x).i32 - r(y).i32; break;
CASE(Op::mul_i32): r(d).i32 = r(x).i32 * r(y).i32; break;
CASE(Op::sub_i16x2):
r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<I16x2>(r(x).i32) -
skvx::bit_pun<I16x2>(r(y).i32) ); break;
CASE(Op::mul_i16x2):
r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<I16x2>(r(x).i32) *
skvx::bit_pun<I16x2>(r(y).i32) ); break;
CASE(Op::shr_i16x2):
r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<U16x2>(r(x).i32) >> imm);
break;
CASE(Op::bit_and): r(d).i32 = r(x).i32 & r(y).i32; break;
CASE(Op::bit_or ): r(d).i32 = r(x).i32 | r(y).i32; break;
CASE(Op::bit_xor): r(d).i32 = r(x).i32 ^ r(y).i32; break;
CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break;
CASE(Op::shl): r(d).i32 = r(x).i32 << imm; break;
CASE(Op::sra): r(d).i32 = r(x).i32 >> imm; break;
CASE(Op::shr): r(d).u32 = r(x).u32 >> imm; break;
CASE(Op::extract): r(d).u32 = (r(x).u32 >> imm) & r(y).u32; break;
CASE(Op::pack): r(d).u32 = r(x).u32 | (r(y).u32 << imm); break;
CASE(Op::bytes): {
const U32 table[] = {
0,
(r(x).u32 ) & 0xff,
(r(x).u32 >> 8) & 0xff,
(r(x).u32 >> 16) & 0xff,
(r(x).u32 >> 24) & 0xff,
};
r(d).u32 = table[(imm >> 0) & 0xf] << 0
| table[(imm >> 4) & 0xf] << 8
| table[(imm >> 8) & 0xf] << 16
| table[(imm >> 12) & 0xf] << 24;
} break;
CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
CASE(Op::to_i32): r(d).i32 = skvx::cast<int> (r(x).f32); break;
#undef CASE
}
}
}
}
}
}
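
The bytes op above is a byte shuffle driven by imm's nibbles: nibble k picks which source byte (1-4, or 0 for a literal zero) lands in destination byte k. A scalar model of the same table lookup, plus a couple of worked immediates (illustrative only; the real op runs across all K lanes at once):

    #include <cstdint>

    // Scalar stand-in for Op::bytes.
    inline uint32_t bytes_op(uint32_t x, int imm) {
        const uint32_t table[] = {
            0,
            (x >>  0) & 0xff,
            (x >>  8) & 0xff,
            (x >> 16) & 0xff,
            (x >> 24) & 0xff,
        };
        return table[(imm >>  0) & 0xf] <<  0
             | table[(imm >>  4) & 0xf] <<  8
             | table[(imm >>  8) & 0xf] << 16
             | table[(imm >> 12) & 0xf] << 24;
    }

    // bytes_op(x, 0x4321) == x                    -- each nibble selects its own byte
    // bytes_op(x, 0x4123) swaps bytes 0 and 2     -- e.g. R<->B in a packed RGBA pixel
    // bytes_op(x, 0x0321) zeroes the top byte     -- nibble 0 selects the constant 0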

View File

@@ -192,17 +192,14 @@ namespace skvm {
using Reg = int;
struct ProgramInstruction { // d = op(x, y, z/imm)
class Program {
public:
struct Instruction { // d = op(x, y, z/imm)
Op op;
Reg d,x,y;
union { Reg z; int imm; };
};
class Program {
public:
// Moved outside Program so it can be forward-declared.
using Instruction = ProgramInstruction;
Program(std::vector<Instruction>, int regs, int loop);
Program() : Program({}, 0, 0) {}
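
Because Instruction's last field is a union, the same slot reads as register z for three-register ops like mad_f32 and as imm for ops like splat or shl. Two hand-built instructions, illustrative only and not taken from Skia's sources:

    skvm::Program::Instruction splat42 = { skvm::Op::splat,   /*d=*/1, /*x=*/0, /*y=*/0, {/*imm=*/42} };  // r1 = 42 in every lane
    skvm::Program::Instruction mad     = { skvm::Op::mad_f32, /*d=*/3, /*x=*/0, /*y=*/1, {/*z=*/2}    };  // r3 = r0*r1 + r2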

View File

@@ -11,7 +11,6 @@
#include "src/opts/SkBlitRow_opts.h"
#include "src/opts/SkRasterPipeline_opts.h"
#include "src/opts/SkUtils_opts.h"
#include "src/opts/SkVM_opts.h"
namespace SkOpts {
void Init_hsw() {
@@ -29,7 +28,5 @@ namespace SkOpts {
just_return_lowp = (StageFn)SK_OPTS_NS::lowp::just_return;
start_pipeline_lowp = SK_OPTS_NS::lowp::start_pipeline;
#undef M
eval = hsw::eval;
}
}

View File

@@ -1,179 +0,0 @@
/*
* Copyright 2019 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#ifndef SkVM_opts_DEFINED
#define SkVM_opts_DEFINED
#include "src/core/SkVM.h"
#include "include/private/SkVx.h"
namespace SK_OPTS_NS {
inline void eval(const skvm::Program::Instruction insts[], const int ninsts,
const int nregs, const int loop,
int n, void* args[], size_t strides[], const int nargs) {
using namespace skvm;
// We'll operate in SIMT style, knocking off K-size chunks from n while possible.
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
constexpr int K = 32;
#else
constexpr int K = 16;
#endif
using I32 = skvx::Vec<K, int>;
using F32 = skvx::Vec<K, float>;
using U32 = skvx::Vec<K, uint32_t>;
using U8 = skvx::Vec<K, uint8_t>;
using I16x2 = skvx::Vec<2*K, int16_t>;
using U16x2 = skvx::Vec<2*K, uint16_t>;
union Slot {
I32 i32;
U32 u32;
F32 f32;
};
Slot few_regs[16];
std::unique_ptr<char[]> many_regs;
Slot* regs = few_regs;
if (nregs > (int)SK_ARRAY_COUNT(few_regs)) {
// Annoyingly we can't trust that malloc() or new will work with Slot because
// the skvx::Vec types may have alignment greater than what they provide.
// We'll overallocate one extra register so we can align manually.
many_regs.reset(new char[ sizeof(Slot) * (nregs + 1) ]);
uintptr_t addr = (uintptr_t)many_regs.get();
addr += alignof(Slot) -
(addr & (alignof(Slot) - 1));
SkASSERT((addr & (alignof(Slot) - 1)) == 0);
regs = (Slot*)addr;
}
auto r = [&](Reg id) -> Slot& {
SkASSERT(0 <= id && id < nregs);
return regs[id];
};
auto arg = [&](int ix) {
SkASSERT(0 <= ix && ix < nargs);
return args[ix];
};
// Step each argument pointer ahead by its stride a number of times.
auto step_args = [&](int times) {
// Looping by marching pointers until *arg == nullptr helps the
// compiler to keep this loop scalar. Otherwise it'd create a
// rather large and useless autovectorized version.
void** arg = args;
const size_t* stride = strides;
for (; *arg; arg++, stride++) {
*arg = (void*)( (char*)*arg + times * *stride );
}
SkASSERT(arg == args + nargs);
};
int start = 0,
stride;
for ( ; n > 0; start = loop, n -= stride, step_args(stride)) {
stride = n >= K ? K : 1;
for (int i = start; i < ninsts; i++) {
skvm::Program::Instruction inst = insts[i];
// d = op(x,y,z/imm)
Reg d = inst.d,
x = inst.x,
y = inst.y,
z = inst.z;
int imm = inst.imm;
// Ops that interact with memory need to know whether we're stride=1 or stride=K,
// but all non-memory ops can run the same code no matter the stride.
switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
#define STRIDE_1(op) case 2*(int)op
#define STRIDE_K(op) case 2*(int)op + 1
STRIDE_1(Op::store8 ): memcpy(arg(imm), &r(x).i32, 1); break;
STRIDE_1(Op::store32): memcpy(arg(imm), &r(x).i32, 4); break;
STRIDE_K(Op::store8 ): skvx::cast<uint8_t>(r(x).i32).store(arg(imm)); break;
STRIDE_K(Op::store32): (r(x).i32).store(arg(imm)); break;
STRIDE_1(Op::load8 ): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 1); break;
STRIDE_1(Op::load32): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 4); break;
STRIDE_K(Op::load8 ): r(d).i32 = skvx::cast<int>(U8 ::Load(arg(imm))); break;
STRIDE_K(Op::load32): r(d).i32 = I32::Load(arg(imm)) ; break;
#undef STRIDE_1
#undef STRIDE_K
// Ops that don't interact with memory should never care about the stride.
#define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1
CASE(Op::splat): r(d).i32 = imm; break;
CASE(Op::add_f32): r(d).f32 = r(x).f32 + r(y).f32; break;
CASE(Op::sub_f32): r(d).f32 = r(x).f32 - r(y).f32; break;
CASE(Op::mul_f32): r(d).f32 = r(x).f32 * r(y).f32; break;
CASE(Op::div_f32): r(d).f32 = r(x).f32 / r(y).f32; break;
CASE(Op::mad_f32): r(d).f32 = r(x).f32 * r(y).f32 + r(z).f32; break;
CASE(Op::add_i32): r(d).i32 = r(x).i32 + r(y).i32; break;
CASE(Op::sub_i32): r(d).i32 = r(x).i32 - r(y).i32; break;
CASE(Op::mul_i32): r(d).i32 = r(x).i32 * r(y).i32; break;
CASE(Op::sub_i16x2):
r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<I16x2>(r(x).i32) -
skvx::bit_pun<I16x2>(r(y).i32) ); break;
CASE(Op::mul_i16x2):
r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<I16x2>(r(x).i32) *
skvx::bit_pun<I16x2>(r(y).i32) ); break;
CASE(Op::shr_i16x2):
r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<U16x2>(r(x).i32) >> imm);
break;
CASE(Op::bit_and): r(d).i32 = r(x).i32 & r(y).i32; break;
CASE(Op::bit_or ): r(d).i32 = r(x).i32 | r(y).i32; break;
CASE(Op::bit_xor): r(d).i32 = r(x).i32 ^ r(y).i32; break;
CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break;
CASE(Op::shl): r(d).i32 = r(x).i32 << imm; break;
CASE(Op::sra): r(d).i32 = r(x).i32 >> imm; break;
CASE(Op::shr): r(d).u32 = r(x).u32 >> imm; break;
CASE(Op::extract): r(d).u32 = (r(x).u32 >> imm) & r(y).u32; break;
CASE(Op::pack): r(d).u32 = r(x).u32 | (r(y).u32 << imm); break;
CASE(Op::bytes): {
const U32 table[] = {
0,
(r(x).u32 ) & 0xff,
(r(x).u32 >> 8) & 0xff,
(r(x).u32 >> 16) & 0xff,
(r(x).u32 >> 24) & 0xff,
};
r(d).u32 = table[(imm >> 0) & 0xf] << 0
| table[(imm >> 4) & 0xf] << 8
| table[(imm >> 8) & 0xf] << 16
| table[(imm >> 12) & 0xf] << 24;
} break;
CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
CASE(Op::to_i32): r(d).i32 = skvx::cast<int> (r(x).f32); break;
#undef CASE
}
}
}
}
} // namespace SK_OPTS_NS
#endif//SkVM_opts_DEFINED
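
The deleted header is the same interpreter that now lives in SkVM.cpp; the visible difference is the lane count, K=32 under AVX2 versus a fixed K=16 after the move, which doubles or halves each Slot and the few_regs scratch array accordingly. A quick, illustrative check (not Skia code; sizes are what you'd typically expect, not guaranteed):

    #include <cstdio>
    #include "include/private/SkVx.h"

    union Slot16 { skvx::Vec<16,int> i32; skvx::Vec<16,float> f32; };
    union Slot32 { skvx::Vec<32,int> i32; skvx::Vec<32,float> f32; };

    int main() {
        // Typically 64 and 128 bytes, so few_regs[16] is on the order of 1 KiB vs 2 KiB of stack.
        printf("K=16 slot: %zu bytes, K=32 slot: %zu bytes\n", sizeof(Slot16), sizeof(Slot32));
    }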