Reland "Reland "gather8/16 JIT support""
This is a reland of1283d55f35
... this time, also checking for HSW feature set. Original change's description: > Reland "gather8/16 JIT support" > > This is a reland of54659e51bc
> > ... now expecting not to JIT when under ASAN/MSAN. > > Original change's description: > > gather8/16 JIT support > > > > The basic strategy is one at a time, inserting 8- or 16-bit values > > into an Xmm register, then expanding to 32-bit in a Ymm at the end > > using vpmovzx{b,w}d instructions. > > > > Somewhat annoyingly we can only pull indices from an Xmm register, > > so we grab the first four then shift down the top before the rest. > > > > Added a unit test to get coverage where the indices are reused and > > not consumed directly by the gather instruction. It's an important > > case, needing to find another register for accum that can't just be > > dst(), but there's no natural coverage of that anywhere. > > > > Change-Id: I8189ead2364060f10537a2f9364d63338a7e596f > > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284311 > > Reviewed-by: Herb Derby <herb@google.com> > > Commit-Queue: Mike Klein <mtklein@google.com> > > Change-Id: I67f441615b312b47e7a3182e85e0f787286d7717 > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284472 > Reviewed-by: Herb Derby <herb@google.com> > Commit-Queue: Mike Klein <mtklein@google.com> Change-Id: Id0e53ab67f7a70fe42dccca1d9912b07ec11b54d Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284504 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
This commit is contained in:
parent
1174cf82f3
commit
21e85eb549
@ -36,18 +36,6 @@
|
||||
|
||||
bool gSkVMJITViaDylib{false};
|
||||
|
||||
// JIT code isn't MSAN-instrumented, so we won't see when it uses
|
||||
// uninitialized memory, and we'll not see the writes it makes as properly
|
||||
// initializing memory. Instead force the interpreter, which should let
|
||||
// MSAN see everything our programs do properly.
|
||||
//
|
||||
// Similarly, we can't get ASAN's checks unless we let it instrument our interpreter.
|
||||
#if defined(__has_feature)
|
||||
#if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer)
|
||||
#undef SKVM_JIT
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(SKVM_JIT)
|
||||
#include <dlfcn.h> // dlopen, dlsym
|
||||
#include <sys/mman.h> // mmap, mprotect
|
||||
@ -1999,6 +1987,14 @@ namespace skvm {
|
||||
this->byte(imm);
|
||||
}
|
||||
|
||||
void Assembler::vextracti128(Operand dst, Ymm src, int imm) {
|
||||
this->op(0x66,0x3a0f,0x39, src,dst);
|
||||
this->byte(imm);
|
||||
}
|
||||
void Assembler::vpextrd(Operand dst, Xmm src, int imm) {
|
||||
this->op(0x66,0x3a0f,0x16, src,dst);
|
||||
this->byte(imm);
|
||||
}
|
||||
void Assembler::vpextrw(Operand dst, Xmm src, int imm) {
|
||||
this->op(0x66,0x3a0f,0x15, src,dst);
|
||||
this->byte(imm);
|
||||
@ -3010,12 +3006,6 @@ namespace skvm {
|
||||
//
|
||||
// Now let's actually assemble the instruction!
|
||||
switch (op) {
|
||||
default:
|
||||
if (debug_dump()) {
|
||||
SkDEBUGFAILF("\nOp::%s (%d) not yet implemented\n", name(op), op);
|
||||
}
|
||||
return false; // TODO: many new ops
|
||||
|
||||
#if defined(__x86_64__)
|
||||
case Op::assert_true: {
|
||||
a->vptest (r[x], &constants[0xffffffff].label);
|
||||
@ -3060,10 +3050,76 @@ namespace skvm {
|
||||
else { a->vmovups( dst(), A::Mem{arg[immy]}); }
|
||||
break;
|
||||
|
||||
case Op::gather8: {
|
||||
A::GP64 base = scratch,
|
||||
index = scratch2;
|
||||
|
||||
// As usual, the gather base pointer is immz bytes off of uniform immy.
|
||||
a->mov(base, A::Mem{arg[immy], immz});
|
||||
|
||||
// We'll need two distinct temporary vector registers:
|
||||
// - tmp() to hold our indices;
|
||||
// - accum to hold our partial gathered result.
|
||||
a->vmovdqa(tmp(), r[x]);
|
||||
|
||||
// accum can be any register, even dst(), as long as it's not the same as tmp().
|
||||
A::Xmm accum;
|
||||
if (dst() != tmp()) {
|
||||
accum = (A::Xmm)dst();
|
||||
} else if (int found = __builtin_ffs(avail & ~(1<<tmp()))) {
|
||||
accum = (A::Xmm)(found-1);
|
||||
} else {
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
SkASSERT((A::Xmm)tmp() != accum);
|
||||
|
||||
for (int i = 0; i < (scalar ? 1 : 8); i++) {
|
||||
if (i == 4) {
|
||||
// vpextrd can only pluck indices out from an Xmm register,
|
||||
// so we manually swap over to the top when we're halfway through.
|
||||
a->vextracti128((A::Xmm)tmp(), tmp(), 1);
|
||||
}
|
||||
a->vpextrd(index, (A::Xmm)tmp(), i%4);
|
||||
a->vpinsrb(accum, accum, A::Mem{base,0,index,A::ONE}, i);
|
||||
}
|
||||
a->vpmovzxbd(dst(), accum);
|
||||
} break;
|
||||
|
||||
case Op::gather16: {
|
||||
// Just as gather8 except vpinsrb->vpinsrw, ONE->TWO, and vpmovzxbd->vpmovzxwd.
|
||||
A::GP64 base = scratch,
|
||||
index = scratch2;
|
||||
|
||||
a->mov(base, A::Mem{arg[immy], immz});
|
||||
|
||||
a->vmovdqa(tmp(), r[x]);
|
||||
|
||||
A::Xmm accum;
|
||||
if (dst() != tmp()) {
|
||||
accum = (A::Xmm)dst();
|
||||
} else if (int found = __builtin_ffs(avail & ~(1<<tmp()))) {
|
||||
accum = (A::Xmm)(found-1);
|
||||
} else {
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
SkASSERT((A::Xmm)tmp() != accum);
|
||||
|
||||
for (int i = 0; i < (scalar ? 1 : 8); i++) {
|
||||
if (i == 4) {
|
||||
a->vextracti128((A::Xmm)tmp(), tmp(), 1);
|
||||
}
|
||||
a->vpextrd(index, (A::Xmm)tmp(), i%4);
|
||||
a->vpinsrw(accum, accum, A::Mem{base,0,index,A::TWO}, i);
|
||||
}
|
||||
a->vpmovzxwd(dst(), accum);
|
||||
} break;
|
||||
|
||||
case Op::gather32:
|
||||
if (scalar) {
|
||||
auto base = scratch,
|
||||
index = scratch2;
|
||||
A::GP64 base = scratch,
|
||||
index = scratch2;
|
||||
// Our gather base pointer is immz bytes off of uniform immy.
|
||||
a->mov(base, A::Mem{arg[immy], immz});
|
||||
|
||||
@ -3100,7 +3156,7 @@ namespace skvm {
|
||||
}
|
||||
|
||||
// Our gather base pointer is immz bytes off of uniform immy.
|
||||
auto base = scratch;
|
||||
A::GP64 base = scratch;
|
||||
a->mov(base, A::Mem{arg[immy], immz});
|
||||
a->vpcmpeqd(mask, mask, mask); // (All lanes enabled.)
|
||||
a->vgatherdps(dst(), A::FOUR, index, base, mask);
|
||||
@ -3207,6 +3263,12 @@ namespace skvm {
|
||||
case Op::round : a->vcvtps2dq (dst(), r[x]); break;
|
||||
|
||||
#elif defined(__aarch64__)
|
||||
default:
|
||||
if (debug_dump()) {
|
||||
SkDEBUGFAILF("\nOp::%s (%d) not yet implemented\n", name(op), op);
|
||||
}
|
||||
return false; // TODO: many new ops
|
||||
|
||||
case Op::assert_true: {
|
||||
a->uminv4s(tmp(), r[x]); // uminv acts like an all() across the vector.
|
||||
a->fmovs(scratch, tmp());
|
||||
|
@ -23,6 +23,18 @@ class SkWStream;
|
||||
#define SKVM_LLVM
|
||||
#endif
|
||||
|
||||
// JIT code isn't MSAN-instrumented, so we won't see when it uses
|
||||
// uninitialized memory, and we'll not see the writes it makes as properly
|
||||
// initializing memory. Instead force the interpreter, which should let
|
||||
// MSAN see everything our programs do properly.
|
||||
//
|
||||
// Similarly, we can't get ASAN's checks unless we let it instrument our interpreter.
|
||||
#if defined(__has_feature)
|
||||
#if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer)
|
||||
#undef SKVM_JIT
|
||||
#endif
|
||||
#endif
|
||||
|
||||
namespace skvm {
|
||||
|
||||
bool fma_supported();
|
||||
@ -187,8 +199,10 @@ namespace skvm {
|
||||
void vpinsrw(Xmm dst, Xmm src, Operand y, int imm); // dst = src; dst[imm] = y, 16-bit
|
||||
void vpinsrb(Xmm dst, Xmm src, Operand y, int imm); // dst = src; dst[imm] = y, 8-bit
|
||||
|
||||
void vpextrw(Operand dst, Xmm src, int imm); // dst = src[imm] , 16-bit
|
||||
void vpextrb(Operand dst, Xmm src, int imm); // dst = src[imm] , 8-bit
|
||||
void vextracti128(Operand dst, Ymm src, int imm); // dst = src[imm], 128-bit
|
||||
void vpextrd (Operand dst, Xmm src, int imm); // dst = src[imm], 32-bit
|
||||
void vpextrw (Operand dst, Xmm src, int imm); // dst = src[imm], 16-bit
|
||||
void vpextrb (Operand dst, Xmm src, int imm); // dst = src[imm], 8-bit
|
||||
|
||||
// if (mask & 0x8000'0000) {
|
||||
// dst = base[scale*ix];
|
||||
|
@ -36,10 +36,10 @@ template <typename Fn>
|
||||
static void test_jit_and_interpreter(skvm::Program&& program, Fn&& test) {
|
||||
#if defined(SKVM_LLVM)
|
||||
SkASSERT(program.hasJIT());
|
||||
#elif defined(SKVM_JIT) && defined(SK_CPU_X86) // soon!
|
||||
// SkASSERT(program.hasJIT());
|
||||
#elif defined(SKVM_JIT) && defined(SK_CPU_X86)
|
||||
SkASSERT(program.hasJIT() == SkCpu::Supports(SkCpu::HSW));
|
||||
#elif defined(SKVM_JIT) // eventually!
|
||||
// SkASSERT(program.hasJIT());
|
||||
//SkASSERT(program.hasJIT());
|
||||
#else
|
||||
SkASSERT(!program.hasJIT());
|
||||
#endif
|
||||
@ -439,6 +439,55 @@ DEF_TEST(SkVM_gathers, r) {
|
||||
});
|
||||
}
|
||||
|
||||
DEF_TEST(SkVM_gathers2, r) {
|
||||
skvm::Builder b;
|
||||
{
|
||||
skvm::Arg uniforms = b.uniform(),
|
||||
buf32 = b.varying<int>(),
|
||||
buf16 = b.varying<uint16_t>(),
|
||||
buf8 = b.varying<uint8_t>();
|
||||
|
||||
skvm::I32 x = b.load32(buf32);
|
||||
|
||||
b.store32(buf32, b.gather32(uniforms,0, x));
|
||||
b.store16(buf16, b.gather16(uniforms,0, x));
|
||||
b.store8 (buf8 , b.gather8 (uniforms,0, x));
|
||||
}
|
||||
|
||||
test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
|
||||
uint8_t img[256];
|
||||
for (int i = 0; i < 256; i++) {
|
||||
img[i] = i;
|
||||
}
|
||||
|
||||
int buf32[64];
|
||||
uint16_t buf16[64];
|
||||
uint8_t buf8 [64];
|
||||
|
||||
for (int i = 0; i < 64; i++) {
|
||||
buf32[i] = (i*47)&63;
|
||||
buf16[i] = 0;
|
||||
buf8 [i] = 0;
|
||||
}
|
||||
|
||||
struct Uniforms {
|
||||
const uint8_t* img;
|
||||
} uniforms{img};
|
||||
|
||||
program.eval(64, &uniforms, buf32, buf16, buf8);
|
||||
|
||||
for (int i = 0; i < 64; i++) {
|
||||
REPORTER_ASSERT(r, buf8[i] == ((i*47)&63)); // 0,47,30,13,60,...
|
||||
}
|
||||
|
||||
REPORTER_ASSERT(r, buf16[ 0] == 0x0100);
|
||||
REPORTER_ASSERT(r, buf16[63] == 0x2322);
|
||||
|
||||
REPORTER_ASSERT(r, buf32[ 0] == 0x03020100);
|
||||
REPORTER_ASSERT(r, buf32[63] == 0x47464544);
|
||||
});
|
||||
}
|
||||
|
||||
DEF_TEST(SkVM_bitops, r) {
|
||||
skvm::Builder b;
|
||||
{
|
||||
@ -1418,6 +1467,12 @@ DEF_TEST(SkVM_Assembler, r) {
|
||||
a.vpinsrb(A::xmm1, A::xmm8, A::Mem{A::rsi}, 4); // vpinsrb $4, (%rsi), %xmm8, %xmm1
|
||||
a.vpinsrb(A::xmm8, A::xmm1, A::Mem{A::r8 }, 12); // vpinsrb $4, (%rsi), %xmm8, %xmm1
|
||||
|
||||
a.vextracti128(A::xmm1, A::ymm8, 1); // vextracti128 $1, %ymm8, %xmm1
|
||||
a.vextracti128(A::xmm8, A::ymm1, 0); // vextracti128 $0, %ymm1, %xmm8
|
||||
|
||||
a.vpextrd(A::Mem{A::rsi}, A::xmm8, 3); // vpextrd $3, %xmm8, (%rsi)
|
||||
a.vpextrd(A::Mem{A::r8 }, A::xmm1, 2); // vpextrd $2, %xmm1, (%r8)
|
||||
|
||||
a.vpextrw(A::Mem{A::rsi}, A::xmm8, 7);
|
||||
a.vpextrw(A::Mem{A::r8 }, A::xmm1, 15);
|
||||
|
||||
@ -1430,6 +1485,12 @@ DEF_TEST(SkVM_Assembler, r) {
|
||||
0xc4,0xe3,0x39, 0x20, 0x0e, 4,
|
||||
0xc4,0x43,0x71, 0x20, 0x00, 12,
|
||||
|
||||
0xc4,0x63,0x7d,0x39,0xc1, 1,
|
||||
0xc4,0xc3,0x7d,0x39,0xc8, 0,
|
||||
|
||||
0xc4,0x63,0x79,0x16,0x06, 3,
|
||||
0xc4,0xc3,0x79,0x16,0x08, 2,
|
||||
|
||||
0xc4,0x63,0x79, 0x15, 0x06, 7,
|
||||
0xc4,0xc3,0x79, 0x15, 0x08, 15,
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user