gather8/16 JIT support

The basic strategy is to gather one value at a time, inserting 8- or
16-bit values into an Xmm register, then expanding them to 32-bit
lanes in a Ymm at the end using vpmovzx{b,w}d instructions.
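
For reference, here's the per-lane effect in plain C++: a minimal
sketch of what a JIT'd gather8 computes (the function name and
signature are illustrative, not part of skvm; gather16 is the same
with uint16_t loads):

    #include <cstdint>

    // Load one byte per lane at its index, then zero-extend it to a
    // full 32-bit lane -- the zero-extension is what vpmovzxbd does
    // in bulk at the end.
    static void gather8_reference(uint32_t dst[8], const uint8_t* base,
                                  const int32_t ix[8]) {
        for (int i = 0; i < 8; i++) {
            dst[i] = (uint32_t)base[ix[i]];
        }
    }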

Somewhat annoyingly, vpextrd can only pull indices from an Xmm
register, so we grab the first four, then shift the top half of the
Ymm down before extracting the rest.
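
The same dance written with AVX2 intrinsics, for illustration only
(a sketch assuming a Ymm holding eight 32-bit indices; this is not
the code the JIT emits):

    #include <immintrin.h>

    // vpextrd can only read an Xmm, so the top four indices must be
    // moved down into their own Xmm first (that's vextracti128).
    static void extract_indices(__m256i ix, int out[8]) {
        __m128i lo = _mm256_castsi256_si128(ix);
        __m128i hi = _mm256_extracti128_si256(ix, 1);
        out[0] = _mm_extract_epi32(lo, 0);
        out[1] = _mm_extract_epi32(lo, 1);
        out[2] = _mm_extract_epi32(lo, 2);
        out[3] = _mm_extract_epi32(lo, 3);
        out[4] = _mm_extract_epi32(hi, 0);
        out[5] = _mm_extract_epi32(hi, 1);
        out[6] = _mm_extract_epi32(hi, 2);
        out[7] = _mm_extract_epi32(hi, 3);
    }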

Added a unit test to get coverage where the indices are reused and
not consumed directly by the gather instruction.  It's an important
case: it forces us to find another register for accum that can't
simply be dst(), and nothing else covers that naturally.
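
That fallback scans the avail bitmask with __builtin_ffs; here's a
standalone toy of the search (the register numbers are made up):

    #include <cstdio>

    int main() {
        unsigned avail = 0b10110;  // pretend registers 1, 2, 4 are free
        int tmp = 1;               // tmp() already holds the indices
        // __builtin_ffs is 1-based, so found-1 is the lowest free
        // register once tmp is masked out; 0 would mean nothing free.
        int found = __builtin_ffs(avail & ~(1u << tmp));
        if (found) {
            printf("accum = register %d\n", found - 1);  // register 2
        }
        return 0;
    }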

Change-Id: I8189ead2364060f10537a2f9364d63338a7e596f
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284311
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
Mike Klein 2020-04-17 13:57:13 -05:00, committed by Skia Commit-Bot
parent f95e2f42a8
commit 54659e51bc
3 changed files with 149 additions and 12 deletions

src/core/SkVM.cpp

@@ -1999,6 +1999,14 @@ namespace skvm {
this->byte(imm);
}
void Assembler::vextracti128(Operand dst, Ymm src, int imm) {
this->op(0x66,0x3a0f,0x39, src,dst);
this->byte(imm);
}
void Assembler::vpextrd(Operand dst, Xmm src, int imm) {
this->op(0x66,0x3a0f,0x16, src,dst);
this->byte(imm);
}
void Assembler::vpextrw(Operand dst, Xmm src, int imm) {
this->op(0x66,0x3a0f,0x15, src,dst);
this->byte(imm);
@@ -3010,12 +3018,6 @@ namespace skvm {
//
// Now let's actually assemble the instruction!
switch (op) {
- default:
- if (debug_dump()) {
- SkDEBUGFAILF("\nOp::%s (%d) not yet implemented\n", name(op), op);
- }
- return false; // TODO: many new ops
#if defined(__x86_64__)
case Op::assert_true: {
a->vptest (r[x], &constants[0xffffffff].label);
@@ -3060,10 +3062,76 @@ namespace skvm {
else { a->vmovups( dst(), A::Mem{arg[immy]}); }
break;
case Op::gather8: {
A::GP64 base = scratch,
index = scratch2;
// As usual, the gather base pointer is immz bytes off of uniform immy.
a->mov(base, A::Mem{arg[immy], immz});
// We'll need two distinct temporary vector registers:
// - tmp() to hold our indices;
// - accum to hold our partial gathered result.
a->vmovdqa(tmp(), r[x]);
// accum can be any register, even dst(), as long as it's not the same as tmp().
A::Xmm accum;
if (dst() != tmp()) {
accum = (A::Xmm)dst();
} else if (int found = __builtin_ffs(avail & ~(1<<tmp()))) {
accum = (A::Xmm)(found-1);
} else {
ok = false;
break;
}
SkASSERT((A::Xmm)tmp() != accum);
for (int i = 0; i < (scalar ? 1 : 8); i++) {
if (i == 4) {
// vpextrd can only pluck indices out from an Xmm register,
// so we manually swap over to the top when we're halfway through.
a->vextracti128((A::Xmm)tmp(), tmp(), 1);
}
a->vpextrd(index, (A::Xmm)tmp(), i%4);
a->vpinsrb(accum, accum, A::Mem{base,0,index,A::ONE}, i);
}
a->vpmovzxbd(dst(), accum);
} break;
case Op::gather16: {
// Just like gather8, except vpinsrb->vpinsrw, ONE->TWO, and vpmovzxbd->vpmovzxwd.
A::GP64 base = scratch,
index = scratch2;
a->mov(base, A::Mem{arg[immy], immz});
a->vmovdqa(tmp(), r[x]);
A::Xmm accum;
if (dst() != tmp()) {
accum = (A::Xmm)dst();
} else if (int found = __builtin_ffs(avail & ~(1<<tmp()))) {
accum = (A::Xmm)(found-1);
} else {
ok = false;
break;
}
SkASSERT((A::Xmm)tmp() != accum);
for (int i = 0; i < (scalar ? 1 : 8); i++) {
if (i == 4) {
a->vextracti128((A::Xmm)tmp(), tmp(), 1);
}
a->vpextrd(index, (A::Xmm)tmp(), i%4);
a->vpinsrw(accum, accum, A::Mem{base,0,index,A::TWO}, i);
}
a->vpmovzxwd(dst(), accum);
} break;
case Op::gather32:
if (scalar) {
- auto base = scratch,
- index = scratch2;
+ A::GP64 base = scratch,
+ index = scratch2;
// Our gather base pointer is immz bytes off of uniform immy.
a->mov(base, A::Mem{arg[immy], immz});
@@ -3100,7 +3168,7 @@ namespace skvm {
}
// Our gather base pointer is immz bytes off of uniform immy.
- auto base = scratch;
+ A::GP64 base = scratch;
a->mov(base, A::Mem{arg[immy], immz});
a->vpcmpeqd(mask, mask, mask); // (All lanes enabled.)
a->vgatherdps(dst(), A::FOUR, index, base, mask);
@@ -3207,6 +3275,12 @@ namespace skvm {
case Op::round : a->vcvtps2dq (dst(), r[x]); break;
#elif defined(__aarch64__)
+ default:
+ if (debug_dump()) {
+ SkDEBUGFAILF("\nOp::%s (%d) not yet implemented\n", name(op), op);
+ }
+ return false; // TODO: many new ops
case Op::assert_true: {
a->uminv4s(tmp(), r[x]); // uminv acts like an all() across the vector.
a->fmovs(scratch, tmp());

src/core/SkVM.h

@@ -187,8 +187,10 @@ namespace skvm {
void vpinsrw(Xmm dst, Xmm src, Operand y, int imm); // dst = src; dst[imm] = y, 16-bit
void vpinsrb(Xmm dst, Xmm src, Operand y, int imm); // dst = src; dst[imm] = y, 8-bit
- void vpextrw(Operand dst, Xmm src, int imm); // dst = src[imm] , 16-bit
- void vpextrb(Operand dst, Xmm src, int imm); // dst = src[imm] , 8-bit
+ void vextracti128(Operand dst, Ymm src, int imm); // dst = src[imm], 128-bit
+ void vpextrd (Operand dst, Xmm src, int imm); // dst = src[imm], 32-bit
+ void vpextrw (Operand dst, Xmm src, int imm); // dst = src[imm], 16-bit
+ void vpextrb (Operand dst, Xmm src, int imm); // dst = src[imm], 8-bit
// if (mask & 0x8000'0000) {
// dst = base[scale*ix];

tests/SkVMTest.cpp

@@ -37,7 +37,7 @@ static void test_jit_and_interpreter(skvm::Program&& program, Fn&& test) {
#if defined(SKVM_LLVM)
SkASSERT(program.hasJIT());
#elif defined(SKVM_JIT) && defined(SK_CPU_X86) // soon!
- // SkASSERT(program.hasJIT());
+ SkASSERT(program.hasJIT());
#elif defined(SKVM_JIT) // eventually!
// SkASSERT(program.hasJIT());
#else
@@ -439,6 +439,55 @@ DEF_TEST(SkVM_gathers, r) {
});
}
DEF_TEST(SkVM_gathers2, r) {
skvm::Builder b;
{
skvm::Arg uniforms = b.uniform(),
buf32 = b.varying<int>(),
buf16 = b.varying<uint16_t>(),
buf8 = b.varying<uint8_t>();
skvm::I32 x = b.load32(buf32);
b.store32(buf32, b.gather32(uniforms,0, x));
b.store16(buf16, b.gather16(uniforms,0, x));
b.store8 (buf8 , b.gather8 (uniforms,0, x));
}
test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
uint8_t img[256];
for (int i = 0; i < 256; i++) {
img[i] = i;
}
int buf32[64];
uint16_t buf16[64];
uint8_t buf8 [64];
for (int i = 0; i < 64; i++) {
buf32[i] = (i*47)&63;
buf16[i] = 0;
buf8 [i] = 0;
}
struct Uniforms {
const uint8_t* img;
} uniforms{img};
program.eval(64, &uniforms, buf32, buf16, buf8);
for (int i = 0; i < 64; i++) {
REPORTER_ASSERT(r, buf8[i] == ((i*47)&63)); // 0,47,30,13,60,...
}
REPORTER_ASSERT(r, buf16[ 0] == 0x0100);
REPORTER_ASSERT(r, buf16[63] == 0x2322);
REPORTER_ASSERT(r, buf32[ 0] == 0x03020100);
REPORTER_ASSERT(r, buf32[63] == 0x47464544);
});
}
DEF_TEST(SkVM_bitops, r) {
skvm::Builder b;
{
@@ -1418,6 +1467,12 @@ DEF_TEST(SkVM_Assembler, r) {
a.vpinsrb(A::xmm1, A::xmm8, A::Mem{A::rsi}, 4); // vpinsrb $4, (%rsi), %xmm8, %xmm1
a.vpinsrb(A::xmm8, A::xmm1, A::Mem{A::r8 }, 12); // vpinsrb $12, (%r8), %xmm1, %xmm8
a.vextracti128(A::xmm1, A::ymm8, 1); // vextracti128 $1, %ymm8, %xmm1
a.vextracti128(A::xmm8, A::ymm1, 0); // vextracti128 $0, %ymm1, %xmm8
a.vpextrd(A::Mem{A::rsi}, A::xmm8, 3); // vpextrd $3, %xmm8, (%rsi)
a.vpextrd(A::Mem{A::r8 }, A::xmm1, 2); // vpextrd $2, %xmm1, (%r8)
a.vpextrw(A::Mem{A::rsi}, A::xmm8, 7);
a.vpextrw(A::Mem{A::r8 }, A::xmm1, 15);
@@ -1430,6 +1485,12 @@ DEF_TEST(SkVM_Assembler, r) {
0xc4,0xe3,0x39, 0x20, 0x0e, 4,
0xc4,0x43,0x71, 0x20, 0x00, 12,
0xc4,0x63,0x7d,0x39,0xc1, 1,
0xc4,0xc3,0x7d,0x39,0xc8, 0,
0xc4,0x63,0x79,0x16,0x06, 3,
0xc4,0xc3,0x79,0x16,0x08, 2,
0xc4,0x63,0x79, 0x15, 0x06, 7,
0xc4,0xc3,0x79, 0x15, 0x08, 15,