impl gather32 for x86
Some TODOs remain to improve the scalar tail case: as written, it issues a full 256-bit gather for each 32-bit load! I added a trimmed-down variant of the existing SkVM_gathers unit test that tests just gather32, covering this new JIT code. Change-Id: Iabd2e6a61f0213b6d02d222b9f7aec2be000b70b Reviewed-on: https://skia-review.googlesource.com/c/skia/+/264217 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
This commit is contained in:
parent
118df7cf62
commit
b2b6a99dca
@ -2307,6 +2307,43 @@ namespace skvm {
|
||||
else { a->vmovups( dst(), arg[immy]); }
|
||||
break;
|
||||
|
||||
case Op::gather32: {
|
||||
// We may not let any of dst(), index, or mask use the same register,
|
||||
// so we must allocate registers manually and very carefully.
|
||||
|
||||
// index is argument x and has already been maybe_recycle_register()'d,
|
||||
// so we explicitly ignore its availability during this op.
|
||||
A::Ymm index = r[x];
|
||||
uint32_t avail_during_gather = avail & ~(1<<index);
|
||||
|
||||
// Choose dst() to not overlap with index.
|
||||
if (int found = __builtin_ffs(avail_during_gather)) {
|
||||
set_dst((A::Ymm)(found-1));
|
||||
avail_during_gather ^= (1<<dst());
|
||||
} else {
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
|
||||
// Choose (temporary) mask to not overlap with dst() or index.
|
||||
A::Ymm mask;
|
||||
if (int found = __builtin_ffs(avail_during_gather)) {
|
||||
mask = (A::Ymm)(found-1);
|
||||
} else {
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
|
||||
// Our gather base pointer is immz bytes off of uniform immy.
|
||||
a->movq(scratch, arg[immy], immz);
|
||||
a->vpcmpeqd(mask, mask, mask); // (All lanes enabled.)
|
||||
a->vgatherdps(dst(), A::FOUR, index, scratch, mask);
|
||||
|
||||
// TODO: simpler impl. when scalar == true?
|
||||
// TODO: at least disable the other mask lanes?
|
||||
}
|
||||
break;
|
||||
|
||||
case Op::uniform8: a->movzbl(scratch, arg[immy], immz);
|
||||
a->vmovd_direct((A::Xmm)dst(), scratch);
|
||||
a->vbroadcastss(dst(), (A::Xmm)dst());
|
||||
|
@ -310,6 +310,59 @@ DEF_TEST(SkVM_LoopCounts, r) {
|
||||
});
|
||||
}
|
||||
|
||||
// Exercises gather32 on its own: every 32-bit value x in the varying buffer is
// replaced by the 32-bit entry at index (x & 7) gathered through the uniforms.
DEF_TEST(SkVM_gather32, r) {
    skvm::Builder b;
    {
        skvm::Arg uniforms = b.uniform(),
                  buf      = b.varying<int>();

        skvm::I32 x     = b.load32(buf);
        skvm::I32 index = b.bit_and(x, b.splat(7));
        b.store32(buf, b.gather32(uniforms,0, index));
    }

    // The JIT path is only run on x86; elsewhere just the interpreter is checked.
#if defined(SK_CPU_X86)
    test_jit_and_interpreter
#else
    test_interpreter_only
#endif
    (r, b.done(), [&](const skvm::Program& program) {
        const int img[] = {12,34,56,78, 90,98,76,54};

        constexpr int kCount = 20;
        int buf[kCount];
        for (int i = 0; i < kCount; i++) {
            buf[i] = i;
        }

        struct Uniforms {
            const int* img;
        } uniforms{img};

        program.eval(kCount, &uniforms, buf);

        // buf[i] started out as i, so after the program runs each slot must
        // hold img[i & 7]: two full passes over img, then its first four entries.
        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, buf[i] == img[i & 7]);
        }
    });
}
|
||||
|
||||
DEF_TEST(SkVM_gathers, r) {
|
||||
skvm::Builder b;
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user