impl gather32 for x86

Some TODOs are left over to make the scalar
tail case better... as-is, it issues a full
256-bit gather for each scalar 32-bit load!

I added a trimmed down variant of the existing
SkVM_gathers unit test to test just gather32,
covering this new JIT code.

Change-Id: Iabd2e6a61f0213b6d02d222b9f7aec2be000b70b
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/264217
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
This commit is contained in:
Mike Klein 2020-01-13 16:34:30 -06:00 committed by Skia Commit-Bot
parent 118df7cf62
commit b2b6a99dca
2 changed files with 90 additions and 0 deletions

View File

@ -2307,6 +2307,43 @@ namespace skvm {
else { a->vmovups( dst(), arg[immy]); }
break;
case Op::gather32: {
// vgatherdps requires dst, index, and mask to be three distinct
// ymm registers, so we cannot rely on the usual allocator here;
// we must allocate registers manually and very carefully.
// index is argument x and has already been maybe_recycle_register()'d,
// so we explicitly ignore its availability during this op by masking
// its bit out of the available set.
A::Ymm index = r[x];
uint32_t avail_during_gather = avail & ~(1<<index);
// Choose dst() to not overlap with index.
// __builtin_ffs returns 0 when no bit is set (no free register),
// else 1 + the index of the lowest set bit.
if (int found = __builtin_ffs(avail_during_gather)) {
set_dst((A::Ymm)(found-1));
avail_during_gather ^= (1<<dst());
} else {
ok = false;
break;
}
// Choose (temporary) mask to not overlap with dst() or index.
// No need to remove mask from avail_during_gather afterwards:
// it's the last register we pick, and it's only live within this op.
A::Ymm mask;
if (int found = __builtin_ffs(avail_during_gather)) {
mask = (A::Ymm)(found-1);
} else {
ok = false;
break;
}
// Our gather base pointer is immz bytes off of uniform immy.
a->movq(scratch, arg[immy], immz);
// All-ones mask enables every lane; the gather instruction itself
// zeroes the mask as it completes each lane, so mask needs no cleanup.
a->vpcmpeqd(mask, mask, mask); // (All lanes enabled.)
a->vgatherdps(dst(), A::FOUR, index, scratch, mask);
// TODO: simpler impl. when scalar == true?
// TODO: at least disable the other mask lanes?
}
break;
case Op::uniform8: a->movzbl(scratch, arg[immy], immz);
a->vmovd_direct((A::Xmm)dst(), scratch);
a->vbroadcastss(dst(), (A::Xmm)dst());

View File

@ -310,6 +310,59 @@ DEF_TEST(SkVM_LoopCounts, r) {
});
}
// Exercise the gather32 op (and its new x86 JIT lowering): each lane i
// loads img[i & 7] through a base pointer passed in as a uniform.
DEF_TEST(SkVM_gather32, r) {
    skvm::Builder b;
    {
        skvm::Arg uniforms = b.uniform(),
                  buf      = b.varying<int>();

        // buf[i] starts out as i; we overwrite it with img[i & 7].
        skvm::I32 x = b.load32(buf);
        b.store32(buf, b.gather32(uniforms,0, b.bit_and(x, b.splat(7))));
    }

#if defined(SK_CPU_X86)
    test_jit_and_interpreter
#else
    test_interpreter_only
#endif
    (r, b.done(), [&](const skvm::Program& program) {
        const int img[] = {12,34,56,78, 90,98,76,54};

        // 20 lanes: enough to cover both vector and scalar tail paths.
        int buf[20];
        for (int i = 0; i < 20; i++) {
            buf[i] = i;
        }

        struct Uniforms {
            const int* img;
        } uniforms{img};

        program.eval(20, &uniforms, buf);

        // Every lane should have fetched img[i & 7].
        for (int i = 0; i < 20; i++) {
            REPORTER_ASSERT(r, buf[i] == img[i & 7]);
        }
    });
}
DEF_TEST(SkVM_gathers, r) {
skvm::Builder b;
{