improve scalar gather32

This loads 32 bits instead of gathering 256 bits in the scalar tail of loops.
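
Roughly the idea, in plain C++ (an illustrative sketch only -- these
helper names are made up, not skvm's API):

    #include <cstdint>

    // Loop body: 8 lanes per iteration, one 32-bit value gathered per lane.
    void body(uint32_t dst[8], const uint32_t* base, const int32_t ix[8]) {
        for (int lane = 0; lane < 8; lane++) {
            dst[lane] = base[ix[lane]];   // what vgatherdps does, 256 bits total
        }
    }

    // Scalar tail: only one live lane, so a single 32-bit load is enough.
    void tail(uint32_t dst[1], const uint32_t* base, const int32_t ix[1]) {
        dst[0] = base[ix[0]];             // vmovd dst, [base + 4*ix]
    }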

To make it work, add a vmovd with SIB addressing.

I also remembered that the mysterious 0b100 is actually a signal that
the instruction uses SIB addressing: an rm field of 0b100 in the ModRM
byte means a SIB byte follows, and since that's also rsp's register
encoding, it's usually written as `rsp`.
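
Worked example of the byte math (standalone sketch; mod_rm() and sib()
here just mirror the shape of the assembler's helpers):

    #include <cstdint>
    #include <cstdio>

    uint8_t mod_rm(int mod, int reg, int rm)    { return uint8_t(mod   << 6 | reg << 3 | rm  ); }
    uint8_t sib   (int scale, int ix, int base) { return uint8_t(scale << 6 | ix  << 3 | base); }

    int main() {
        // vmovd xmm0, [rax + 4*rcx]: rm = 0b100 (rsp's encoding) tells
        // the CPU "a SIB byte follows" rather than naming a register.
        printf("%02x %02x\n",
               mod_rm(/*Mod::Indirect*/0b00, /*xmm0*/0, /*rsp*/0b100),   // 0x04
               sib   (/*FOUR*/0b10, /*rcx*/1, /*rax*/0));                // 0x88
        return 0;   // prints "04 88", the tail of the first new test vector
    }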

(SIB addressing may be something we'd want to generalize over like we
did recently with YmmOrLabel, but I'll leave that for Future Me.)
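
(If Future Me does, maybe something like this -- entirely hypothetical,
nothing below exists in skvm today:)

    // Stand-ins for skvm's real Scale/GP64/Xmm, just to keep this sketch
    // self-contained:
    enum Scale { ONE, TWO, FOUR, EIGHT };
    enum GP64  { rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi };
    enum Xmm   { xmm0 };

    // One operand type bundling (scale, index, base), YmmOrLabel-style:
    struct SibPtr { Scale scale; GP64 index, base; };

    void vmovd(Xmm dst, SibPtr ptr);   // dst = *(ptr.base + ptr.scale * ptr.index)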

Slight rewording where "scratch" is mentioned to keep it focused on
scratch GP registers, not "tmp" ymm registers.  Not a hugely important
distinction, but it helps when I'm grepping through code.

Change-Id: I39a6ab1a76ea0c103ae7d3ebc97a1b7d4b530e73
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/264376
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
Author: Mike Klein, 2020-01-14 10:46:44 -06:00 (committed by Skia Commit-Bot)
parent b2b6a99dca
commit 93d3fabcc3
3 changed files with 46 additions and 13 deletions


@@ -1306,6 +1306,18 @@ namespace skvm {
         this->byte(mod_rm(Mod::Indirect, dst&7, src&7));
     }
+    void Assembler::vmovd(Xmm dst, Scale scale, GP64 index, GP64 base) {
+        int prefix = 0x66,
+            map    = 0x0f,
+            opcode = 0x6e;
+
+        VEX v = vex(0, dst>>3, index>>3, base>>3,
+                    map, 0, /*ymm?*/0, prefix);
+        this->bytes(v.bytes, v.len);
+        this->byte(opcode);
+        this->byte(mod_rm(Mod::Indirect, dst&7, rsp));
+        this->byte(sib(scale, index&7, base&7));
+    }
     void Assembler::vmovd_direct(Xmm dst, GP64 src) {
         int prefix = 0x66,
             map    = 0x0f,
@@ -1398,7 +1410,7 @@ namespace skvm {
                     map, mask, /*ymm?*/1, prefix);
         this->bytes(v.bytes, v.len);
         this->byte(opcode);
-        this->byte(mod_rm(Mod::Indirect, dst&7, 0b100/*TODO: what do these 0b100 bits mean?*/));
+        this->byte(mod_rm(Mod::Indirect, dst&7, rsp));
         this->byte(sib(scale, ix&7, base&7));
     }
@@ -2091,6 +2103,7 @@ namespace skvm {
         }
         A::GP64 N        = A::rdi,
                 scratch  = A::rax,
+                scratch2 = A::r11,
                 arg[]    = { A::rsi, A::rdx, A::rcx, A::r8, A::r9 };

         // All 16 ymm registers are available to use.
@@ -2171,7 +2184,7 @@ namespace skvm {
             // We track each instruction's dst in r[] so we can thread it through as an input
             // to any future instructions needing that value.
             //
-            // And some ops may need a temporary scratch register, tmp. Some need both tmp and dst.
+            // And some ops may need a temporary register, tmp. Some need both tmp and dst.
             //
             // tmp and dst are very similar and can and will often be assigned the same register,
             // but tmp may never alias any of the instructions's inputs, while dst may when this
@@ -2190,7 +2203,7 @@ namespace skvm {
                 if (!tmp_is_set) {
                     tmp_is_set = true;
                     if (int found = __builtin_ffs(avail)) {
-                        // This is a scratch register just for this op,
+                        // This is a temporary register just for this op,
                         // so we leave it marked available for future ops.
                         tmp_reg = (Reg)(found - 1);
                     } else {
@@ -2307,7 +2320,19 @@ namespace skvm {
                 else { a->vmovups( dst(), arg[immy]); }
                 break;

-            case Op::gather32: {
+            case Op::gather32:
+            if (scalar) {
+                auto base  = scratch,
+                     index = scratch2;
+                // Our gather base pointer is immz bytes off of uniform immy.
+                a->movq(base, arg[immy], immz);
+
+                // Grab our index from lane 0 of the index argument.
+                a->vmovd_direct(index, (A::Xmm)r[x]);
+
+                // dst = *(base + 4*index)
+                a->vmovd((A::Xmm)dst(), A::FOUR, index, base);
+            } else {
                 // We may not let any of dst(), index, or mask use the same register,
                 // so we must allocate registers manually and very carefully.
@@ -2335,12 +2360,10 @@ namespace skvm {
                 }

                 // Our gather base pointer is immz bytes off of uniform immy.
-                a->movq(scratch, arg[immy], immz);
+                auto base = scratch;
+                a->movq(base, arg[immy], immz);

                 a->vpcmpeqd(mask, mask, mask); // (All lanes enabled.)
-                a->vgatherdps(dst(), A::FOUR, index, scratch, mask);
-                // TODO: simpler impl. when scalar == true?
-                // TODO: at least disable the other mask lanes?
+                a->vgatherdps(dst(), A::FOUR, index, base, mask);
             }
             break;


@@ -139,6 +139,9 @@ namespace skvm {
         void vpmovzxbd(Ymm dst, GP64 ptr);   // dst = *ptr, 64-bit, each uint8_t expanded to int
         void vmovd    (Xmm dst, GP64 ptr);   // dst = *ptr, 32-bit
+        enum Scale { ONE, TWO, FOUR, EIGHT };
+        void vmovd(Xmm dst, Scale, GP64 index, GP64 base);   // dst = *(base + scale*index), 32-bit
+
         void vmovups(GP64 ptr, Ymm src);     // *ptr = src, 256-bit
         void vmovups(GP64 ptr, Xmm src);     // *ptr = src, 128-bit
         void vmovq  (GP64 ptr, Xmm src);     // *ptr = src, 64-bit
@@ -160,7 +163,6 @@ namespace skvm {
        //     dst = base[scale*ix];
        // }
        // mask = 0;
-        enum Scale { ONE, TWO, FOUR, EIGHT };
        void vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask);

        // aarch64


@@ -1084,6 +1084,10 @@ DEF_TEST(SkVM_Assembler, r) {
        a.vmovd(A::xmm8, A::rax);
        a.vmovd(A::xmm0, A::r8);

+       a.vmovd(A::xmm0 , A::FOUR, A::rcx, A::rax);
+       a.vmovd(A::xmm15, A::TWO, A::r8, A::rax);
+       a.vmovd(A::xmm0 , A::ONE, A::rcx, A::r8);
+
        a.vmovd_direct(A::rax, A::xmm0);
        a.vmovd_direct(A::rax, A::xmm8);
        a.vmovd_direct(A::r8, A::xmm0);
@@ -1110,6 +1114,10 @@ DEF_TEST(SkVM_Assembler, r) {
            0xc5,0x79,0x6e,0x00,
            0xc4,0xc1,0x79,0x6e,0x00,

+           0xc5,0xf9,0x6e,0x04,0x88,
+           0xc4,0x21,0x79,0x6e,0x3c,0x40,
+           0xc4,0xc1,0x79,0x6e,0x04,0x08,
+
            0xc5,0xf9,0x7e,0xc0,
            0xc5,0x79,0x7e,0xc0,
            0xc4,0xc1,0x79,0x7e,0xc0,