improve scalar gather32

This loads 32 bits instead of gathering 256 when we're in the scalar tail of a loop.

To make this work, add a vmovd overload that uses SIB addressing.
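
Roughly, the scalar tail now boils down to this sequence (a sketch, not the
literal output; the actual register choices are up to the JIT's allocator,
and rsi here stands in for the uniform argument register):

    movq   rax, [rsi + immz]      ; base  = gather base pointer, immz bytes off uniform immy
    vmovd  ecx, xmm1              ; index = lane 0 of the index vector
    vmovd  xmm0, [rax + rcx*4]    ; dst   = base[index], a single 32-bit load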

I also remembered that the mysterious 0b100 in the ModRM rm field is actually
a signal that a SIB byte follows; assemblers usually denote that slot as `rsp`,
whose encoding it takes over.
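
To make that concrete, here's the byte math for one of the new test cases
below (a minimal standalone sketch; mod_rm() and sib() mirror the Assembler's
helpers, using the standard x86 field layout):

    #include <cassert>
    #include <cstdint>

    // ModRM packs mod[7:6] reg[5:3] rm[2:0]; SIB packs scale[7:6] index[5:3] base[2:0].
    uint8_t mod_rm(int mod, int reg, int rm)    { return (mod   << 6) | (reg   << 3) | rm;   }
    uint8_t sib(int scale, int index, int base) { return (scale << 6) | (index << 3) | base; }

    int main() {
        // vmovd xmm0, [rax + rcx*4]: rm = 0b100 (the slot that would be rsp)
        // tells the decoder a SIB byte follows.
        assert(mod_rm(/*Indirect*/0b00, /*xmm0*/0, /*rsp*/0b100) == 0x04);
        assert(sib(/*FOUR*/0b10, /*rcx*/1, /*rax*/0)             == 0x88);
        // ...matching the 0x04,0x88 tail of the c5 f9 6e 04 88 test vector below.
    }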

(SIB addressing may be something we'd want to generalize over like we
did recently with YmmOrLabel, but I'll leave that for Future Me.)

Slight rewording where "scratch" is mentioned to keep it focused on
scratch GP registers, not "tmp" ymm registers.  Not a hugely important
distinction but helps when I'm grepping through code.

Change-Id: I39a6ab1a76ea0c103ae7d3ebc97a1b7d4b530e73
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/264376
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
Mike Klein 2020-01-14 10:46:44 -06:00, committed by Skia Commit-Bot
parent b2b6a99dca
commit 93d3fabcc3
3 changed files with 46 additions and 13 deletions

src/core/SkVM.cpp

@@ -1306,6 +1306,18 @@ namespace skvm {
         this->byte(mod_rm(Mod::Indirect, dst&7, src&7));
     }
+    void Assembler::vmovd(Xmm dst, Scale scale, GP64 index, GP64 base) {
+        int prefix = 0x66,
+            map    = 0x0f,
+            opcode = 0x6e;
+
+        VEX v = vex(0, dst>>3, index>>3, base>>3,
+                    map, 0, /*ymm?*/0, prefix);
+        this->bytes(v.bytes, v.len);
+        this->byte(opcode);
+        this->byte(mod_rm(Mod::Indirect, dst&7, rsp));
+        this->byte(sib(scale, index&7, base&7));
+    }
 
     void Assembler::vmovd_direct(Xmm dst, GP64 src) {
         int prefix = 0x66,
             map    = 0x0f,
@@ -1398,7 +1410,7 @@ namespace skvm {
                     map, mask, /*ymm?*/1, prefix);
         this->bytes(v.bytes, v.len);
         this->byte(opcode);
-        this->byte(mod_rm(Mod::Indirect, dst&7, 0b100/*TODO: what do these 0b100 bits mean?*/));
+        this->byte(mod_rm(Mod::Indirect, dst&7, rsp));
         this->byte(sib(scale, ix&7, base&7));
     }
@@ -2089,9 +2101,10 @@ namespace skvm {
         if (!SkCpu::Supports(SkCpu::HSW)) {
             return false;
         }
-        A::GP64 N       = A::rdi,
-                scratch = A::rax,
-                arg[]   = { A::rsi, A::rdx, A::rcx, A::r8, A::r9 };
+        A::GP64 N        = A::rdi,
+                scratch  = A::rax,
+                scratch2 = A::r11,
+                arg[]    = { A::rsi, A::rdx, A::rcx, A::r8, A::r9 };
 
         // All 16 ymm registers are available to use.
         using Reg = A::Ymm;
@@ -2171,7 +2184,7 @@ namespace skvm {
         // We track each instruction's dst in r[] so we can thread it through as an input
         // to any future instructions needing that value.
         //
-        // And some ops may need a temporary scratch register, tmp.  Some need both tmp and dst.
+        // And some ops may need a temporary register, tmp.  Some need both tmp and dst.
         //
         // tmp and dst are very similar and can and will often be assigned the same register,
         // but tmp may never alias any of the instruction's inputs, while dst may when this
@@ -2190,7 +2203,7 @@ namespace skvm {
             if (!tmp_is_set) {
                 tmp_is_set = true;
                 if (int found = __builtin_ffs(avail)) {
-                    // This is a scratch register just for this op,
+                    // This is a temporary register just for this op,
                     // so we leave it marked available for future ops.
                     tmp_reg = (Reg)(found - 1);
                 } else {
@@ -2307,7 +2320,19 @@ namespace skvm {
                     else        { a->vmovups( dst(), arg[immy]); }
                     break;
 
-                case Op::gather32: {
+                case Op::gather32:
+                    if (scalar) {
+                        auto base  = scratch,
+                             index = scratch2;
+                        // Our gather base pointer is immz bytes off of uniform immy.
+                        a->movq(base, arg[immy], immz);
+
+                        // Grab our index from lane 0 of the index argument.
+                        a->vmovd_direct(index, (A::Xmm)r[x]);
+
+                        // dst = *(base + 4*index)
+                        a->vmovd((A::Xmm)dst(), A::FOUR, index, base);
+                    } else {
                         // We may not let any of dst(), index, or mask use the same register,
                         // so we must allocate registers manually and very carefully.
@@ -2335,12 +2360,10 @@ namespace skvm {
                         }
 
                         // Our gather base pointer is immz bytes off of uniform immy.
-                        a->movq(scratch, arg[immy], immz);
+                        auto base = scratch;
+                        a->movq(base, arg[immy], immz);
 
                         a->vpcmpeqd(mask, mask, mask);   // (All lanes enabled.)
-                        a->vgatherdps(dst(), A::FOUR, index, scratch, mask);
-                        // TODO: simpler impl. when scalar == true?
-                        // TODO: at least disable the other mask lanes?
+                        a->vgatherdps(dst(), A::FOUR, index, base, mask);
                     }
                     break;
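
For intuition, here's what the two paths compute, as plain C++ (a semantic
sketch of the per-lane behavior, not the code the JIT emits; the function
names are made up):

    #include <cstdint>

    // Scalar tail: one lane, so a single ordinary 32-bit load suffices.
    // This is what the new vmovd path implements.
    int32_t gather32_scalar(const int32_t* base, int32_t index) {
        return base[index];
    }

    // Full-width body: eight lanes via vgatherdps, all mask lanes enabled.
    void gather32_vector(int32_t dst[8], const int32_t* base, const int32_t ix[8]) {
        for (int lane = 0; lane < 8; lane++) {
            dst[lane] = base[ix[lane]];
        }
    }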

src/core/SkVM.h

@@ -139,6 +139,9 @@ namespace skvm {
         void vpmovzxbd(Ymm dst, GP64 ptr);   // dst = *ptr, 64-bit, each uint8_t expanded to int
 
         void vmovd  (Xmm dst, GP64 ptr);   // dst = *ptr, 32-bit
+        enum Scale { ONE, TWO, FOUR, EIGHT };
+        void vmovd  (Xmm dst, Scale, GP64 index, GP64 base);   // dst = *(base + scale*index), 32-bit
+
         void vmovups(GP64 ptr, Ymm src);   // *ptr = src, 256-bit
         void vmovups(GP64 ptr, Xmm src);   // *ptr = src, 128-bit
         void vmovq  (GP64 ptr, Xmm src);   // *ptr = src, 64-bit
@@ -160,7 +163,6 @@ namespace skvm {
         //         dst = base[scale*ix];
         //     }
         //     mask = 0;
-        enum Scale { ONE, TWO, FOUR, EIGHT };
         void vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask);
 
         // aarch64

tests/SkVMTest.cpp

@@ -1084,6 +1084,10 @@ DEF_TEST(SkVM_Assembler, r) {
     a.vmovd(A::xmm8, A::rax);
     a.vmovd(A::xmm0, A::r8);
 
+    a.vmovd(A::xmm0 , A::FOUR, A::rcx, A::rax);
+    a.vmovd(A::xmm15, A::TWO,  A::r8,  A::rax);
+    a.vmovd(A::xmm0 , A::ONE,  A::rcx, A::r8);
+
     a.vmovd_direct(A::rax, A::xmm0);
     a.vmovd_direct(A::rax, A::xmm8);
     a.vmovd_direct(A::r8, A::xmm0);
@@ -1110,6 +1114,10 @@ DEF_TEST(SkVM_Assembler, r) {
         0xc5,0x79,0x6e,0x00,
         0xc4,0xc1,0x79,0x6e,0x00,
 
+        0xc5,0xf9,0x6e,0x04,0x88,
+        0xc4,0x21,0x79,0x6e,0x3c,0x40,
+        0xc4,0xc1,0x79,0x6e,0x04,0x08,
+
         0xc5,0xf9,0x7e,0xc0,
         0xc5,0x79,0x7e,0xc0,
         0xc4,0xc1,0x79,0x7e,0xc0,
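
For reference, hand-decoding the three new vectors (this breakdown is mine,
not part of the commit; standard VEX/ModRM/SIB field layout):

    c5 f9 6e 04 88       vmovd xmm0,  [rax + rcx*4]   ; 04: rm=0b100 -> SIB follows; 88: scale=4, index=rcx, base=rax
    c4 21 79 6e 3c 40    vmovd xmm15, [rax + r8*2]    ; 3-byte VEX carries the R bit (xmm15) and X bit (r8 index)
    c4 c1 79 6e 04 08    vmovd xmm0,  [r8  + rcx*1]   ; B bit extends the base register to r8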