gather8/16 JIT support

The basic strategy is to gather one value at a time, inserting 8- or
16-bit values into an Xmm register, then expanding them to 32-bit
lanes in a Ymm at the end using vpmovzx{b,w}d instructions.
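
For reference, here's the per-lane effect in plain C++: a minimal
sketch of what a JIT'd gather8 computes (the function name and
signature are illustrative, not part of skvm; gather16 is the same
with uint16_t loads):

    #include <cstdint>

    // Load one byte per lane at its index, then zero-extend it to a
    // full 32-bit lane -- the zero-extension is what vpmovzxbd does
    // in bulk at the end.
    static void gather8_reference(uint32_t dst[8], const uint8_t* base,
                                  const int32_t ix[8]) {
        for (int i = 0; i < 8; i++) {
            dst[i] = (uint32_t)base[ix[i]];
        }
    }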

Somewhat annoyingly, vpextrd can only pull indices from an Xmm
register, so we grab the first four, then shift the top half of the
Ymm down before extracting the rest.
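
The same dance written with AVX2 intrinsics, for illustration only
(a sketch assuming a Ymm holding eight 32-bit indices; this is not
the code the JIT emits):

    #include <immintrin.h>

    // vpextrd can only read an Xmm, so the top four indices must be
    // moved down into their own Xmm first (that's vextracti128).
    static void extract_indices(__m256i ix, int out[8]) {
        __m128i lo = _mm256_castsi256_si128(ix);
        __m128i hi = _mm256_extracti128_si256(ix, 1);
        out[0] = _mm_extract_epi32(lo, 0);
        out[1] = _mm_extract_epi32(lo, 1);
        out[2] = _mm_extract_epi32(lo, 2);
        out[3] = _mm_extract_epi32(lo, 3);
        out[4] = _mm_extract_epi32(hi, 0);
        out[5] = _mm_extract_epi32(hi, 1);
        out[6] = _mm_extract_epi32(hi, 2);
        out[7] = _mm_extract_epi32(hi, 3);
    }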

Added a unit test to get coverage where the indices are reused and
not consumed directly by the gather instruction.  It's an important
case: it forces us to find another register for accum that can't
simply be dst(), and nothing else covers that naturally.
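
That fallback scans the avail bitmask with __builtin_ffs; here's a
standalone toy of the search (the register numbers are made up):

    #include <cstdio>

    int main() {
        unsigned avail = 0b10110;  // pretend registers 1, 2, 4 are free
        int tmp = 1;               // tmp() already holds the indices
        // __builtin_ffs is 1-based, so found-1 is the lowest free
        // register once tmp is masked out; 0 would mean nothing free.
        int found = __builtin_ffs(avail & ~(1u << tmp));
        if (found) {
            printf("accum = register %d\n", found - 1);  // register 2
        }
        return 0;
    }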

Change-Id: I8189ead2364060f10537a2f9364d63338a7e596f
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284311
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
Mike Klein 2020-04-17 13:57:13 -05:00, committed by Skia Commit-Bot
parent f95e2f42a8
commit 54659e51bc
3 changed files with 149 additions and 12 deletions

src/core/SkVM.cpp

@@ -1999,6 +1999,14 @@ namespace skvm {
this->byte(imm);
}
void Assembler::vextracti128(Operand dst, Ymm src, int imm) {
this->op(0x66,0x3a0f,0x39, src,dst);
this->byte(imm);
}
void Assembler::vpextrd(Operand dst, Xmm src, int imm) {
this->op(0x66,0x3a0f,0x16, src,dst);
this->byte(imm);
}
void Assembler::vpextrw(Operand dst, Xmm src, int imm) {
this->op(0x66,0x3a0f,0x15, src,dst);
this->byte(imm);
@@ -3010,12 +3018,6 @@ namespace skvm {
//
// Now let's actually assemble the instruction!
switch (op) {
- default:
- if (debug_dump()) {
- SkDEBUGFAILF("\nOp::%s (%d) not yet implemented\n", name(op), op);
- }
- return false; // TODO: many new ops
#if defined(__x86_64__)
case Op::assert_true: {
a->vptest (r[x], &constants[0xffffffff].label);
@@ -3060,10 +3062,76 @@ namespace skvm {
else { a->vmovups( dst(), A::Mem{arg[immy]}); }
break;
case Op::gather8: {
A::GP64 base = scratch,
index = scratch2;
// As usual, the gather base pointer is immz bytes off of uniform immy.
a->mov(base, A::Mem{arg[immy], immz});
// We'll need two distinct temporary vector registers:
// - tmp() to hold our indices;
// - accum to hold our partial gathered result.
a->vmovdqa(tmp(), r[x]);
// accum can be any register, even dst(), as long as it's not the same as tmp().
A::Xmm accum;
if (dst() != tmp()) {
accum = (A::Xmm)dst();
} else if (int found = __builtin_ffs(avail & ~(1<<tmp()))) {
accum = (A::Xmm)(found-1);
} else {
ok = false;
break;
}
SkASSERT((A::Xmm)tmp() != accum);
for (int i = 0; i < (scalar ? 1 : 8); i++) {
if (i == 4) {
// vpextrd can only pluck indices out from an Xmm register,
// so we manually swap over to the top when we're halfway through.
a->vextracti128((A::Xmm)tmp(), tmp(), 1);
}
a->vpextrd(index, (A::Xmm)tmp(), i%4);
a->vpinsrb(accum, accum, A::Mem{base,0,index,A::ONE}, i);
}
a->vpmovzxbd(dst(), accum);
} break;
case Op::gather16: {
// Just like gather8, except vpinsrb->vpinsrw, ONE->TWO, and vpmovzxbd->vpmovzxwd.
A::GP64 base = scratch,
index = scratch2;
a->mov(base, A::Mem{arg[immy], immz});
a->vmovdqa(tmp(), r[x]);
A::Xmm accum;
if (dst() != tmp()) {
accum = (A::Xmm)dst();
} else if (int found = __builtin_ffs(avail & ~(1<<tmp()))) {
accum = (A::Xmm)(found-1);
} else {
ok = false;
break;
}
SkASSERT((A::Xmm)tmp() != accum);
for (int i = 0; i < (scalar ? 1 : 8); i++) {
if (i == 4) {
a->vextracti128((A::Xmm)tmp(), tmp(), 1);
}
a->vpextrd(index, (A::Xmm)tmp(), i%4);
a->vpinsrw(accum, accum, A::Mem{base,0,index,A::TWO}, i);
}
a->vpmovzxwd(dst(), accum);
} break;
case Op::gather32:
if (scalar) {
- auto base = scratch,
- index = scratch2;
+ A::GP64 base = scratch,
+ index = scratch2;
// Our gather base pointer is immz bytes off of uniform immy.
a->mov(base, A::Mem{arg[immy], immz});
@@ -3100,7 +3168,7 @@ namespace skvm {
}
// Our gather base pointer is immz bytes off of uniform immy.
- auto base = scratch;
+ A::GP64 base = scratch;
a->mov(base, A::Mem{arg[immy], immz});
a->vpcmpeqd(mask, mask, mask); // (All lanes enabled.)
a->vgatherdps(dst(), A::FOUR, index, base, mask);
@@ -3207,6 +3275,12 @@ namespace skvm {
case Op::round : a->vcvtps2dq (dst(), r[x]); break;
#elif defined(__aarch64__)
+ default:
+ if (debug_dump()) {
+ SkDEBUGFAILF("\nOp::%s (%d) not yet implemented\n", name(op), op);
+ }
+ return false; // TODO: many new ops
case Op::assert_true: {
a->uminv4s(tmp(), r[x]); // uminv acts like an all() across the vector.
a->fmovs(scratch, tmp());

src/core/SkVM.h

@@ -187,8 +187,10 @@ namespace skvm {
void vpinsrw(Xmm dst, Xmm src, Operand y, int imm); // dst = src; dst[imm] = y, 16-bit
void vpinsrb(Xmm dst, Xmm src, Operand y, int imm); // dst = src; dst[imm] = y, 8-bit
- void vpextrw(Operand dst, Xmm src, int imm); // dst = src[imm] , 16-bit
- void vpextrb(Operand dst, Xmm src, int imm); // dst = src[imm] , 8-bit
+ void vextracti128(Operand dst, Ymm src, int imm); // dst = src[imm], 128-bit
+ void vpextrd (Operand dst, Xmm src, int imm); // dst = src[imm], 32-bit
+ void vpextrw (Operand dst, Xmm src, int imm); // dst = src[imm], 16-bit
+ void vpextrb (Operand dst, Xmm src, int imm); // dst = src[imm], 8-bit
// if (mask & 0x8000'0000) {
// dst = base[scale*ix];

tests/SkVMTest.cpp

@@ -37,7 +37,7 @@ static void test_jit_and_interpreter(skvm::Program&& program, Fn&& test) {
#if defined(SKVM_LLVM)
SkASSERT(program.hasJIT());
#elif defined(SKVM_JIT) && defined(SK_CPU_X86) // soon!
- // SkASSERT(program.hasJIT());
+ SkASSERT(program.hasJIT());
#elif defined(SKVM_JIT) // eventually!
// SkASSERT(program.hasJIT());
#else
@@ -439,6 +439,55 @@ DEF_TEST(SkVM_gathers, r) {
});
}
DEF_TEST(SkVM_gathers2, r) {
skvm::Builder b;
{
skvm::Arg uniforms = b.uniform(),
buf32 = b.varying<int>(),
buf16 = b.varying<uint16_t>(),
buf8 = b.varying<uint8_t>();
skvm::I32 x = b.load32(buf32);
b.store32(buf32, b.gather32(uniforms,0, x));
b.store16(buf16, b.gather16(uniforms,0, x));
b.store8 (buf8 , b.gather8 (uniforms,0, x));
}
test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
uint8_t img[256];
for (int i = 0; i < 256; i++) {
img[i] = i;
}
int buf32[64];
uint16_t buf16[64];
uint8_t buf8 [64];
for (int i = 0; i < 64; i++) {
buf32[i] = (i*47)&63;
buf16[i] = 0;
buf8 [i] = 0;
}
struct Uniforms {
const uint8_t* img;
} uniforms{img};
program.eval(64, &uniforms, buf32, buf16, buf8);
for (int i = 0; i < 64; i++) {
REPORTER_ASSERT(r, buf8[i] == ((i*47)&63)); // 0,47,30,13,60,...
}
REPORTER_ASSERT(r, buf16[ 0] == 0x0100);
REPORTER_ASSERT(r, buf16[63] == 0x2322);
REPORTER_ASSERT(r, buf32[ 0] == 0x03020100);
REPORTER_ASSERT(r, buf32[63] == 0x47464544);
});
}
DEF_TEST(SkVM_bitops, r) {
skvm::Builder b;
{
@@ -1418,6 +1467,12 @@ DEF_TEST(SkVM_Assembler, r) {
a.vpinsrb(A::xmm1, A::xmm8, A::Mem{A::rsi}, 4); // vpinsrb $4, (%rsi), %xmm8, %xmm1
a.vpinsrb(A::xmm8, A::xmm1, A::Mem{A::r8 }, 12); // vpinsrb $12, (%r8), %xmm1, %xmm8
a.vextracti128(A::xmm1, A::ymm8, 1); // vextracti128 $1, %ymm8, %xmm1
a.vextracti128(A::xmm8, A::ymm1, 0); // vextracti128 $0, %ymm1, %xmm8
a.vpextrd(A::Mem{A::rsi}, A::xmm8, 3); // vpextrd $3, %xmm8, (%rsi)
a.vpextrd(A::Mem{A::r8 }, A::xmm1, 2); // vpextrd $2, %xmm1, (%r8)
a.vpextrw(A::Mem{A::rsi}, A::xmm8, 7);
a.vpextrw(A::Mem{A::r8 }, A::xmm1, 15);
@@ -1430,6 +1485,12 @@ DEF_TEST(SkVM_Assembler, r) {
0xc4,0xe3,0x39, 0x20, 0x0e, 4,
0xc4,0x43,0x71, 0x20, 0x00, 12,
0xc4,0x63,0x7d,0x39,0xc1, 1,
0xc4,0xc3,0x7d,0x39,0xc8, 0,
0xc4,0x63,0x79,0x16,0x06, 3,
0xc4,0xc3,0x79,0x16,0x08, 2,
0xc4,0x63,0x79, 0x15, 0x06, 7,
0xc4,0xc3,0x79, 0x15, 0x08, 15,