Revert "Reland "gather8/16 JIT support""

This reverts commit 1283d55f35. Reason for revert: one more try... Original change's description: > Reland "gather8/16 JIT support" > > This is a reland of 54659e51bc > > ... now expecting not to JIT when under ASAN/MSAN. > > Original change's description: > > gather8/16 JIT support > > > > The basic strategy is one at a time, inserting 8- or 16-bit values > > into an Xmm register, then expanding to 32-bit in a Ymm at the end > > using vpmovzx{b,w}d instructions. > > > > Somewhat annoyingly we can only pull indices from an Xmm register, > > so we grab the first four then shift down the top before the rest. > > > > Added a unit test to get coverage where the indices are reused and > > not consumed directly by the gather instruction. It's an important > > case, needing to find another register for accum that can't just be > > dst(), but there's no natural coverage of that anywhere. > > > > Change-Id: I8189ead2364060f10537a2f9364d63338a7e596f > > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284311 > > Reviewed-by: Herb Derby <herb@google.com> > > Commit-Queue: Mike Klein <mtklein@google.com> > > Change-Id: I67f441615b312b47e7a3182e85e0f787286d7717 > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284472 > Reviewed-by: Herb Derby <herb@google.com> > Commit-Queue: Mike Klein <mtklein@google.com> TBR=mtklein@google.com,herb@google.com Change-Id: I953fcd2aef308fd901880618fa540ac9f6d88e84 No-Presubmit: true No-Tree-Checks: true No-Try: true Reviewed-on: https://skia-review.googlesource.com/c/skia/+/284503 Reviewed-by: Mike Klein <mtklein@google.com> Commit-Queue: Mike Klein <mtklein@google.com>
2020-04-20 17:05:43 +00:00 · 2020-04-20 17:05:43 +00:00 · 7d5342d9ac
commit 7d5342d9ac
parent 1283d55f35
3 changed files with 26 additions and 163 deletions
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@ -36,6 +36,18 @@

 bool gSkVMJITViaDylib{false};

+// JIT code isn't MSAN-instrumented, so we won't see when it uses
+// uninitialized memory, and we'll not see the writes it makes as properly
+// initializing memory.  Instead force the interpreter, which should let
+// MSAN see everything our programs do properly.
+//
+// Similarly, we can't get ASAN's checks unless we let it instrument our interpreter.
+#if defined(__has_feature)
+    #if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer)
+        #undef SKVM_JIT
+    #endif
+#endif
+
 #if defined(SKVM_JIT)
    #include <dlfcn.h>      // dlopen, dlsym
    #include <sys/mman.h>   // mmap, mprotect
@ -1987,14 +1999,6 @@ namespace skvm {
        this->byte(imm);
    }

-    void Assembler::vextracti128(Operand dst, Ymm src, int imm) {
-        this->op(0x66,0x3a0f,0x39, src,dst);
-        this->byte(imm);
-    }
-    void Assembler::vpextrd(Operand dst, Xmm src, int imm) {
-        this->op(0x66,0x3a0f,0x16, src,dst);
-        this->byte(imm);
-    }
    void Assembler::vpextrw(Operand dst, Xmm src, int imm) {
        this->op(0x66,0x3a0f,0x15, src,dst);
        this->byte(imm);
@ -3006,6 +3010,12 @@ namespace skvm {
            //
            // Now let's actually assemble the instruction!
            switch (op) {
+                default:
+                    if (debug_dump()) {
+                        SkDEBUGFAILF("\nOp::%s (%d) not yet implemented\n", name(op), op);
+                    }
+                    return false;  // TODO: many new ops
+
            #if defined(__x86_64__)
                case Op::assert_true: {
                    a->vptest (r[x], &constants[0xffffffff].label);
@ -3050,76 +3060,10 @@ namespace skvm {
                                 else        { a->vmovups(        dst(), A::Mem{arg[immy]}); }
                                 break;

-                case Op::gather8: {
-                    A::GP64 base = scratch,
-                           index = scratch2;
-
-                    // As usual, the gather base pointer is immz bytes off of uniform immy.
-                    a->mov(base, A::Mem{arg[immy], immz});
-
-                    // We'll need two distinct temporary vector registers:
-                    //   - tmp() to hold our indices;
-                    //   - accum to hold our partial gathered result.
-                    a->vmovdqa(tmp(), r[x]);
-
-                    // accum can be any register, even dst(), as long as it's not the same as tmp().
-                    A::Xmm accum;
-                    if (dst() != tmp()) {
-                        accum = (A::Xmm)dst();
-                    } else if (int found = __builtin_ffs(avail & ~(1<<tmp()))) {
-                        accum = (A::Xmm)(found-1);
-                    } else {
-                        ok = false;
-                        break;
-                    }
-                    SkASSERT((A::Xmm)tmp() != accum);
-
-                    for (int i = 0; i < (scalar ? 1 : 8); i++) {
-                        if (i == 4) {
-                            // vpextrd can only pluck indices out from an Xmm register,
-                            // so we manually swap over to the top when we're halfway through.
-                            a->vextracti128((A::Xmm)tmp(), tmp(), 1);
-                        }
-                        a->vpextrd(index, (A::Xmm)tmp(), i%4);
-                        a->vpinsrb(accum, accum, A::Mem{base,0,index,A::ONE}, i);
-                    }
-                    a->vpmovzxbd(dst(), accum);
-                } break;
-
-                case Op::gather16: {
-                    // Just as gather8 except vpinsrb->vpinsrw, ONE->TWO, and vpmovzxbd->vpmovzxwd.
-                    A::GP64 base = scratch,
-                           index = scratch2;
-
-                    a->mov(base, A::Mem{arg[immy], immz});
-
-                    a->vmovdqa(tmp(), r[x]);
-
-                    A::Xmm accum;
-                    if (dst() != tmp()) {
-                        accum = (A::Xmm)dst();
-                    } else if (int found = __builtin_ffs(avail & ~(1<<tmp()))) {
-                        accum = (A::Xmm)(found-1);
-                    } else {
-                        ok = false;
-                        break;
-                    }
-                    SkASSERT((A::Xmm)tmp() != accum);
-
-                    for (int i = 0; i < (scalar ? 1 : 8); i++) {
-                        if (i == 4) {
-                            a->vextracti128((A::Xmm)tmp(), tmp(), 1);
-                        }
-                        a->vpextrd(index, (A::Xmm)tmp(), i%4);
-                        a->vpinsrw(accum, accum, A::Mem{base,0,index,A::TWO}, i);
-                    }
-                    a->vpmovzxwd(dst(), accum);
-                } break;
-
                case Op::gather32:
                if (scalar) {
-                    A::GP64 base = scratch,
-                           index = scratch2;
+                    auto base  = scratch,
+                         index = scratch2;
                    // Our gather base pointer is immz bytes off of uniform immy.
                    a->mov(base, A::Mem{arg[immy], immz});

@ -3156,7 +3100,7 @@ namespace skvm {
                    }

                    // Our gather base pointer is immz bytes off of uniform immy.
-                    A::GP64 base = scratch;
+                    auto base = scratch;
                    a->mov(base, A::Mem{arg[immy], immz});
                    a->vpcmpeqd(mask, mask, mask);   // (All lanes enabled.)
                    a->vgatherdps(dst(), A::FOUR, index, base, mask);
@ -3263,12 +3207,6 @@ namespace skvm {
                case Op::round : a->vcvtps2dq (dst(), r[x]); break;

            #elif defined(__aarch64__)
-                default:
-                    if (debug_dump()) {
-                        SkDEBUGFAILF("\nOp::%s (%d) not yet implemented\n", name(op), op);
-                    }
-                    return false;  // TODO: many new ops
-
                case Op::assert_true: {
                    a->uminv4s(tmp(), r[x]);   // uminv acts like an all() across the vector.
                    a->fmovs(scratch, tmp());
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@ -23,18 +23,6 @@ class SkWStream;
    #define SKVM_LLVM
 #endif

-// JIT code isn't MSAN-instrumented, so we won't see when it uses
-// uninitialized memory, and we'll not see the writes it makes as properly
-// initializing memory.  Instead force the interpreter, which should let
-// MSAN see everything our programs do properly.
-//
-// Similarly, we can't get ASAN's checks unless we let it instrument our interpreter.
-#if defined(__has_feature)
-    #if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer)
-        #undef SKVM_JIT
-    #endif
-#endif
-
 namespace skvm {

    bool fma_supported();
@ -199,10 +187,8 @@ namespace skvm {
        void vpinsrw(Xmm dst, Xmm src, Operand y, int imm);  // dst = src; dst[imm] = y, 16-bit
        void vpinsrb(Xmm dst, Xmm src, Operand y, int imm);  // dst = src; dst[imm] = y,  8-bit

-        void vextracti128(Operand dst, Ymm src, int imm);    // dst = src[imm], 128-bit
-        void vpextrd     (Operand dst, Xmm src, int imm);    // dst = src[imm],  32-bit
-        void vpextrw     (Operand dst, Xmm src, int imm);    // dst = src[imm],  16-bit
-        void vpextrb     (Operand dst, Xmm src, int imm);    // dst = src[imm],   8-bit
+        void vpextrw(Operand dst, Xmm src, int imm);         // dst = src[imm]           , 16-bit
+        void vpextrb(Operand dst, Xmm src, int imm);         // dst = src[imm]           ,  8-bit

        // if (mask & 0x8000'0000) {
        //     dst = base[scale*ix];
--- a/tests/SkVMTest.cpp
+++ b/tests/SkVMTest.cpp
@ -36,10 +36,10 @@ template <typename Fn>
 static void test_jit_and_interpreter(skvm::Program&& program, Fn&& test) {
 #if defined(SKVM_LLVM)
    SkASSERT(program.hasJIT());
-#elif defined(SKVM_JIT) && defined(SK_CPU_X86)
-    SkASSERT(program.hasJIT());
+#elif defined(SKVM_JIT) && defined(SK_CPU_X86)  // soon!
+    // SkASSERT(program.hasJIT());
 #elif defined(SKVM_JIT)                         // eventually!
-    //SkASSERT(program.hasJIT());
+    // SkASSERT(program.hasJIT());
 #else
    SkASSERT(!program.hasJIT());
 #endif
@ -439,55 +439,6 @@ DEF_TEST(SkVM_gathers, r) {
    });
 }

-DEF_TEST(SkVM_gathers2, r) {
-    skvm::Builder b;
-    {
-        skvm::Arg uniforms = b.uniform(),
-                  buf32    = b.varying<int>(),
-                  buf16    = b.varying<uint16_t>(),
-                  buf8     = b.varying<uint8_t>();
-
-        skvm::I32 x = b.load32(buf32);
-
-        b.store32(buf32, b.gather32(uniforms,0, x));
-        b.store16(buf16, b.gather16(uniforms,0, x));
-        b.store8 (buf8 , b.gather8 (uniforms,0, x));
-    }
-
-    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
-        uint8_t img[256];
-        for (int i = 0; i < 256; i++) {
-            img[i] = i;
-        }
-
-        int      buf32[64];
-        uint16_t buf16[64];
-        uint8_t  buf8 [64];
-
-        for (int i = 0; i < 64; i++) {
-            buf32[i] = (i*47)&63;
-            buf16[i] = 0;
-            buf8 [i] = 0;
-        }
-
-        struct Uniforms {
-            const uint8_t* img;
-        } uniforms{img};
-
-        program.eval(64, &uniforms, buf32, buf16, buf8);
-
-        for (int i = 0; i < 64; i++) {
-            REPORTER_ASSERT(r, buf8[i] == ((i*47)&63));  // 0,47,30,13,60,...
-        }
-
-        REPORTER_ASSERT(r, buf16[ 0] == 0x0100);
-        REPORTER_ASSERT(r, buf16[63] == 0x2322);
-
-        REPORTER_ASSERT(r, buf32[ 0] == 0x03020100);
-        REPORTER_ASSERT(r, buf32[63] == 0x47464544);
-    });
-}
-
 DEF_TEST(SkVM_bitops, r) {
    skvm::Builder b;
    {
@ -1467,12 +1418,6 @@ DEF_TEST(SkVM_Assembler, r) {
        a.vpinsrb(A::xmm1, A::xmm8, A::Mem{A::rsi}, 4);   // vpinsrb $4, (%rsi), %xmm8, %xmm1
        a.vpinsrb(A::xmm8, A::xmm1, A::Mem{A::r8 }, 12);  // vpinsrb $4, (%rsi), %xmm8, %xmm1

-        a.vextracti128(A::xmm1, A::ymm8, 1);  // vextracti128 $1, %ymm8, %xmm1
-        a.vextracti128(A::xmm8, A::ymm1, 0);  // vextracti128 $0, %ymm1, %xmm8
-
-        a.vpextrd(A::Mem{A::rsi}, A::xmm8, 3);  // vpextrd  $3, %xmm8, (%rsi)
-        a.vpextrd(A::Mem{A::r8 }, A::xmm1, 2);  // vpextrd  $2, %xmm1, (%r8)
-
        a.vpextrw(A::Mem{A::rsi}, A::xmm8, 7);
        a.vpextrw(A::Mem{A::r8 }, A::xmm1, 15);

@ -1485,12 +1430,6 @@ DEF_TEST(SkVM_Assembler, r) {
        0xc4,0xe3,0x39, 0x20, 0x0e,  4,
        0xc4,0x43,0x71, 0x20, 0x00, 12,

-        0xc4,0x63,0x7d,0x39,0xc1, 1,
-        0xc4,0xc3,0x7d,0x39,0xc8, 0,
-
-        0xc4,0x63,0x79,0x16,0x06, 3,
-        0xc4,0xc3,0x79,0x16,0x08, 2,
-
        0xc4,0x63,0x79, 0x15, 0x06,  7,
        0xc4,0xc3,0x79, 0x15, 0x08, 15,