move SkVM interpreter back to SkVM.cpp

No real pressing reason to have an AVX2-specialized interpreter now that
we've got an AVX2 JIT.  Keeping things centralized makes it easier to
keep track of and helps reduce dependencies on the rest of Skia, which
in turn makes it easier to develop SkVM on wimpy machines like an RPi.

Change-Id: Ic4729603ec6c9141929b3e2a56ba380240ea5e0d
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/224822
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
Author: Mike Klein <mtklein@google.com>  2019-07-01 11:45:13 -05:00, committed by Skia Commit-Bot
parent 1d43530aa2
commit 1e87a5410e
6 changed files with 165 additions and 202 deletions

src/core/SkOpts.cpp

@@ -43,7 +43,6 @@
 #include "src/opts/SkRasterPipeline_opts.h"
 #include "src/opts/SkSwizzler_opts.h"
 #include "src/opts/SkUtils_opts.h"
-#include "src/opts/SkVM_opts.h"
 #include "src/opts/SkXfermode_opts.h"
 
 namespace SkOpts {
@@ -81,8 +80,6 @@ namespace SkOpts {
     DEFINE_DEFAULT(hash_fn);
 
     DEFINE_DEFAULT(S32_alpha_D32_filter_DX);
-
-    DEFINE_DEFAULT(eval);
 #undef DEFINE_DEFAULT
 
 #define M(st) (StageFn)SK_OPTS_NS::st,

src/core/SkOpts.h

@@ -14,8 +14,6 @@
 struct SkBitmapProcState;
 
-namespace skvm { struct ProgramInstruction; }
-
 namespace SkOpts {
     // Call to replace pointers to portable functions with pointers to CPU-specific functions.
     // Thread-safe and idempotent.
@@ -76,9 +74,6 @@ namespace SkOpts {
     extern void (*start_pipeline_lowp)(size_t,size_t,size_t,size_t, void**);
 #undef M
 
-    extern void (*eval)(const skvm::ProgramInstruction[], int ninsts, int nregs, int loop,
-                        int n, void* args[], size_t strides[], int nargs);
 }
 
 #endif//SkOpts_DEFINED
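[Note] The eval pointer removed above followed SkOpts' usual runtime-dispatch pattern: every entry starts out pointing at a portable default, and Init_hsw() et al. overwrite it when the CPU qualifies. A toy sketch of that mechanism, with hypothetical names rather than Skia's actual table:

    #include <cstdio>

    namespace toy_opts {
        int eval_portable(int x) { return x + 1; }
        int eval_hsw     (int x) { return x + 2; }   // pretend AVX2-specialized version

        // Point at the portable default first...
        int (*eval)(int) = eval_portable;

        // ...then swap in the specialized version if the CPU allows it.
        void Init_hsw() { eval = eval_hsw; }
    }

    int main() {
        std::printf("%d\n", toy_opts::eval(1));   // 2, portable
        toy_opts::Init_hsw();
        std::printf("%d\n", toy_opts::eval(1));   // 3, specialized
    }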

src/core/SkVM.cpp

@@ -10,7 +10,6 @@
 #include "include/private/SkThreadID.h"
 #include "include/private/SkVx.h"
 #include "src/core/SkCpu.h"
-#include "src/core/SkOpts.h"
 #include "src/core/SkVM.h"
 #include <string.h>
 
 #if defined(SKVM_JIT)
@@ -1240,7 +1239,16 @@ namespace skvm {
     static SkSpinlock dump_lock;
     SkAutoSpinlock lock(dump_lock);
 
-    uint32_t hash = SkOpts::hash(fJIT.buf, fJIT.size);
+    auto fnv1a = [](const void* vbuf, size_t n) {
+        uint32_t hash = 2166136261;
+        for (auto buf = (const uint8_t*)vbuf; n --> 0; buf++) {
+            hash ^= *buf;
+            hash *= 16777619;
+        }
+        return hash;
+    };
+
+    uint32_t hash = fnv1a(fJIT.buf, fJIT.size);
     SkString name = SkStringPrintf("skvm-jit-%u", hash);
 
     // Create a jit-<pid>.dump file that we can `perf inject -j` into a
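[Note] The lambda added above is 32-bit FNV-1a, inlined here so SkVM.cpp no longer needs SkOpts::hash. The same algorithm as a standalone sketch (2166136261 and 16777619 are the standard FNV offset basis and prime; the main() is only an illustrative harness):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // 32-bit FNV-1a: fold each byte in with xor, then multiply by the FNV prime.
    static uint32_t fnv1a(const void* vbuf, size_t n) {
        uint32_t hash = 2166136261;                // FNV offset basis
        for (auto buf = (const uint8_t*)vbuf; n --> 0; buf++) {
            hash ^= *buf;
            hash *= 16777619;                      // FNV prime
        }
        return hash;
    }

    int main() {
        const char* buf = "skvm-jit";
        std::printf("skvm-jit-%u\n", fnv1a(buf, std::strlen(buf)));
    }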
@@ -1342,9 +1350,157 @@ namespace skvm {
             SkASSERT(arg == args + nargs);
         }
     }
 
-    if (n) {
-        SkOpts::eval(fInstructions.data(), (int)fInstructions.size(), fRegs, fLoop,
-                     n, args, strides, nargs);
-    }
+    // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
+    constexpr int K = 16;
+    using I32   = skvx::Vec<K, int>;
+    using F32   = skvx::Vec<K, float>;
+    using U32   = skvx::Vec<K, uint32_t>;
+    using U8    = skvx::Vec<K, uint8_t>;
+
+    using I16x2 = skvx::Vec<2*K, int16_t>;
+    using U16x2 = skvx::Vec<2*K, uint16_t>;
+
+    union Slot {
+        I32 i32;
+        U32 u32;
+        F32 f32;
+    };
+
+    Slot                     few_regs[16];
+    std::unique_ptr<char[]>  many_regs;
+
+    Slot* regs = few_regs;
+
+    if (fRegs > (int)SK_ARRAY_COUNT(few_regs)) {
+        // Annoyingly we can't trust that malloc() or new will work with Slot because
+        // the skvx::Vec types may have alignment greater than what they provide.
+        // We'll overallocate one extra register so we can align manually.
+        many_regs.reset(new char[ sizeof(Slot) * (fRegs + 1) ]);
+
+        uintptr_t addr = (uintptr_t)many_regs.get();
+        addr += alignof(Slot) -
+                 (addr & (alignof(Slot) - 1));
+        SkASSERT((addr & (alignof(Slot) - 1)) == 0);
+        regs = (Slot*)addr;
+    }
+
+    auto r = [&](Reg id) -> Slot& {
+        SkASSERT(0 <= id && id < fRegs);
+        return regs[id];
+    };
+    auto arg = [&](int ix) {
+        SkASSERT(0 <= ix && ix < nargs);
+        return args[ix];
+    };
+
+    // Step each argument pointer ahead by its stride a number of times.
+    auto step_args = [&](int times) {
+        // Looping by marching pointers until *arg == nullptr helps the
+        // compiler to keep this loop scalar.  Otherwise it'd create a
+        // rather large and useless autovectorized version.
+        void**        arg    = args;
+        const size_t* stride = strides;
+        for (; *arg; arg++, stride++) {
+            *arg = (void*)( (char*)*arg + times * *stride );
+        }
+        SkASSERT(arg == args + nargs);
+    };
+
+    int start = 0,
+        stride;
+    for ( ; n > 0; start = fLoop, n -= stride, step_args(stride)) {
+        stride = n >= K ? K : 1;
+
+        for (int i = start; i < (int)fInstructions.size(); i++) {
+            Instruction inst = fInstructions[i];
+
+            // d = op(x,y,z/imm)
+            Reg   d = inst.d,
+                  x = inst.x,
+                  y = inst.y,
+                  z = inst.z;
+            int imm = inst.imm;
+
+            // Ops that interact with memory need to know whether we're stride=1 or K,
+            // but all non-memory ops can run the same code no matter the stride.
+            switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
+            #define STRIDE_1(op) case 2*(int)op
+            #define STRIDE_K(op) case 2*(int)op + 1
+                STRIDE_1(Op::store8 ): memcpy(arg(imm), &r(x).i32, 1); break;
+                STRIDE_1(Op::store32): memcpy(arg(imm), &r(x).i32, 4); break;
+
+                STRIDE_K(Op::store8 ): skvx::cast<uint8_t>(r(x).i32).store(arg(imm)); break;
+                STRIDE_K(Op::store32):                    (r(x).i32).store(arg(imm)); break;
+
+                STRIDE_1(Op::load8 ): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 1); break;
+                STRIDE_1(Op::load32): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 4); break;
+
+                STRIDE_K(Op::load8 ): r(d).i32 = skvx::cast<int>(U8 ::Load(arg(imm))); break;
+                STRIDE_K(Op::load32): r(d).i32 =                 I32::Load(arg(imm)) ; break;
+            #undef STRIDE_1
+            #undef STRIDE_K
+
+            // Ops that don't interact with memory should never care about the stride.
+            #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1
+                CASE(Op::splat): r(d).i32 = imm; break;
+
+                CASE(Op::add_f32): r(d).f32 = r(x).f32 + r(y).f32; break;
+                CASE(Op::sub_f32): r(d).f32 = r(x).f32 - r(y).f32; break;
+                CASE(Op::mul_f32): r(d).f32 = r(x).f32 * r(y).f32; break;
+                CASE(Op::div_f32): r(d).f32 = r(x).f32 / r(y).f32; break;
+                CASE(Op::mad_f32): r(d).f32 = r(x).f32 * r(y).f32 + r(z).f32; break;
+
+                CASE(Op::add_i32): r(d).i32 = r(x).i32 + r(y).i32; break;
+                CASE(Op::sub_i32): r(d).i32 = r(x).i32 - r(y).i32; break;
+                CASE(Op::mul_i32): r(d).i32 = r(x).i32 * r(y).i32; break;
+
+                CASE(Op::sub_i16x2):
+                    r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<I16x2>(r(x).i32) -
+                                                  skvx::bit_pun<I16x2>(r(y).i32) ); break;
+                CASE(Op::mul_i16x2):
+                    r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<I16x2>(r(x).i32) *
+                                                  skvx::bit_pun<I16x2>(r(y).i32) ); break;
+                CASE(Op::shr_i16x2):
+                    r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<U16x2>(r(x).i32) >> imm);
+                    break;
+
+                CASE(Op::bit_and  ): r(d).i32 = r(x).i32 &  r(y).i32; break;
+                CASE(Op::bit_or   ): r(d).i32 = r(x).i32 |  r(y).i32; break;
+                CASE(Op::bit_xor  ): r(d).i32 = r(x).i32 ^  r(y).i32; break;
+                CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break;
+
+                CASE(Op::shl): r(d).i32 = r(x).i32 << imm; break;
+                CASE(Op::sra): r(d).i32 = r(x).i32 >> imm; break;
+                CASE(Op::shr): r(d).u32 = r(x).u32 >> imm; break;
+
+                CASE(Op::extract): r(d).u32 = (r(x).u32 >> imm) & r(y).u32; break;
+                CASE(Op::pack):    r(d).u32 = r(x).u32 | (r(y).u32 << imm); break;
+
+                CASE(Op::bytes): {
+                    const U32 table[] = {
+                        0,
+                        (r(x).u32      ) & 0xff,
+                        (r(x).u32 >>  8) & 0xff,
+                        (r(x).u32 >> 16) & 0xff,
+                        (r(x).u32 >> 24) & 0xff,
+                    };
+                    r(d).u32 = table[(imm >>  0) & 0xf] <<  0
+                             | table[(imm >>  4) & 0xf] <<  8
+                             | table[(imm >>  8) & 0xf] << 16
+                             | table[(imm >> 12) & 0xf] << 24;
+                } break;
+
+                CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
+                CASE(Op::to_i32): r(d).i32 = skvx::cast<int>  (r(x).f32); break;
+            #undef CASE
+            }
+        }
+    }
 }
 
 }
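[Note] The driving loop above is the heart of the interpreter's SIMT strategy: take K lanes at a time while n >= K, then finish the tail with single-lane passes, with step_args() advancing each varying pointer by stride elements between passes. A stripped-down sketch of just that chunking (the printf stands in for running the instructions):

    #include <cstdio>

    int main() {
        constexpr int K = 16;           // lanes per vectorized pass
        int n = 37;                     // e.g. pixels left to process

        // Same shape as the loop above: K-size chunks while possible, then 1 at a time.
        for (int stride; n > 0; n -= stride) {
            stride = n >= K ? K : 1;
            std::printf("pass of %2d lane%s, %2d left after\n",
                        stride, stride == 1 ? "" : "s", n - stride);
        }
    }

With n = 37 this prints two 16-lane passes followed by five 1-lane passes.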

src/core/SkVM.h

@@ -192,16 +192,13 @@ namespace skvm {
     using Reg = int;
 
-    struct ProgramInstruction {   // d = op(x, y, z/imm)
-        Op op;
-        Reg d,x,y;
-        union { Reg z; int imm; };
-    };
-
     class Program {
     public:
-        // Moved outside Program so it can be forward-declared.
-        using Instruction = ProgramInstruction;
+        struct Instruction {   // d = op(x, y, z/imm)
+            Op op;
+            Reg d,x,y;
+            union { Reg z; int imm; };
+        };
 
         Program(std::vector<Instruction>, int regs, int loop);
         Program() : Program({}, 0, 0) {}
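[Note] Instruction can fold back into Program because SkOpts.h no longer mentions it; the hoisted ProgramInstruction existed only because C++ can forward-declare a free class but never a nested one. A short illustration of that constraint (ProgramInstruction is from the diff, the comments are explanatory):

    namespace skvm {
        struct ProgramInstruction;           // fine: a free struct can be
                                             // forward-declared, so SkOpts.h could
                                             // refer to it without including SkVM.h

        // struct Program::Instruction;      // ill-formed: a nested type can only be
                                             // declared inside its enclosing class,
                                             // never from outside
    }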

src/opts/SkOpts_hsw.cpp

@@ -11,7 +11,6 @@
 #include "src/opts/SkBlitRow_opts.h"
 #include "src/opts/SkRasterPipeline_opts.h"
 #include "src/opts/SkUtils_opts.h"
-#include "src/opts/SkVM_opts.h"
 
 namespace SkOpts {
     void Init_hsw() {
@@ -29,7 +28,5 @@ namespace SkOpts {
         just_return_lowp    = (StageFn)SK_OPTS_NS::lowp::just_return;
         start_pipeline_lowp = SK_OPTS_NS::lowp::start_pipeline;
     #undef M
-
-        eval = hsw::eval;
     }
 }

src/opts/SkVM_opts.h

@@ -1,179 +0,0 @@
-/*
- * Copyright 2019 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#ifndef SkVM_opts_DEFINED
-#define SkVM_opts_DEFINED
-
-#include "src/core/SkVM.h"
-#include "include/private/SkVx.h"
-
-namespace SK_OPTS_NS {
-
-    inline void eval(const skvm::Program::Instruction insts[], const int ninsts,
-                     const int nregs, const int loop,
-                     int n, void* args[], size_t strides[], const int nargs) {
-        using namespace skvm;
-
-        // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
-    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
-        constexpr int K = 32;
-    #else
-        constexpr int K = 16;
-    #endif
-        using I32   = skvx::Vec<K, int>;
-        using F32   = skvx::Vec<K, float>;
-        using U32   = skvx::Vec<K, uint32_t>;
-        using U8    = skvx::Vec<K, uint8_t>;
-
-        using I16x2 = skvx::Vec<2*K, int16_t>;
-        using U16x2 = skvx::Vec<2*K, uint16_t>;
-
-        union Slot {
-            I32 i32;
-            U32 u32;
-            F32 f32;
-        };
-
-        Slot                     few_regs[16];
-        std::unique_ptr<char[]>  many_regs;
-
-        Slot* regs = few_regs;
-
-        if (nregs > (int)SK_ARRAY_COUNT(few_regs)) {
-            // Annoyingly we can't trust that malloc() or new will work with Slot because
-            // the skvx::Vec types may have alignment greater than what they provide.
-            // We'll overallocate one extra register so we can align manually.
-            many_regs.reset(new char[ sizeof(Slot) * (nregs + 1) ]);
-
-            uintptr_t addr = (uintptr_t)many_regs.get();
-            addr += alignof(Slot) -
-                     (addr & (alignof(Slot) - 1));
-            SkASSERT((addr & (alignof(Slot) - 1)) == 0);
-            regs = (Slot*)addr;
-        }
-
-        auto r = [&](Reg id) -> Slot& {
-            SkASSERT(0 <= id && id < nregs);
-            return regs[id];
-        };
-        auto arg = [&](int ix) {
-            SkASSERT(0 <= ix && ix < nargs);
-            return args[ix];
-        };
-
-        // Step each argument pointer ahead by its stride a number of times.
-        auto step_args = [&](int times) {
-            // Looping by marching pointers until *arg == nullptr helps the
-            // compiler to keep this loop scalar.  Otherwise it'd create a
-            // rather large and useless autovectorized version.
-            void**        arg    = args;
-            const size_t* stride = strides;
-            for (; *arg; arg++, stride++) {
-                *arg = (void*)( (char*)*arg + times * *stride );
-            }
-            SkASSERT(arg == args + nargs);
-        };
-
-        int start = 0,
-            stride;
-        for ( ; n > 0; start = loop, n -= stride, step_args(stride)) {
-            stride = n >= K ? K : 1;
-
-            for (int i = start; i < ninsts; i++) {
-                skvm::Program::Instruction inst = insts[i];
-
-                // d = op(x,y,z/imm)
-                Reg   d = inst.d,
-                      x = inst.x,
-                      y = inst.y,
-                      z = inst.z;
-                int imm = inst.imm;
-
-                // Ops that interact with memory need to know whether we're stride=1 or stride=K,
-                // but all non-memory ops can run the same code no matter the stride.
-                switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
-                #define STRIDE_1(op) case 2*(int)op
-                #define STRIDE_K(op) case 2*(int)op + 1
-                    STRIDE_1(Op::store8 ): memcpy(arg(imm), &r(x).i32, 1); break;
-                    STRIDE_1(Op::store32): memcpy(arg(imm), &r(x).i32, 4); break;
-
-                    STRIDE_K(Op::store8 ): skvx::cast<uint8_t>(r(x).i32).store(arg(imm)); break;
-                    STRIDE_K(Op::store32):                    (r(x).i32).store(arg(imm)); break;
-
-                    STRIDE_1(Op::load8 ): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 1); break;
-                    STRIDE_1(Op::load32): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 4); break;
-
-                    STRIDE_K(Op::load8 ): r(d).i32 = skvx::cast<int>(U8 ::Load(arg(imm))); break;
-                    STRIDE_K(Op::load32): r(d).i32 =                 I32::Load(arg(imm)) ; break;
-                #undef STRIDE_1
-                #undef STRIDE_K
-
-                // Ops that don't interact with memory should never care about the stride.
-                #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1
-                    CASE(Op::splat): r(d).i32 = imm; break;
-
-                    CASE(Op::add_f32): r(d).f32 = r(x).f32 + r(y).f32; break;
-                    CASE(Op::sub_f32): r(d).f32 = r(x).f32 - r(y).f32; break;
-                    CASE(Op::mul_f32): r(d).f32 = r(x).f32 * r(y).f32; break;
-                    CASE(Op::div_f32): r(d).f32 = r(x).f32 / r(y).f32; break;
-                    CASE(Op::mad_f32): r(d).f32 = r(x).f32 * r(y).f32 + r(z).f32; break;
-
-                    CASE(Op::add_i32): r(d).i32 = r(x).i32 + r(y).i32; break;
-                    CASE(Op::sub_i32): r(d).i32 = r(x).i32 - r(y).i32; break;
-                    CASE(Op::mul_i32): r(d).i32 = r(x).i32 * r(y).i32; break;
-
-                    CASE(Op::sub_i16x2):
-                        r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<I16x2>(r(x).i32) -
-                                                      skvx::bit_pun<I16x2>(r(y).i32) ); break;
-                    CASE(Op::mul_i16x2):
-                        r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<I16x2>(r(x).i32) *
-                                                      skvx::bit_pun<I16x2>(r(y).i32) ); break;
-                    CASE(Op::shr_i16x2):
-                        r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<U16x2>(r(x).i32) >> imm);
-                        break;
-
-                    CASE(Op::bit_and  ): r(d).i32 = r(x).i32 &  r(y).i32; break;
-                    CASE(Op::bit_or   ): r(d).i32 = r(x).i32 |  r(y).i32; break;
-                    CASE(Op::bit_xor  ): r(d).i32 = r(x).i32 ^  r(y).i32; break;
-                    CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break;
-
-                    CASE(Op::shl): r(d).i32 = r(x).i32 << imm; break;
-                    CASE(Op::sra): r(d).i32 = r(x).i32 >> imm; break;
-                    CASE(Op::shr): r(d).u32 = r(x).u32 >> imm; break;
-
-                    CASE(Op::extract): r(d).u32 = (r(x).u32 >> imm) & r(y).u32; break;
-                    CASE(Op::pack):    r(d).u32 = r(x).u32 | (r(y).u32 << imm); break;
-
-                    CASE(Op::bytes): {
-                        const U32 table[] = {
-                            0,
-                            (r(x).u32      ) & 0xff,
-                            (r(x).u32 >>  8) & 0xff,
-                            (r(x).u32 >> 16) & 0xff,
-                            (r(x).u32 >> 24) & 0xff,
-                        };
-                        r(d).u32 = table[(imm >>  0) & 0xf] <<  0
-                                 | table[(imm >>  4) & 0xf] <<  8
-                                 | table[(imm >>  8) & 0xf] << 16
-                                 | table[(imm >> 12) & 0xf] << 24;
-                    } break;
-
-                    CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
-                    CASE(Op::to_i32): r(d).i32 = skvx::cast<int>  (r(x).f32); break;
-                #undef CASE
-                }
-            }
-        }
-    }
-
-}  // namespace SK_OPTS_NS
-
-#endif//SkVM_opts_DEFINED
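[Note] Both the deleted copy and the one now in SkVM.cpp align the spilled register array by hand, since new char[] only guarantees alignment suitable for ordinary types while a wide skvx::Vec can demand more. A minimal sketch of the same overallocate-and-round trick (the 64-byte Slot here stands in for the real union of K-wide vectors):

    #include <cassert>
    #include <cstdint>
    #include <memory>

    struct alignas(64) Slot { float lanes[16]; };  // stand-in for the interpreter's Slot

    int main() {
        const int nregs = 32;

        // Overallocate one extra Slot's worth of bytes...
        std::unique_ptr<char[]> many_regs(new char[sizeof(Slot) * (nregs + 1)]);

        // ...then bump the base address up to the next multiple of alignof(Slot).
        uintptr_t addr = (uintptr_t)many_regs.get();
        addr += alignof(Slot) - (addr & (alignof(Slot) - 1));
        assert((addr & (alignof(Slot) - 1)) == 0);

        Slot* regs = (Slot*)addr;
        (void)regs;   // nregs aligned Slots now live at regs[0..nregs-1]
    }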