Interpreter: Support striped inputs for less overhead

Change-Id: I8c7bd5ed3fb6aebbfb1c5c224acfd73862252621 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/220778 Commit-Queue: Brian Osman <brianosman@google.com> Reviewed-by: Mike Klein <mtklein@google.com> Reviewed-by: Mike Reed <reed@google.com>
2019-06-18 11:03:10 -04:00 · 2019-06-18 11:03:10 -04:00 · 5d89b66ff6
commit 5d89b66ff6
parent fb32ddf622
6 changed files with 121 additions and 23 deletions
--- a/bench/SkSLInterpreterBench.cpp
+++ b/bench/SkSLInterpreterBench.cpp
@ -12,10 +12,11 @@
 // Benchmarks the interpreter with a function that has a color-filter style signature
 class SkSLInterpreterCFBench : public Benchmark {
 public:
-    SkSLInterpreterCFBench(SkSL::String name, int pixels, const char* src)
-        : fName(SkStringPrintf("sksl_interp_cf_%d_%s", pixels, name.c_str()))
+    SkSLInterpreterCFBench(SkSL::String name, int pixels, bool striped, const char* src)
+        : fName(SkStringPrintf("sksl_interp_cf_%d_%d_%s", pixels, striped ? 1 : 0, name.c_str()))
        , fSrc(src)
-        , fCount(pixels) {}
+        , fCount(pixels)
+        , fStriped(striped) {}

 protected:
    const char* onGetName() override {
@ -44,7 +45,18 @@ protected:

    void onDraw(int loops, SkCanvas*) override {  // runs fMain `loops` times
        for (int i = 0; i < loops; i++) {
-            fByteCode->run(fMain, fPixels.data(), nullptr, fCount, nullptr, 0);
+            if (fStriped) {
+                float* args[] = {  // four stripes of fCount floats, back to back in fPixels
+                    fPixels.data() + 0 * fCount,
+                    fPixels.data() + 1 * fCount,
+                    fPixels.data() + 2 * fCount,
+                    fPixels.data() + 3 * fCount,
+                };
+                // args is rebuilt on every loop: runStriped advances the pointers it is given.
+                fByteCode->runStriped(fMain, args, 4, fCount, nullptr, 0);  // no uniforms
+            } else {
+                fByteCode->run(fMain, fPixels.data(), nullptr, fCount, nullptr, 0);
+            }
        }
    }

@ -55,6 +67,7 @@ private:
    const SkSL::ByteCodeFunction* fMain;

    int fCount;
+    bool fStriped;
    std::vector<float> fPixels;

    typedef Benchmark INHERITED;
@ -62,16 +75,16 @@ private:

 ///////////////////////////////////////////////////////////////////////////////

-DEF_BENCH(return new SkSLInterpreterCFBench("lumaToAlpha", 256, R"(
+const char* kLumaToAlphaSrc = R"(
    void main(inout float4 color) {
        color.a = color.r*0.3 + color.g*0.6 + color.b*0.1;
        color.r = 0;
        color.g = 0;
        color.b = 0;
    }
-)"));
+)";

-DEF_BENCH(return new SkSLInterpreterCFBench("hcf", 256, R"(
+const char* kHighContrastFilterSrc = R"(
    half ucontrast_Stage2;
    half hue2rgb_Stage2(half p, half q, half t) {
        if (t < 0)  t += 1;
@ -129,7 +142,13 @@ DEF_BENCH(return new SkSLInterpreterCFBench("hcf", 256, R"(
        color.rgb = sqrt(color.rgb);
        color.rgb *= color.a;
    }
-)"));
+)";
+
+DEF_BENCH(return new SkSLInterpreterCFBench("lumaToAlpha", 256, false, kLumaToAlphaSrc));
+DEF_BENCH(return new SkSLInterpreterCFBench("lumaToAlpha", 256, true, kLumaToAlphaSrc));
+// Each source is registered twice: striped=false goes through run(), true through runStriped().
+DEF_BENCH(return new SkSLInterpreterCFBench("hcf", 256, false, kHighContrastFilterSrc));
+DEF_BENCH(return new SkSLInterpreterCFBench("hcf", 256, true, kHighContrastFilterSrc));

 class SkSLInterpreterSortBench : public Benchmark {
 public:
--- a/src/core/SkColorFilter.cpp
+++ b/src/core/SkColorFilter.cpp
@ -404,13 +404,7 @@ public:
            };
            rec.fPipeline->append(SkRasterPipeline::callback, ctx);
        } else {
-            struct InterpreterCtx : public SkRasterPipeline_CallbackCtx {
-                SkSL::ByteCode* byteCode;
-                SkSL::ByteCodeFunction* main;
-                const void* inputs;
-                int ninputs;
-            };
-            auto ctx = rec.fAlloc->make<InterpreterCtx>();
+            auto ctx = rec.fAlloc->make<SkRasterPipeline_InterpreterCtx>();
            ctx->inputs = fInputs->data();
            ctx->ninputs = fInputs->size() / 4;

@ -427,13 +421,8 @@ public:
                fByteCode = c.toByteCode(*prog);
            }
            ctx->byteCode = fByteCode.get();
-            ctx->main = ctx->byteCode->fFunctions[0].get();
-            ctx->fn = [](SkRasterPipeline_CallbackCtx* arg, int active_pixels) {
-                auto ctx = (InterpreterCtx*)arg;
-                ctx->byteCode->run(ctx->main, ctx->rgba, nullptr, active_pixels,
-                                   (float*)ctx->inputs, ctx->ninputs);
-            };
-            rec.fPipeline->append(SkRasterPipeline::callback, ctx);
+            ctx->fn = ctx->byteCode->fFunctions[0].get();
+            rec.fPipeline->append(SkRasterPipeline::interpreter, ctx);
        }
        return true;
    }
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@ -33,7 +33,7 @@
 */

 #define SK_RASTER_PIPELINE_STAGES(M)                               \
-    M(callback)                                                    \
+    M(callback) M(interpreter)                                     \
    M(move_src_dst) M(move_dst_src)                                \
    M(clamp_0) M(clamp_1) M(clamp_a) M(clamp_gamut)                \
    M(unpremul) M(premul) M(premul_dst)                            \
@ -150,6 +150,19 @@ struct SkRasterPipeline_CallbackCtx {
    float* read_from = rgba;
 };

+namespace SkSL {  // forward declarations for the pointer members of the interpreter context
+struct ByteCode;
+struct ByteCodeFunction;
+}
+
+struct SkRasterPipeline_InterpreterCtx {  // context consumed by the `interpreter` stage
+    SkSL::ByteCode*         byteCode;  // program on which the stage calls runStriped()
+    SkSL::ByteCodeFunction* fn;        // function handed to runStriped()
+    // Uniforms: the stage casts inputs to const float* and passes ninputs as the uniform count.
+    const void* inputs;
+    int         ninputs;
+};
+
 struct SkRasterPipeline_GradientCtx {
    size_t stopCount;
    float* fs[4];
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@ -10,6 +10,7 @@

 #include "include/core/SkTypes.h"
 #include "src/core/SkUtils.h"  // unaligned_{load,store}
+#include "src/sksl/SkSLByteCode.h"

 // Every function in this file should be marked static and inline using SI.
 #if defined(__clang__)
@ -2552,6 +2553,27 @@ STAGE(callback, SkRasterPipeline_CallbackCtx* c) {
    load4(c->read_from,0, &r,&g,&b,&a);
 }

+STAGE(interpreter, SkRasterPipeline_InterpreterCtx* c) {  // runs c->fn over the r,g,b,a lanes
+    float rr[N];
+    float gg[N];
+    float bb[N];
+    float aa[N];
+    // Spill each channel into its own N-float array, giving runStriped one stripe per channel.
+    sk_unaligned_store(rr, r);
+    sk_unaligned_store(gg, g);
+    sk_unaligned_store(bb, b);
+    sk_unaligned_store(aa, a);
+    // Lane count handed to the interpreter: tail when it is non-zero, otherwise the full width N.
+    float* args[] = { rr, gg, bb, aa };
+    c->byteCode->runStriped(c->fn, args, SK_ARRAY_COUNT(args), tail ? tail : N,
+                            (const float*)c->inputs, c->ninputs);
+    // Reload every channel from its array; runStriped copies out-parameters back into them.
+    r = sk_unaligned_load<F>(rr);
+    g = sk_unaligned_load<F>(gg);
+    b = sk_unaligned_load<F>(bb);
+    a = sk_unaligned_load<F>(aa);
+}
+
 STAGE(gauss_a_to_rgba, Ctx::None) {
    // x = 1 - x;
    // exp(-x * x * 4) - 0.018f;
@ -3830,6 +3852,7 @@ STAGE_PP(swizzle, void* ctx) {
 // If a pipeline uses these stages, it'll boot it out of lowp into highp.
 #define NOT_IMPLEMENTED(st) static void (*st)(void) = nullptr;
    NOT_IMPLEMENTED(callback)
+    NOT_IMPLEMENTED(interpreter)
    NOT_IMPLEMENTED(unbounded_set_rgb)
    NOT_IMPLEMENTED(unbounded_uniform_color)
    NOT_IMPLEMENTED(unpremul)
--- a/src/sksl/SkSLByteCode.cpp
+++ b/src/sksl/SkSLByteCode.cpp
@ -1041,6 +1041,57 @@ void ByteCode::run(const ByteCodeFunction* f, float* args, float* outReturn, int
    }
 }

+void ByteCode::runStriped(const ByteCodeFunction* f, float* args[], int nargs, int N,
+                          const float* uniforms, int uniformCount) const {  // args[i] -> N floats
+#ifdef TRACE
+    disassemble(f);
+#endif
+    Interpreter::VValue stack[128];  // handed to innerRun; slots [0, nargs) carry the arguments
+
+    // Lane indices 0, 1, 2, ...: compared with w below to form the mask. Needs >= VecWidth entries.
+    static const Interpreter::I32 gLanes = {
+        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+    };
+    // Expected of callers: f returns nothing, and the argument and uniform counts match.
+    SkASSERT(f->fReturnCount == 0);
+    SkASSERT(nargs == f->fParameterCount);
+    SkASSERT(uniformCount == (int)fInputSlots.size());
+    Interpreter::VValue globals[32];  // uniforms land in the slots listed in fInputSlots
+    SkASSERT((int)SK_ARRAY_COUNT(globals) >= fGlobalCount);
+    for (uint8_t slot : fInputSlots) {
+        globals[slot].fFloat = *uniforms++;
+    }
+    // Work through the N lanes in chunks of at most VecWidth.
+    while (N) {
+        int w = std::min(N, Interpreter::VecWidth);  // lanes handled by this chunk
+
+        // Copy w floats from each argument stripe into stack slot i
+        for (int i = 0; i < nargs; ++i) {
+            memcpy(stack + i, args[i], w * sizeof(float));
+        }
+
+        auto mask = w > gLanes;  // set for lanes whose index is below w
+        innerRun(this, f, stack, nullptr, mask, globals);
+
+        // Copy the slots of out-parameters back into the caller's stripes
+        int slot = 0;
+        for (const auto& p : f->fParameters) {
+            if (p.fIsOutParameter) {
+                for (int i = slot; i < slot + p.fSlotCount; ++i) {
+                    memcpy(args[i], stack + i, w * sizeof(float));
+                }
+            }
+            slot += p.fSlotCount;
+        }
+
+        // Advance every stripe pointer past this chunk; this modifies the caller's args[] entries
+        for (int i = 0; i < nargs; ++i) {
+            args[i] += w;
+        }
+        N -= w;
+    }
+}
+
 } // namespace SkSL

 #endif
--- a/src/sksl/SkSLByteCode.h
+++ b/src/sksl/SkSLByteCode.h
@ -195,6 +195,9 @@ struct SK_API ByteCode {
     */
    void run(const ByteCodeFunction*, float* args, float* outReturn, int N,
             const float* uniforms, int uniformCount) const;
+    // Runs f over N lanes with striped arguments: args[i] points at the N floats of slot i.
+    void runStriped(const ByteCodeFunction*, float* args[], int nargs, int N,
+                    const float* uniforms, int uniformCount) const;  // args[] entries are advanced
 };

 }