start on raster pipeline 2d mode

- Add run_2d(x,y,w,h) and start_pipeline_2d().
  - Add and test a 2d-compatible store_8888_2d stage.

Change-Id: Ib9c225d1b8cb40471ae4333df1d06eec4d506f8a
Reviewed-on: https://skia-review.googlesource.com/24401
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Florin Malita <fmalita@chromium.org>
This commit is contained in:
Mike Klein 2017-07-18 11:30:25 -04:00 committed by Skia Commit-Bot
parent 135e446b9f
commit 3b92b6907a
8 changed files with 9581 additions and 7621 deletions

View File

@ -17,6 +17,7 @@
#include <vector>
struct SkJumper_constants;
struct SkJumper_Engine;
struct SkPM4f;
/**
@ -91,8 +92,8 @@ struct SkPM4f;
M(xy_to_2pt_conical_linear) \
M(mask_2pt_conical_degenerates) M(apply_vector_mask) \
M(byte_tables) M(byte_tables_rgb) \
M(rgb_to_hsl) \
M(hsl_to_rgb)
M(rgb_to_hsl) M(hsl_to_rgb) \
M(store_8888_2d)
class SkRasterPipeline {
public:
@ -120,6 +121,9 @@ public:
// Runs the pipeline walking x through [x,x+n).
void run(size_t x, size_t y, size_t n) const;
// Runs the pipeline in 2d from (x,y) inclusive to (x+w,y+h) exclusive.
void run_2d(size_t x, size_t y, size_t w, size_t h) const;
// Allocates a thunk which amortizes run() setup cost in alloc.
std::function<void(size_t, size_t, size_t)> compile() const;
@ -140,15 +144,13 @@ public:
bool empty() const { return fStages == nullptr; }
private:
using StartPipelineFn = void(size_t,size_t,size_t,void**,const SkJumper_constants*);
struct StageList {
StageList* prev;
StockStage stage;
void* ctx;
};
StartPipelineFn* build_pipeline(void**) const;
const SkJumper_Engine& build_pipeline(void**) const;
void unchecked_append(StockStage, void*);
SkArenaAlloc* fAlloc;

View File

@ -58,9 +58,10 @@ static const int kNumStages = SK_RASTER_PIPELINE_STAGES(M);
#endif
// We can't express the real types of most stage functions portably, so we use a stand-in.
// We'll only ever call start_pipeline(), which then chains into the rest for us.
using StageFn = void(void);
using StartPipelineFn = void(size_t,size_t,size_t,void**,K*);
// We'll only ever call start_pipeline() or start_pipeline_2d(), which then chain into the rest.
using StageFn = void(void);
using StartPipelineFn = void(size_t,size_t,size_t, void**,K*);
using StartPipeline2dFn = void(size_t,size_t,size_t,size_t, void**,K*);
// Some platforms expect C "name" maps to asm "_name", others to "name".
#if defined(__APPLE__)
@ -106,14 +107,16 @@ extern "C" {
// We'll just run portable code.
#elif defined(__aarch64__)
StartPipelineFn ASM(start_pipeline,aarch64);
StartPipelineFn ASM(start_pipeline ,aarch64);
StartPipeline2dFn ASM(start_pipeline_2d,aarch64);
StageFn ASM(just_return,aarch64);
#define M(st) StageFn ASM(st,aarch64);
SK_RASTER_PIPELINE_STAGES(M)
#undef M
#elif defined(__arm__)
StartPipelineFn ASM(start_pipeline,vfp4);
StartPipelineFn ASM(start_pipeline ,vfp4);
StartPipeline2dFn ASM(start_pipeline_2d,vfp4);
StageFn ASM(just_return,vfp4);
#define M(st) StageFn ASM(st,vfp4);
SK_RASTER_PIPELINE_STAGES(M)
@ -127,6 +130,13 @@ extern "C" {
ASM(start_pipeline,hsw_lowp ),
ASM(start_pipeline,ssse3_lowp);
StartPipeline2dFn ASM(start_pipeline_2d,hsw ),
ASM(start_pipeline_2d,avx ),
ASM(start_pipeline_2d,sse41 ),
ASM(start_pipeline_2d,sse2 ),
ASM(start_pipeline_2d,hsw_lowp ),
ASM(start_pipeline_2d,ssse3_lowp);
StageFn ASM(just_return,hsw),
ASM(just_return,avx),
ASM(just_return,sse41),
@ -156,7 +166,8 @@ extern "C" {
#elif (defined(__i386__) || defined(_M_IX86)) && \
!(defined(_MSC_VER) && defined(SK_SUPPORT_LEGACY_WIN32_JUMPER))
StartPipelineFn ASM(start_pipeline,sse2);
StartPipelineFn ASM(start_pipeline ,sse2);
StartPipeline2dFn ASM(start_pipeline_2d,sse2);
StageFn ASM(just_return,sse2);
#define M(st) StageFn ASM(st,sse2);
SK_RASTER_PIPELINE_STAGES(M)
@ -165,7 +176,8 @@ extern "C" {
#endif
// Portable, single-pixel stages.
StartPipelineFn sk_start_pipeline;
StartPipelineFn sk_start_pipeline;
StartPipeline2dFn sk_start_pipeline_2d;
StageFn sk_just_return;
#define M(st) StageFn sk_##st;
SK_RASTER_PIPELINE_STAGES(M)
@ -192,9 +204,10 @@ extern "C" {
// Engines comprise everything we need to run SkRasterPipelines.
struct SkJumper_Engine {
StageFn* stages[kNumStages];
StartPipelineFn* start_pipeline;
StageFn* just_return;
StageFn* stages[kNumStages];
StartPipelineFn* start_pipeline;
StartPipeline2dFn* start_pipeline_2d;
StageFn* just_return;
};
// We'll default to this portable engine, but try to choose a better one at runtime.
@ -203,6 +216,7 @@ static const SkJumper_Engine kPortable = {
{ SK_RASTER_PIPELINE_STAGES(M) },
#undef M
sk_start_pipeline,
sk_start_pipeline_2d,
sk_just_return,
};
static SkJumper_Engine gEngine = kPortable;
@ -216,7 +230,9 @@ static SkJumper_Engine choose_engine() {
return {
#define M(stage) ASM(stage, aarch64),
{ SK_RASTER_PIPELINE_STAGES(M) },
M(start_pipeline) M(just_return)
M(start_pipeline)
M(start_pipeline_2d)
M(just_return)
#undef M
};
@ -225,7 +241,9 @@ static SkJumper_Engine choose_engine() {
return {
#define M(stage) ASM(stage, vfp4),
{ SK_RASTER_PIPELINE_STAGES(M) },
M(start_pipeline) M(just_return)
M(start_pipeline)
M(start_pipeline_2d)
M(just_return)
#undef M
};
}
@ -235,7 +253,9 @@ static SkJumper_Engine choose_engine() {
return {
#define M(stage) ASM(stage, hsw),
{ SK_RASTER_PIPELINE_STAGES(M) },
M(start_pipeline) M(just_return)
M(start_pipeline)
M(start_pipeline_2d)
M(just_return)
#undef M
};
}
@ -243,7 +263,9 @@ static SkJumper_Engine choose_engine() {
return {
#define M(stage) ASM(stage, avx),
{ SK_RASTER_PIPELINE_STAGES(M) },
M(start_pipeline) M(just_return)
M(start_pipeline)
M(start_pipeline_2d)
M(just_return)
#undef M
};
}
@ -251,7 +273,9 @@ static SkJumper_Engine choose_engine() {
return {
#define M(stage) ASM(stage, sse41),
{ SK_RASTER_PIPELINE_STAGES(M) },
M(start_pipeline) M(just_return)
M(start_pipeline)
M(start_pipeline_2d)
M(just_return)
#undef M
};
}
@ -259,7 +283,9 @@ static SkJumper_Engine choose_engine() {
return {
#define M(stage) ASM(stage, sse2),
{ SK_RASTER_PIPELINE_STAGES(M) },
M(start_pipeline) M(just_return)
M(start_pipeline)
M(start_pipeline_2d)
M(just_return)
#undef M
};
}
@ -270,7 +296,9 @@ static SkJumper_Engine choose_engine() {
return {
#define M(stage) ASM(stage, sse2),
{ SK_RASTER_PIPELINE_STAGES(M) },
M(start_pipeline) M(just_return)
M(start_pipeline)
M(start_pipeline_2d)
M(just_return)
#undef M
};
}
@ -286,6 +314,7 @@ static SkJumper_Engine choose_engine() {
#undef M
nullptr,
nullptr,
nullptr,
};
static SkJumper_Engine gLowp = kNone;
static SkOnce gChooseLowpOnce;
@ -296,8 +325,9 @@ static SkJumper_Engine choose_engine() {
return {
#define M(st) hsw_lowp<SkRasterPipeline::st>(),
{ SK_RASTER_PIPELINE_STAGES(M) },
ASM(start_pipeline,hsw_lowp),
ASM(just_return,hsw_lowp)
ASM(start_pipeline ,hsw_lowp),
ASM(start_pipeline_2d,hsw_lowp),
ASM(just_return ,hsw_lowp)
#undef M
};
}
@ -305,8 +335,9 @@ static SkJumper_Engine choose_engine() {
return {
#define M(st) ssse3_lowp<SkRasterPipeline::st>(),
{ SK_RASTER_PIPELINE_STAGES(M) },
ASM(start_pipeline,ssse3_lowp),
ASM(just_return,ssse3_lowp)
ASM(start_pipeline ,ssse3_lowp),
ASM(start_pipeline_2d,ssse3_lowp),
ASM(just_return ,ssse3_lowp)
#undef M
};
}
@ -315,7 +346,7 @@ static SkJumper_Engine choose_engine() {
}
#endif
StartPipelineFn* SkRasterPipeline::build_pipeline(void** ip) const {
const SkJumper_Engine& SkRasterPipeline::build_pipeline(void** ip) const {
#ifndef SK_DISABLE_SSSE3_RUNTIME_CHECK_FOR_LOWP_STAGES
gChooseLowpOnce([]{ gLowp = choose_lowp(); });
@ -338,7 +369,7 @@ StartPipelineFn* SkRasterPipeline::build_pipeline(void** ip) const {
}
}
if (ip != reset_point) {
return gLowp.start_pipeline;
return gLowp;
}
#endif
@ -353,7 +384,7 @@ StartPipelineFn* SkRasterPipeline::build_pipeline(void** ip) const {
}
*--ip = (void*)gEngine.stages[st->stage];
}
return gEngine.start_pipeline;
return gEngine;
}
void SkRasterPipeline::run(size_t x, size_t y, size_t n) const {
@ -364,8 +395,8 @@ void SkRasterPipeline::run(size_t x, size_t y, size_t n) const {
// Best to not use fAlloc here... we can't bound how often run() will be called.
SkAutoSTMalloc<64, void*> program(fSlotsNeeded);
auto start_pipeline = this->build_pipeline(program.get() + fSlotsNeeded);
start_pipeline(x,y,x+n, program.get(), &kConstants);
const SkJumper_Engine& engine = this->build_pipeline(program.get() + fSlotsNeeded);
engine.start_pipeline(x,y,x+n, program.get(), &kConstants);
}
std::function<void(size_t, size_t, size_t)> SkRasterPipeline::compile() const {
@ -374,9 +405,22 @@ std::function<void(size_t, size_t, size_t)> SkRasterPipeline::compile() const {
}
void** program = fAlloc->makeArray<void*>(fSlotsNeeded);
auto start_pipeline = this->build_pipeline(program + fSlotsNeeded);
const SkJumper_Engine& engine = this->build_pipeline(program + fSlotsNeeded);
auto start_pipeline = engine.start_pipeline;
return [=](size_t x, size_t y, size_t n) {
start_pipeline(x,y,x+n, program, &kConstants);
};
}
// Runs the pipeline over the 2d region from (x,y) inclusive to (x+w,y+h) exclusive,
// dispatching through the engine's start_pipeline_2d entry point.
void SkRasterPipeline::run_2d(size_t x, size_t y, size_t w, size_t h) const {
if (this->empty()) {
return;  // Nothing appended; nothing to do.
}
// Like in run(), it's best to not use fAlloc here... we can't bound how often we'll be called.
SkAutoSTMalloc<64, void*> program(fSlotsNeeded);
// build_pipeline() fills `program` back-to-front and picks the best engine for these stages.
const SkJumper_Engine& engine = this->build_pipeline(program.get() + fSlotsNeeded);
engine.start_pipeline_2d(x,y,x+w,y+h, program.get(), &kConstants);
}

View File

@ -117,4 +117,9 @@ struct SkJumper_2PtConicalCtx {
fDR;
};
// Context for 2d-aware stages (e.g. store_8888_2d): a base pointer plus a row stride.
// The stride is applied after the pointer is cast to the stage's pixel type, so it is
// measured in pixels (elements), not bytes.
struct SkJumper_PtrStride {
void* ptr;      // Base address of the destination buffer (pixel at x=0, y=0).
size_t stride;  // Pixels per row.
};
#endif//SkJumper_DEFINED

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -68,6 +68,22 @@ extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t limit, void** pr
}
}
#if defined(JUMPER) && defined(__AVX__)
// We really want to make sure all paths go through this function's (implicit) vzeroupper.
// If they don't, we'll experience severe slowdowns when we first use SSE instructions again.
__attribute__((disable_tail_calls))
#endif
#if defined(JUMPER)
__attribute__((flatten)) // Force-inline the call to start_pipeline().
#endif
MAYBE_MSABI
// 2d entry point: runs the 1d pipeline once per row for y in [y, ylimit),
// each row covering x in [x, xlimit).
extern "C" void WRAP(start_pipeline_2d)(size_t x, size_t y, size_t xlimit, size_t ylimit,
void** program, K* k) {
for (; y < ylimit; y++) {
WRAP(start_pipeline)(x,y,xlimit, program, k);
}
}
#define STAGE(name) \
SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail, \
F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
@ -910,6 +926,17 @@ STAGE(store_8888) {
store(ptr, px, tail);
}
// 2d-aware store: like store_8888, but the destination address is computed from both
// x and y using a SkJumper_PtrStride context instead of a plain pointer.
STAGE(store_8888_2d) {
auto c = (const SkJumper_PtrStride*)ctx;
// stride is in pixels: cast to uint32_t* first, then offset by whole rows plus x.
auto ptr = (uint32_t*)c->ptr + y*c->stride + x;
// Pack [0,1] float channels into 8-bit RGBA, little-endian byte order R,G,B,A.
U32 px = round(r, 255.0f)
| round(g, 255.0f) << 8
| round(b, 255.0f) << 16
| round(a, 255.0f) << 24;
store(ptr, px, tail);
}
STAGE(load_bgra) {
auto ptr = *(const uint32_t**)ctx + x;
from_8888(load<U32>(ptr, tail), &b,&g,&r,&a);

View File

@ -81,6 +81,21 @@ extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t limit, void** pr
start(k,program,x,y,tail, v,v,v,v, v,v,v,v);
}
}
#if defined(__AVX__)
// We really want to make sure all paths go through this function's (implicit) vzeroupper.
// If they don't, we'll experience severe slowdowns when we first use SSE instructions again.
__attribute__((disable_tail_calls))
#endif
__attribute__((flatten)) // Force-inline the call to start_pipeline().
MAYBE_MSABI
// 2d entry point: runs the 1d pipeline once per row for y in [y, ylimit),
// each row covering x in [x, xlimit).
extern "C" void WRAP(start_pipeline_2d)(size_t x, size_t y, size_t xlimit, size_t ylimit,
void** program, K* k) {
for (; y < ylimit; y++) {
WRAP(start_pipeline)(x,y,xlimit, program, k);
}
}
extern "C" void WRAP(just_return)(K*, void**, size_t,size_t,size_t, F,F,F,F, F,F,F,F) {}
#define STAGE(name) \

View File

@ -8,6 +8,7 @@
#include "Test.h"
#include "SkHalf.h"
#include "SkRasterPipeline.h"
#include "../src/jumper/SkJumper.h"
DEF_TEST(SkRasterPipeline, r) {
// Build and run a simple pipeline to exercise SkRasterPipeline,
@ -230,3 +231,33 @@ DEF_TEST(SkRasterPipeline_lowp, r) {
}
}
}
// Exercises run_2d() + store_8888_2d over a 2x2 buffer: after the matrix below,
// each pixel's red channel encodes its x coordinate and green encodes its y.
DEF_TEST(SkRasterPipeline_2d, r) {
uint32_t rgba[2*2] = {0,0,0,0};
SkSTArenaAlloc<256> alloc;
SkRasterPipeline p(&alloc);
// Splat out the (2d) dst coordinates: (0.5,0.5), (1.5,0.5), (0.5,1.5), (1.5,1.5).
p.append(SkRasterPipeline::seed_shader);
// Scale down to [0,1] range to write out as bytes.
// (x,y) -> ((x-0.5)*0.5, (y-0.5)*0.5), i.e. 0.0 or 0.5 per axis -> bytes 0 or 128.
p.append_matrix(&alloc, SkMatrix::Concat(SkMatrix::MakeScale(0.5f),
SkMatrix::MakeTrans(-0.5f, -0.5f)));
// Write out to rgba, with row stride = 2 pixels.
SkJumper_PtrStride ctx = { rgba, 2 };
p.append(SkRasterPipeline::store_8888_2d, &ctx);
p.run_2d(0,0, 2,2);
// Red channel (bits 0-7) should track x: 0,128 on each row.
REPORTER_ASSERT(r, ((rgba[0] >> 0) & 0xff) == 0);
REPORTER_ASSERT(r, ((rgba[1] >> 0) & 0xff) == 128);
REPORTER_ASSERT(r, ((rgba[2] >> 0) & 0xff) == 0);
REPORTER_ASSERT(r, ((rgba[3] >> 0) & 0xff) == 128);
// Green channel (bits 8-15) should track y: 0 on the top row, 128 on the bottom.
REPORTER_ASSERT(r, ((rgba[0] >> 8) & 0xff) == 0);
REPORTER_ASSERT(r, ((rgba[1] >> 8) & 0xff) == 0);
REPORTER_ASSERT(r, ((rgba[2] >> 8) & 0xff) == 128);
REPORTER_ASSERT(r, ((rgba[3] >> 8) & 0xff) == 128);
}