add run_pipeline_obs
This is the simplest, one-big-switch version of running a pipeline.

This "new" approach can't support injectable stages like the other style can,
but in return we get more registers to work with (often many more), and more
code is visible at once for the compiler to optimize together. skcms got both
smaller and faster when we did this there.

It seems easiest to stage this with both new and old approaches existing in
parallel, and then remove the old code when things are working.

Cq-Include-Trybots: luci.chromium.try:linux-blink-rel
Change-Id: I34fbfd7ce46fad2a9a34d3aa446fbd84c3f14f8a
Reviewed-on: https://skia-review.googlesource.com/c/179220
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
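For readers skimming the diff, here is a minimal sketch of the difference between the two execution styles the message describes. It is illustrative only, not Skia's actual code: the State struct, stage names, and function names are made up, and the real stages operate on SIMD vectors rather than one pixel.

    // Hypothetical 4-channel pixel state; real stages carry SIMD vectors.
    struct State { float r, g, b, a; };

    // Old style: every stage is its own function, reached through a pointer.
    // Values must pass through the calling convention at each hop, and the
    // compiler cannot optimize across stage boundaries.
    using StageFn = void (*)(State&, void*);
    void run_by_pointers(const StageFn* fns, void* const* ctx, int n, State& st) {
        for (int i = 0; i < n; i++) {
            fns[i](st, ctx[i]);
        }
    }

    // New style: one big switch over an enum of stock stages. The whole
    // pipeline body is visible at once, so state can stay in whatever
    // registers the compiler likes, and adjacent stages optimize together.
    enum class Stage { swap_rb, premul };
    void run_by_switch(const Stage* stages, int n, State& st) {
        for (int i = 0; i < n; i++) {
            switch (stages[i]) {
                case Stage::swap_rb: { float t = st.r; st.r = st.b; st.b = t; } break;
                case Stage::premul:  st.r *= st.a; st.g *= st.a; st.b *= st.a; break;
            }
        }
    }

The trade-off named in the message follows directly: the switch can only dispatch stages that exist in the enum at compile time, so arbitrary injected function pointers have no place in it.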
@@ -75,7 +75,6 @@ namespace SkOpts {
    DEFINE_DEFAULT(hash_fn);

    DEFINE_DEFAULT(S32_alpha_D32_filter_DX);

#undef DEFINE_DEFAULT

#define M(st) (StageFn)SK_OPTS_NS::st,
@@ -92,6 +91,15 @@ namespace SkOpts {
        = SK_OPTS_NS::lowp::start_pipeline;
#undef M

+    void (*run_pipeline_obs)(size_t,size_t, size_t,size_t,
+                             const SkRasterPipeline::StockStage*, int, void**)
+        = SK_OPTS_NS::run_pipeline_obs;
+    void (*run_pipeline_obs_lowp)(size_t,size_t, size_t,size_t,
+                                  const SkRasterPipeline::StockStage*, int, void**)
+        = SK_OPTS_NS::lowp::run_pipeline_obs;
+    bool (*can_run_pipeline_obs_lowp)(const SkRasterPipeline::StockStage*, int)
+        = SK_OPTS_NS::lowp::can_run_pipeline_obs;
+
    // Each Init_foo() is defined in src/opts/SkOpts_foo.cpp.
    void Init_ssse3();
    void Init_sse41();
@@ -67,6 +67,12 @@ namespace SkOpts {

    extern void (*start_pipeline_highp)(size_t,size_t,size_t,size_t, void**);
    extern void (*start_pipeline_lowp )(size_t,size_t,size_t,size_t, void**);
+
+    extern void (*run_pipeline_obs)(size_t,size_t, size_t,size_t,
+                                    const SkRasterPipeline::StockStage*, int, void**);
+    extern void (*run_pipeline_obs_lowp)(size_t,size_t, size_t,size_t,
+                                         const SkRasterPipeline::StockStage*, int, void**);
+    extern bool (*can_run_pipeline_obs_lowp)(const SkRasterPipeline::StockStage*, int);
#undef M
}

@@ -16,6 +16,10 @@ void SkRasterPipeline::reset() {
    fStages      = nullptr;
    fNumStages   = 0;
    fSlotsNeeded = 1;  // We always need one extra slot for just_return().
+
+    fStockStages.rewind();
+    fCtxPointers.rewind();
+    fCanUseRunProgramObs = false;  // flip to true to experiment with this feature
}

void SkRasterPipeline::append(StockStage stage, void* ctx) {
@@ -30,11 +34,20 @@ void SkRasterPipeline::unchecked_append(StockStage stage, void* ctx) {
    fStages = fAlloc->make<StageList>( StageList{fStages, (uint64_t) stage, ctx, false} );
    fNumStages   += 1;
    fSlotsNeeded += ctx ? 2 : 1;
+
+    if (fCanUseRunProgramObs) {
+        fStockStages.push_back(stage);
+        if (ctx) {
+            fCtxPointers.push_back(ctx);
+        }
+    }
}
void SkRasterPipeline::append(void* fn, void* ctx) {
    fStages = fAlloc->make<StageList>( StageList{fStages, (uint64_t) fn, ctx, true} );
    fNumStages   += 1;
    fSlotsNeeded += ctx ? 2 : 1;
+
+    fCanUseRunProgramObs = false;
}

void SkRasterPipeline::extend(const SkRasterPipeline& src) {
@@ -56,41 +69,24 @@ void SkRasterPipeline::extend(const SkRasterPipeline& src) {
    fStages       = &stages[src.fNumStages - 1];
    fNumStages   += src.fNumStages;
    fSlotsNeeded += src.fSlotsNeeded - 1;  // Don't double count just_returns().
+
+    fStockStages.append(src.fStockStages.count(), src.fStockStages.begin());
+    fCtxPointers.append(src.fCtxPointers.count(), src.fCtxPointers.begin());
}

void SkRasterPipeline::dump() const {
    SkDebugf("SkRasterPipeline, %d stages\n", fNumStages);
-    std::vector<const char*> stages;
-    for (auto st = fStages; st; st = st->prev) {
-        const char* name = "";
-        switch (st->stage) {
-        #define M(x) case x: name = #x; break;
+    for (auto st : fStockStages) {
+        switch (st) {
+        #define M(st) case st: SkDebugf("\t%s\n", #st); break;
            SK_RASTER_PIPELINE_STAGES(M)
        #undef M
        }
-        stages.push_back(name);
    }
-    std::reverse(stages.begin(), stages.end());
-    for (const char* name : stages) {
-        SkDebugf("\t%s\n", name);
-    }
    SkDebugf("\n");
}

-//#define TRACK_COLOR_HISTOGRAM
-#ifdef TRACK_COLOR_HISTOGRAM
-    static int gBlack;
-    static int gWhite;
-    static int gColor;
-    #define INC_BLACK   gBlack++
-    #define INC_WHITE   gWhite++
-    #define INC_COLOR   gColor++
-#else
-    #define INC_BLACK
-    #define INC_WHITE
-    #define INC_COLOR
-#endif

void SkRasterPipeline::append_set_rgb(SkArenaAlloc* alloc, const float rgb[3]) {
    auto arg = alloc->makeArrayDefault<float>(3);
    arg[0] = rgb[0];
@@ -114,10 +110,8 @@ void SkRasterPipeline::append_constant_color(SkArenaAlloc* alloc, const float rg

    if (rgba[0] == 0 && rgba[1] == 0 && rgba[2] == 0 && rgba[3] == 1) {
        this->append(black_color);
-        INC_BLACK;
    } else if (rgba[0] == 1 && rgba[1] == 1 && rgba[2] == 1 && rgba[3] == 1) {
        this->append(white_color);
-        INC_WHITE;
    } else {
        auto ctx = alloc->make<SkRasterPipeline_UniformColorCtx>();
        Sk4f color = Sk4f::Load(rgba);
@@ -138,32 +132,11 @@ void SkRasterPipeline::append_constant_color(SkArenaAlloc* alloc, const float rg
        } else {
            this->unchecked_append(unbounded_uniform_color, ctx);
        }
-
-        INC_COLOR;
    }
-
-#ifdef TRACK_COLOR_HISTOGRAM
-    SkDebugf("B=%d W=%d C=%d\n", gBlack, gWhite, gColor);
-#endif
}

-#undef INC_BLACK
-#undef INC_WHITE
-#undef INC_COLOR
-
-//static int gCounts[5] = { 0, 0, 0, 0, 0 };
-
void SkRasterPipeline::append_matrix(SkArenaAlloc* alloc, const SkMatrix& matrix) {
    SkMatrix::TypeMask mt = matrix.getType();
-#if 0
-    if (mt > 4) mt = 4;
-    gCounts[mt] += 1;
-    SkDebugf("matrices: %d %d %d %d %d\n",
-             gCounts[0], gCounts[1], gCounts[2], gCounts[3], gCounts[4]);
-#endif
-
    // Based on a histogram of skps, we determined the following special cases were common, more
    // or fewer can be used if client behaviors change.

    if (mt == SkMatrix::kIdentity_Mask) {
        return;
@@ -333,6 +306,22 @@ void SkRasterPipeline::run(size_t x, size_t y, size_t w, size_t h) const {
        return;
    }

+    if (fCanUseRunProgramObs) {
+        const auto& stages = fStockStages;
+        const auto& ctx    = fCtxPointers;
+
+        if (SkOpts::can_run_pipeline_obs_lowp(stages.begin(), stages.count())) {
+            SkOpts::run_pipeline_obs_lowp(x,y, x+w,y+h,
+                                          stages.begin(), stages.count(),
+                                          (void**)ctx.begin());
+        } else {
+            SkOpts::run_pipeline_obs(x,y, x+w,y+h,
+                                     stages.begin(), stages.count(),
+                                     (void**)ctx.begin());
+        }
+        return;
+    }
+
    // Best to not use fAlloc here... we can't bound how often run() will be called.
    SkAutoSTMalloc<64, void*> program(fSlotsNeeded);
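From the caller's side nothing changes; run() picks the execution style internally. A hypothetical usage sketch follows: the buffer handling and function name are made up, and the { pixels, stride } aggregate layout of SkRasterPipeline_MemoryCtx is an assumption about Skia of this era. Only stock stages are appended here, which is what keeps the pipeline eligible for the new path once the experiment flag is flipped on.

    #include <cstdint>
    #include "SkArenaAlloc.h"
    #include "SkRasterPipeline.h"

    // Swap the red and blue channels of an 8888 buffer in place.
    void swap_rb_in_place(uint32_t* px, int w, int h) {
        SkSTArenaAlloc<256> alloc;
        SkRasterPipeline p(&alloc);

        SkRasterPipeline_MemoryCtx buf = { px, w };  // assumed {pixels, stride} layout

        p.append(SkRasterPipeline::load_8888,  &buf);  // stock stages only...
        p.append(SkRasterPipeline::swap_rb);
        p.append(SkRasterPipeline::store_8888, &buf);

        // ...so run() may take the run_pipeline_obs path; appending any raw
        // function pointer would have forced the old void** program path.
        p.run(0,0, w,h);
    }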
@@ -345,6 +334,23 @@ std::function<void(size_t, size_t, size_t, size_t)> SkRasterPipeline::compile()
        return [](size_t, size_t, size_t, size_t) {};
    }

+    if (fCanUseRunProgramObs) {
+        const auto& stages = fStockStages;
+        const auto& ctx    = fCtxPointers;
+
+        if (SkOpts::can_run_pipeline_obs_lowp(stages.begin(), stages.count())) {
+            return [=](size_t x, size_t y, size_t w, size_t h) {
+                SkOpts::run_pipeline_obs_lowp(x,y, x+w,y+h,
+                                              stages.begin(), stages.count(), (void**)ctx.begin());
+            };
+        } else {
+            return [=](size_t x, size_t y, size_t w, size_t h) {
+                SkOpts::run_pipeline_obs(x,y, x+w,y+h,
+                                         stages.begin(), stages.count(), (void**)ctx.begin());
+            };
+        }
+    }
+
    void** program = fAlloc->makeArray<void*>(fSlotsNeeded);

    auto start_pipeline = this->build_pipeline(program + fSlotsNeeded);
@@ -12,10 +12,11 @@
#include "SkColor.h"
#include "SkImageInfo.h"
#include "SkNx.h"
-#include "SkTArray.h"
+#include "SkTArray.h"  // TODO: unused
+#include "SkTDArray.h"
#include "SkTypes.h"
#include <functional>
-#include <vector>
+#include <vector>  // TODO: unused

/**
 * SkRasterPipeline provides a cheap way to chain together a pixel processing pipeline.
@@ -254,10 +255,16 @@ private:

    void unchecked_append(StockStage, void*);

+    // Used by old single-program void** style execution.
    SkArenaAlloc* fAlloc;
    StageList*    fStages;
    int           fNumStages;
    int           fSlotsNeeded;
+
+    // Passed directly to SkOpts::run_program_obs().
+    SkTDArray<StockStage> fStockStages;
+    SkTDArray<void*>      fCtxPointers;
+    bool                  fCanUseRunProgramObs;
};

template <size_t bytes>
@@ -17,6 +17,10 @@ namespace SkOpts {
        memset32 = SK_OPTS_NS::memset32;
        memset64 = SK_OPTS_NS::memset64;

+        run_pipeline_obs          = SK_OPTS_NS::run_pipeline_obs;
+        run_pipeline_obs_lowp     = SK_OPTS_NS::lowp::run_pipeline_obs;
+        can_run_pipeline_obs_lowp = SK_OPTS_NS::lowp::can_run_pipeline_obs;
+
#define M(st) stages_highp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::st;
        SK_RASTER_PIPELINE_STAGES(M)
        just_return_highp = (StageFn)SK_OPTS_NS::just_return;
@@ -13,6 +13,10 @@

namespace SkOpts {
    void Init_hsw() {
+        run_pipeline_obs          = SK_OPTS_NS::run_pipeline_obs;
+        run_pipeline_obs_lowp     = SK_OPTS_NS::lowp::run_pipeline_obs;
+        can_run_pipeline_obs_lowp = SK_OPTS_NS::lowp::can_run_pipeline_obs;
+
#define M(st) stages_highp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::st;
        SK_RASTER_PIPELINE_STAGES(M)
        just_return_highp = (StageFn)SK_OPTS_NS::just_return;
@@ -15,6 +15,10 @@ namespace SkOpts {
    void Init_sse41() {
        blit_row_s32a_opaque = sse41::blit_row_s32a_opaque;

+        run_pipeline_obs          = SK_OPTS_NS::run_pipeline_obs;
+        run_pipeline_obs_lowp     = SK_OPTS_NS::lowp::run_pipeline_obs;
+        can_run_pipeline_obs_lowp = SK_OPTS_NS::lowp::can_run_pipeline_obs;
+
#define M(st) stages_highp[SkRasterPipeline::st] = (StageFn)SK_OPTS_NS::st;
        SK_RASTER_PIPELINE_STAGES(M)
        just_return_highp = (StageFn)SK_OPTS_NS::just_return;
@@ -2240,6 +2240,36 @@ STAGE(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
    }
}

+static void run_pipeline_chunk(size_t x, size_t y, size_t tail,
+                               const SkRasterPipeline::StockStage* stages, int nstages,
+                               void** ctx) {
+    F r = 0, g = 0, b = 0, a = 0,
+     dr = 0, dg = 0, db = 0, da = 0;
+    for (int i = 0; i < nstages; i++) {
+        switch (stages[i]) {
+        #define CASE(st) \
+            case SkRasterPipeline::st: st##_k(Ctx{ctx}, x,y,tail, r,g,b,a, dr,dg,db,da); break;
+            SK_RASTER_PIPELINE_STAGES(CASE)
+        #undef CASE
+        }
+    }
+}
+
+static void run_pipeline_obs(size_t x0, size_t y0,
+                             size_t x1, size_t y1,
+                             const SkRasterPipeline::StockStage* stages, int nstages,
+                             void** ctx) {
+    for (size_t y = y0; y < y1; y++) {
+        size_t x = x0;
+        for (; x + N <= x1; x += N) {
+            run_pipeline_chunk(x,y, 0, stages,nstages,ctx);
+        }
+        if (size_t tail = x1 - x) {
+            run_pipeline_chunk(x,y, tail, stages,nstages,ctx);
+        }
+    }
+}

namespace lowp {
#if defined(JUMPER_IS_SCALAR) || defined(SK_DISABLE_LOWP_RASTER_PIPELINE)
    // If we're not compiled by Clang, or otherwise switched into scalar mode (old Clang, manually),
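run_pipeline_obs above is the usual SIMD chunk-and-tail driver: march N pixels at a time across each row, then pass the leftover count as `tail` so the same body can handle its last partial chunk. A self-contained scalar illustration of the pattern (the names and the halving operation are mine, not Skia's):

    #include <cstddef>

    constexpr size_t N = 8;  // stand-in for the SIMD vector width

    // When tail == 0 the chunk is a full N lanes wide; otherwise only the
    // first `tail` lanes are valid, mirroring how the stages read it.
    static void process_chunk(float* row, size_t x, size_t tail) {
        size_t lanes = tail ? tail : N;
        for (size_t i = 0; i < lanes; i++) {
            row[x + i] *= 0.5f;  // stand-in for the real pipeline's work
        }
    }

    static void process_span(float* row, size_t x0, size_t x1) {
        size_t x = x0;
        for (; x + N <= x1; x += N) {
            process_chunk(row, x, 0);     // full chunks
        }
        if (size_t tail = x1 - x) {
            process_chunk(row, x, tail);  // final partial chunk, 1..N-1 lanes
        }
    }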
@@ -2252,6 +2282,10 @@ namespace lowp {

    static void start_pipeline(size_t,size_t,size_t,size_t, void**) {}

+    static bool can_run_pipeline_obs(const SkRasterPipeline::StockStage*, int) { return false; }
+    static void run_pipeline_obs(size_t,size_t, size_t,size_t,
+                                 const SkRasterPipeline::StockStage*, int, void**) {}
+
#else  // We are compiling vector code with Clang... let's make some lowp stages!

#if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
@@ -3270,45 +3304,127 @@ STAGE_PP(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) {
    a = a + div255( da*inv(a) );
    store_8888_(ptr, tail, r,g,b,a);
}

    // Now we'll add null stand-ins for stages we haven't implemented in lowp.
    // If a pipeline uses these stages, it'll boot it out of lowp into highp.
-    using NotImplemented = void(*)(void);
-    static NotImplemented
-        callback, load_rgba, store_rgba,
-        unbounded_set_rgb, unbounded_uniform_color,
-        unpremul, dither,
-        from_srgb, from_srgb_dst, to_srgb,
-        load_f16 , load_f16_dst , store_f16 , gather_f16,
-        load_f32 , load_f32_dst , store_f32 , gather_f32,
-        load_1010102, load_1010102_dst, store_1010102, gather_1010102,
-        store_u16_be,
-        byte_tables,
-        colorburn, colordodge, softlight, hue, saturation, color, luminosity,
-        matrix_3x3, matrix_3x4, matrix_4x5, matrix_4x3,
-        parametric, gamma,
-        rgb_to_hsl, hsl_to_rgb,
-        gauss_a_to_rgba,
-        mirror_x, repeat_x,
-        mirror_y, repeat_y,
-        negate_x,
-        bilerp_clamp_8888,
-        bilinear_nx, bilinear_ny, bilinear_px, bilinear_py,
-        bicubic_n3x, bicubic_n1x, bicubic_p1x, bicubic_p3x,
-        bicubic_n3y, bicubic_n1y, bicubic_p1y, bicubic_p3y,
-        save_xy, accumulate,
-        xy_to_2pt_conical_well_behaved,
-        xy_to_2pt_conical_strip,
-        xy_to_2pt_conical_focal_on_circle,
-        xy_to_2pt_conical_smaller,
-        xy_to_2pt_conical_greater,
-        xy_to_2pt_conical_compensate_focal,
-        alter_2pt_conical_compensate_focal,
-        alter_2pt_conical_unswap,
-        mask_2pt_conical_nan,
-        mask_2pt_conical_degenerates,
-        apply_vector_mask;
+    #define NOT_IMPLEMENTED(st) \
+        static void (*st)(void) = nullptr; \
+        static void st##_k(Ctx::None, size_t,size_t,size_t, F,F, U16,U16,U16,U16, U16,U16,U16,U16) {}
+    NOT_IMPLEMENTED(callback)
+    NOT_IMPLEMENTED(load_rgba)
+    NOT_IMPLEMENTED(store_rgba)
+    NOT_IMPLEMENTED(unbounded_set_rgb)
+    NOT_IMPLEMENTED(unbounded_uniform_color)
+    NOT_IMPLEMENTED(unpremul)
+    NOT_IMPLEMENTED(dither)
+    NOT_IMPLEMENTED(from_srgb)
+    NOT_IMPLEMENTED(to_srgb)
+    NOT_IMPLEMENTED(load_f16)
+    NOT_IMPLEMENTED(load_f16_dst)
+    NOT_IMPLEMENTED(store_f16)
+    NOT_IMPLEMENTED(gather_f16)
+    NOT_IMPLEMENTED(load_f32)
+    NOT_IMPLEMENTED(load_f32_dst)
+    NOT_IMPLEMENTED(store_f32)
+    NOT_IMPLEMENTED(gather_f32)
+    NOT_IMPLEMENTED(load_1010102)
+    NOT_IMPLEMENTED(load_1010102_dst)
+    NOT_IMPLEMENTED(store_1010102)
+    NOT_IMPLEMENTED(gather_1010102)
+    NOT_IMPLEMENTED(store_u16_be)
+    NOT_IMPLEMENTED(byte_tables)
+    NOT_IMPLEMENTED(colorburn)
+    NOT_IMPLEMENTED(colordodge)
+    NOT_IMPLEMENTED(softlight)
+    NOT_IMPLEMENTED(hue)
+    NOT_IMPLEMENTED(saturation)
+    NOT_IMPLEMENTED(color)
+    NOT_IMPLEMENTED(luminosity)
+    NOT_IMPLEMENTED(matrix_3x3)
+    NOT_IMPLEMENTED(matrix_3x4)
+    NOT_IMPLEMENTED(matrix_4x5)
+    NOT_IMPLEMENTED(matrix_4x3)
+    NOT_IMPLEMENTED(parametric)
+    NOT_IMPLEMENTED(gamma)
+    NOT_IMPLEMENTED(rgb_to_hsl)
+    NOT_IMPLEMENTED(hsl_to_rgb)
+    NOT_IMPLEMENTED(gauss_a_to_rgba)
+    NOT_IMPLEMENTED(mirror_x)
+    NOT_IMPLEMENTED(repeat_x)
+    NOT_IMPLEMENTED(mirror_y)
+    NOT_IMPLEMENTED(repeat_y)
+    NOT_IMPLEMENTED(negate_x)
+    NOT_IMPLEMENTED(bilerp_clamp_8888)
+    NOT_IMPLEMENTED(bilinear_nx)
+    NOT_IMPLEMENTED(bilinear_ny)
+    NOT_IMPLEMENTED(bilinear_px)
+    NOT_IMPLEMENTED(bilinear_py)
+    NOT_IMPLEMENTED(bicubic_n3x)
+    NOT_IMPLEMENTED(bicubic_n1x)
+    NOT_IMPLEMENTED(bicubic_p1x)
+    NOT_IMPLEMENTED(bicubic_p3x)
+    NOT_IMPLEMENTED(bicubic_n3y)
+    NOT_IMPLEMENTED(bicubic_n1y)
+    NOT_IMPLEMENTED(bicubic_p1y)
+    NOT_IMPLEMENTED(bicubic_p3y)
+    NOT_IMPLEMENTED(save_xy)
+    NOT_IMPLEMENTED(accumulate)
+    NOT_IMPLEMENTED(xy_to_2pt_conical_well_behaved)
+    NOT_IMPLEMENTED(xy_to_2pt_conical_strip)
+    NOT_IMPLEMENTED(xy_to_2pt_conical_focal_on_circle)
+    NOT_IMPLEMENTED(xy_to_2pt_conical_smaller)
+    NOT_IMPLEMENTED(xy_to_2pt_conical_greater)
+    NOT_IMPLEMENTED(alter_2pt_conical_compensate_focal)
+    NOT_IMPLEMENTED(alter_2pt_conical_unswap)
+    NOT_IMPLEMENTED(mask_2pt_conical_nan)
+    NOT_IMPLEMENTED(mask_2pt_conical_degenerates)
+    NOT_IMPLEMENTED(apply_vector_mask)
+    #undef NOT_IMPLEMENTED
+
+    static bool can_run_pipeline_obs(const SkRasterPipeline::StockStage* stages, int nstages) {
+        for (int i = 0; i < nstages; i++) {
+            switch (stages[i]) {
+            #define CASE(st) case SkRasterPipeline::st: if (!st) { return false; }
+                SK_RASTER_PIPELINE_STAGES(CASE)
+            #undef CASE
+            }
+        }
+        return true;
+    }
+
+    static void run_pipeline_chunk(size_t dx, size_t dy, size_t tail,
+                                   const SkRasterPipeline::StockStage* stages, int nstages,
+                                   void** ctx) {
+        F x = 0,
+          y = 0;
+        U16 r = 0, g = 0, b = 0, a = 0,
+           dr = 0, dg = 0, db = 0, da = 0;
+        for (int i = 0; i < nstages; i++) {
+            switch (stages[i]) {
+            #define CASE(st) \
+                case SkRasterPipeline::st: st##_k(Ctx{ctx}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da); \
+                                           break;
+                SK_RASTER_PIPELINE_STAGES(CASE)
+            #undef CASE
+            }
+        }
+    }
+
+    static void run_pipeline_obs(size_t x0, size_t y0,
+                                 size_t x1, size_t y1,
+                                 const SkRasterPipeline::StockStage* stages, int nstages,
+                                 void** ctx) {
+        for (size_t y = y0; y < y1; y++) {
+            size_t x = x0;
+            for (; x + N <= x1; x += N) {
+                run_pipeline_chunk(x,y, 0, stages,nstages,ctx);
+            }
+            if (size_t tail = x1 - x) {
+                run_pipeline_chunk(x,y, tail, stages,nstages,ctx);
+            }
+        }
+    }

#endif//defined(JUMPER_IS_SCALAR) controlling whether we build lowp stages
}  // namespace lowp
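One detail worth calling out in the hunk above: the NOT_IMPLEMENTED macro gives every lowp-missing stage a null function pointer, so can_run_pipeline_obs can answer "can lowp handle this pipeline?" simply by scanning for nulls. A minimal sketch of that pattern, with made-up stage names rather than Skia's (and an explicit break in each case):

    // Each stage identifier doubles as a capability flag: null means "not
    // implemented in this (lowp) variant, fall back to the highp one".
    using Flag = void (*)();
    static void impl() {}

    static Flag load_8888 = &impl;    // implemented here
    static Flag softlight = nullptr;  // highp-only

    enum class Stage { load_8888, softlight };

    static bool can_run(const Stage* stages, int n) {
        for (int i = 0; i < n; i++) {
            switch (stages[i]) {
                case Stage::load_8888: if (!load_8888) { return false; } break;
                case Stage::softlight: if (!softlight) { return false; } break;
            }
        }
        return true;  // every requested stage exists in this variant
    }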