attempt 2: add experimental bilerp_clamp_8888 stage

It looks like we can specialize hot image shaders into their
own single stages for a good speedup on both x86 and ARM.

I've started here with bilerp_clamp_8888, and will
follow up with bgra and 565, and lowp versions of those,
and probably also the same for nearest neighbors.

All pixels are identical in GMs.
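
For reference, here is a minimal scalar sketch of the sampling such a fused
stage performs: one clamp/clamp bilinear lookup from an RGBA_8888 image, with
the four taps, the edge clamping, and the area weights all inlined. This is
hypothetical illustration code written for this description, not code from the
CL; the SIMD stage below computes the same thing across a vector of pixels.

    // Hypothetical scalar reference (not Skia code) for fused bilinear
    // sampling with clamp/clamp tiling from an RGBA_8888 image.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <initializer_list>

    struct Img { const uint32_t* pixels; int stride, width, height; };

    // Sample at pixel-space center (cx,cy); writes per-channel values in [0,1].
    static void bilerp_clamp_8888_ref(const Img& img, float cx, float cy, float rgba[4]) {
        float fx = cx + 0.5f - std::floor(cx + 0.5f),   // fract(cx + 0.5f)
              fy = cy + 0.5f - std::floor(cy + 0.5f);
        rgba[0] = rgba[1] = rgba[2] = rgba[3] = 0.0f;
        for (float dy : {-0.5f, +0.5f})
        for (float dx : {-0.5f, +0.5f}) {
            // Clamp each tap to the image bounds, like ix_and_ptr() does.
            int x = std::clamp((int)std::floor(cx + dx), 0, img.width  - 1),
                y = std::clamp((int)std::floor(cy + dy), 0, img.height - 1);
            uint32_t px   = img.pixels[y*img.stride + x];
            float    area = (dx > 0 ? fx : 1 - fx) * (dy > 0 ? fy : 1 - fy);
            for (int c = 0; c < 4; c++) {
                rgba[c] += ((px >> (8*c)) & 0xff) * (1/255.0f) * area;
            }
        }
    }

    int main() {
        const uint32_t px[] = { 0xff000000, 0xff0000ff,   // black, red
                                0xffff0000, 0xff00ff00 }; // blue,  green
        Img img = {px, 2, 2, 2};
        float rgba[4];
        bilerp_clamp_8888_ref(img, 1.0f, 1.0f, rgba);     // midway between all 4 centers
        std::printf("r=%.3f g=%.3f b=%.3f a=%.3f\n",      // 0.250 0.250 0.250 1.000
                    rgba[0], rgba[1], rgba[2], rgba[3]);
        return 0;
    }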

Change-Id: Ib5ed6e528efd9e3eed96ba67d02fbec2e8133a81
Reviewed-on: https://skia-review.googlesource.com/86860
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Author: Mike Klein <mtklein@chromium.org>, 2017-12-11 09:59:47 -05:00
Committed by: Skia Commit-Bot
commit 8a64e52a98, parent f226e66d75
7 changed files with 11151 additions and 9242 deletions


@@ -53,6 +53,7 @@ struct SkJumper_Engine;
     M(load_f32) M(load_f32_dst) M(store_f32) \
     M(load_8888) M(load_8888_dst) M(store_8888) M(gather_8888) \
     M(load_bgra) M(load_bgra_dst) M(store_bgra) M(gather_bgra) \
+    M(bilerp_clamp_8888) \
     M(load_u16_be) M(load_rgb_u16_be) M(store_u16_be) \
     M(load_tables_u16_be) M(load_tables_rgb_u16_be) M(load_tables) \
     M(load_rgba) M(store_rgba) \


@@ -219,6 +219,7 @@ extern "C" {
     NOPE(load_f32) NOPE(load_f32_dst) NOPE(store_f32)
     LOWP(load_8888) LOWP(load_8888_dst) LOWP(store_8888) LOWP(gather_8888)
     LOWP(load_bgra) LOWP(load_bgra_dst) LOWP(store_bgra) LOWP(gather_bgra)
+    TODO(bilerp_clamp_8888)
     TODO(load_u16_be) TODO(load_rgb_u16_be) TODO(store_u16_be)
     NOPE(load_tables_u16_be) NOPE(load_tables_rgb_u16_be) NOPE(load_tables)
     NOPE(load_rgba) NOPE(store_rgba)


@@ -48,10 +48,10 @@ struct SkJumper_MemoryCtx {
 };
 
 struct SkJumper_GatherCtx {
-    void* pixels;
-    int   stride;
-    float width,
-          height;
+    const void* pixels;
+    int         stride;
+    float       width;
+    float       height;
 };
 
 // State shared by save_xy, accumulate, and bilinear_* / bicubic_*.

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -194,13 +194,15 @@ SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, int dx, int dy) {
     return (T*)ctx->pixels + dy*ctx->stride + dx;
 }
 
+// clamp v to [0,limit).
+SI F clamp(F v, F limit) {
+    F inclusive = bit_cast<F>( bit_cast<U32>(limit) - 1 );  // Exclusive -> inclusive.
+    return min(max(0, v), inclusive);
+}
+
 // Used by gather_ stages to calculate the base pointer and a vector of indices to load.
 template <typename T>
 SI U32 ix_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, F x, F y) {
-    auto clamp = [](F v, F limit) {
-        limit = bit_cast<F>( bit_cast<U32>(limit) - 1 );  // Exclusive -> inclusive.
-        return min(max(0, v), limit);
-    };
     x = clamp(x, ctx->width);
     y = clamp(y, ctx->height);
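
The bit_cast<U32>(limit) - 1 line is the subtle part of the hoisted clamp():
for positive finite floats, IEEE-754 bit patterns order the same way as the
values they encode, so subtracting 1 from the bit pattern of limit yields the
largest representable float strictly below it. Clamping to that value and then
truncating can never produce an index equal to limit, with no extra compare.
A standalone demonstration of the trick (not Skia code; bit_cast replaced with
memcpy):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
        float limit = 8.0f;   // e.g. an image width, stored as float
        uint32_t bits;
        std::memcpy(&bits, &limit, sizeof(bits));
        bits -= 1;            // one ULP below 8.0f
        float inclusive;
        std::memcpy(&inclusive, &bits, sizeof(inclusive));
        // Prints "7.99999952 truncates to 7": even an input of exactly 8.0
        // clamps to a value that truncates to the last valid index.
        std::printf("%.9g truncates to %d\n", inclusive, (int)inclusive);
        return 0;
    }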
@@ -1521,3 +1523,47 @@ STAGE(gauss_a_to_rgba, Ctx::None) {
     g = a;
     b = a;
 }
+
+// A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling.
+STAGE(bilerp_clamp_8888, SkJumper_GatherCtx* ctx) {
+    // (cx,cy) are the center of our sample.
+    F cx = r,
+      cy = g;
+
+    // All sample points are at the same fractional offset (fx,fy).
+    // They're the 4 corners of a logical 1x1 pixel surrounding (cx,cy) at (0.5,0.5) offsets.
+    F fx = fract(cx + 0.5f),
+      fy = fract(cy + 0.5f);
+
+    // We'll accumulate the color of all four samples into {r,g,b,a} directly.
+    r = g = b = a = 0;
+
+    float offsets[] = {-0.5f, +0.5f};
+    for (float dy : offsets)
+    for (float dx : offsets) {
+        // (x,y) are the coordinates of this sample point.
+        F x = cx + dx,
+          y = cy + dy;
+
+        // ix_and_ptr() will clamp to the image's bounds for us.
+        const uint32_t* ptr;
+        U32 ix = ix_and_ptr(&ptr, ctx, x,y);
+
+        F sr,sg,sb,sa;
+        from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa);
+
+        // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
+        // are combined in direct proportion to their area overlapping that logical query pixel.
+        //
+        // At positive offsets, the x-axis contribution to that rectangle is fx;
+        // at negative offsets it's (1-fx).  Same deal for y.
+        F sx = (dx > 0) ? fx : 1.0f - fx,
+          sy = (dy > 0) ? fy : 1.0f - fy,
+          area = sx * sy;
+
+        r += sr * area;
+        g += sg * area;
+        b += sb * area;
+        a += sa * area;
+    }
+}
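
To make the area weighting concrete: with, say, fx = 0.25 and fy = 0.75, the
four taps get weights 0.75*0.25, 0.25*0.25, 0.75*0.75, and 0.25*0.75, and the
weights always sum to (fx + (1-fx)) * (fy + (1-fy)) = 1, so a constant image
stays constant. A quick standalone check (not Skia code):

    #include <cstdio>
    #include <initializer_list>

    int main() {
        float fx = 0.25f, fy = 0.75f;   // arbitrary fractional offsets
        float sum = 0.0f;
        for (float dy : {-0.5f, +0.5f})
        for (float dx : {-0.5f, +0.5f}) {
            float sx = (dx > 0) ? fx : 1.0f - fx,
                  sy = (dy > 0) ? fy : 1.0f - fy;
            std::printf("dx=%+.1f dy=%+.1f weight=%.4f\n", dx, dy, sx*sy);
            sum += sx*sy;
        }
        std::printf("sum=%.4f\n", sum);  // always 1.0000
        return 0;
    }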


@@ -313,7 +313,7 @@ bool SkImageShader::onAppendStages(const StageRec& rec) const {
     p->append_matrix(alloc, matrix);
 
     auto gather = alloc->make<SkJumper_GatherCtx>();
-    gather->pixels = pm.writable_addr();  // Don't worry, we won't write to it.
+    gather->pixels = pm.addr();
     gather->stride = pm.rowBytesAsPixels();
     gather->width  = pm.width();
     gather->height = pm.height();
@@ -325,6 +325,8 @@ bool SkImageShader::onAppendStages(const StageRec& rec) const {
     limit_y->scale    = pm.height();
     limit_y->invScale = 1.0f / pm.height();
 
+    bool is_srgb = rec.fDstCS && (!info.colorSpace() || info.gammaCloseToSRGB());
+
     auto append_tiling_and_gather = [&] {
         switch (fTileModeX) {
             case kClamp_TileMode: /* The gather_xxx stage will clamp for us. */ break;
@@ -346,11 +348,38 @@ bool SkImageShader::onAppendStages(const StageRec& rec) const {
             case kRGBA_F16_SkColorType: p->append(SkRasterPipeline::gather_f16,  gather); break;
             default: SkASSERT(false);
         }
-        if (rec.fDstCS && (!info.colorSpace() || info.gammaCloseToSRGB())) {
+        if (is_srgb) {
             p->append(SkRasterPipeline::from_srgb);
         }
     };
 
+    auto append_misc = [&] {
+        if (info.colorType() == kAlpha_8_SkColorType) {
+            p->append(SkRasterPipeline::set_rgb, &misc->paint_color);
+        }
+        if (info.colorType() == kAlpha_8_SkColorType ||
+            info.alphaType() == kUnpremul_SkAlphaType) {
+            p->append(SkRasterPipeline::premul);
+        }
+        if (quality > kLow_SkFilterQuality) {
+            // Bicubic filtering naturally produces out of range values on both sides.
+            p->append(SkRasterPipeline::clamp_0);
+            p->append(SkRasterPipeline::clamp_a);
+        }
+        append_gamut_transform(p, alloc, info.colorSpace(), rec.fDstCS, kPremul_SkAlphaType);
+        return true;
+    };
+
+    if (quality == kLow_SkFilterQuality            &&
+        info.colorType() == kRGBA_8888_SkColorType &&
+        fTileModeX == SkShader::kClamp_TileMode    &&
+        fTileModeY == SkShader::kClamp_TileMode    &&
+        !is_srgb) {
+
+        p->append(SkRasterPipeline::bilerp_clamp_8888, gather);
+        return append_misc();
+    }
+
     SkJumper_SamplerCtx* sampler = nullptr;
     if (quality != kNone_SkFilterQuality) {
         sampler = alloc->make<SkJumper_SamplerCtx>();
@@ -366,6 +395,7 @@ bool SkImageShader::onAppendStages(const StageRec& rec) const {
     if (quality == kNone_SkFilterQuality) {
         append_tiling_and_gather();
+
     } else if (quality == kLow_SkFilterQuality) {
         p->append(SkRasterPipeline::save_xy, sampler);
@@ -375,6 +405,7 @@ bool SkImageShader::onAppendStages(const StageRec& rec) const {
         sample(SkRasterPipeline::bilinear_px, SkRasterPipeline::bilinear_py);
 
         p->append(SkRasterPipeline::move_dst_src);
+
     } else {
         p->append(SkRasterPipeline::save_xy, sampler);
@@ -401,17 +432,5 @@ bool SkImageShader::onAppendStages(const StageRec& rec) const {
         p->append(SkRasterPipeline::move_dst_src);
     }
 
-    if (info.colorType() == kAlpha_8_SkColorType) {
-        p->append(SkRasterPipeline::set_rgb, &misc->paint_color);
-    }
-    if (info.colorType() == kAlpha_8_SkColorType || info.alphaType() == kUnpremul_SkAlphaType) {
-        p->append(SkRasterPipeline::premul);
-    }
-    if (quality > kLow_SkFilterQuality) {
-        // Bicubic filtering naturally produces out of range values on both sides.
-        p->append(SkRasterPipeline::clamp_0);
-        p->append(SkRasterPipeline::clamp_a);
-    }
-    append_gamut_transform(p, alloc, info.colorSpace(), rec.fDstCS, kPremul_SkAlphaType);
-    return true;
+    return append_misc();
 }
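
For a sense of why the fused stage wins: before this CL, low-quality bilinear
sampling ran as a chain of small stages, save_xy, then for each of the four
taps an x/y offset stage pair, a gather, and an accumulate, and finally
move_dst_src, while the fast path above appends the single bilerp_clamp_8888
stage instead. A rough illustration (not Skia code; the per-tap expansion of
sample() is inferred from the stage names in this diff, and tiling stages for
repeat/mirror modes are omitted since the fast path requires clamp/clamp):

    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        std::vector<std::string> generic = {"save_xy"};
        const char* taps[][2] = {{"bilinear_nx","bilinear_ny"}, {"bilinear_px","bilinear_ny"},
                                 {"bilinear_nx","bilinear_py"}, {"bilinear_px","bilinear_py"}};
        for (auto& t : taps) {
            generic.push_back(t[0]);          // x offset for this tap
            generic.push_back(t[1]);          // y offset for this tap
            generic.push_back("gather_8888"); // load the tap's pixels
            generic.push_back("accumulate");  // weight and add into the accumulators
        }
        generic.push_back("move_dst_src");

        std::printf("generic bilinear path: %zu stages\n", generic.size());  // 18
        std::printf("fused fast path:       1 stage (bilerp_clamp_8888)\n");
        return 0;
    }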