Evenly space gradient stage.

This seems like an experiment at this point because I don't know how to do this kind of thing on ARM.

Numbers from Skylake...

Before:
./out/Release/nanobench --config srgb \
    --match gradient_linear_clamp_3color gradient_linear_clamp_hicolor -q
19:48:13 Timer overhead: 36.7ns
! -> high variance, ? -> moderate variance
    micros   bench
    439.92 ? gradient_linear_clamp_3color srgb
   2697.60   gradient_linear_clamp_hicolor srgb
    437.28   gradient_linear_clamp_3color_4f srgb
   2700.50   gradient_linear_clamp_hicolor_4f srgb

After:
    micros   bench
    382.35   gradient_linear_clamp_3color srgb
    593.49   gradient_linear_clamp_hicolor srgb
    382.36   gradient_linear_clamp_3color_4f srgb
    565.60   gradient_linear_clamp_hicolor_4f srgb

Numbers on my Mac Trashcan are about even; there is no speedup or slowdown between master and this change.

Change-Id: I04402452e23c0888512362fd1d6d5436cea61719
Reviewed-on: https://skia-review.googlesource.com/15960
Commit-Queue: Herb Derby <herb@google.com>
Reviewed-by: Mike Klein <mtklein@chromium.org>
2017-05-11 16:54:23 -04:00 · 2017-05-11 16:54:23 -04:00 · 892501d09b
commit 892501d09b
parent d95236dab0
5 changed files with 3850 additions and 2750 deletions
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@ -96,6 +96,7 @@
    M(bicubic_n3x) M(bicubic_n1x) M(bicubic_p1x) M(bicubic_p3x)  \
    M(bicubic_n3y) M(bicubic_n1y) M(bicubic_p1y) M(bicubic_p3y)  \
    M(save_xy) M(accumulate)                                     \
+    M(evenly_spaced_linear_gradient)                             \
    M(linear_gradient)                                           \
    M(linear_gradient_2stops)                                    \
    M(xy_to_polar_unit)                                          \
--- a/src/effects/gradients/SkGradientShader.cpp
+++ b/src/effects/gradients/SkGradientShader.cpp
@ -5,6 +5,7 @@
 * found in the LICENSE file.
 */

+#include <algorithm>
 #include "Sk4fLinearGradient.h"
 #include "SkColorSpace_XYZ.h"
 #include "SkGradientShaderPriv.h"
@ -406,51 +407,74 @@ bool SkGradientShaderBase::onAppendStages(SkRasterPipeline* p,

        p->append(SkRasterPipeline::linear_gradient_2stops, f_and_b);
    } else {
-
-        struct Stop { float t; SkPM4f f, b; };
-        struct Ctx { size_t n; Stop* stops; SkPM4f start; };
-
-        auto* ctx = alloc->make<Ctx>();
-        ctx->start = prepareColor(0);
-
-        // For each stop we calculate a bias B and a scale factor F, such that
-        // for any t between stops n and n+1, the color we want is B[n] + F[n]*t.
-        auto init_stop = [](float t_l, float t_r, SkPM4f c_l, SkPM4f c_r, Stop *stop) {
-            auto F = SkPM4f::From4f((c_r.to4f() - c_l.to4f()) / (t_r - t_l));
-            auto B = SkPM4f::From4f(c_l.to4f() - (F.to4f() * t_l));
-            *stop = {t_l, F, B};
-        };
-
        if (fOrigPos == nullptr) {
            // Handle evenly distributed stops.

-            float dt = 1.0f / (fColorCount - 1);
+            struct Ctx {
+                size_t stopCount;
+                float* fs[4];
+                float* bs[4];
+            };
+
+            auto* ctx = alloc->make<Ctx>();
+            int stopCount = fColorCount;
+            float gapCount = stopCount - 1;
+
            // In the evenly distributed case, fColorCount is the number of stops. There are no
-            // dummy entries.
-            auto* stopsArray = alloc->makeArrayDefault<Stop>(fColorCount);
+            // dummy entries. So, there are fColorCount - 1 FBs.
+            for (int i = 0; i < 4; i++) {

-            float  t_l = 0;
-            SkPM4f c_l = ctx->start;
+                // Pad up to 8 in case we hit the AVX2 special case.
+                ctx->fs[i] = alloc->makeArray<float>(std::max(stopCount, 8));
+                ctx->bs[i] = alloc->makeArray<float>(std::max(stopCount, 8));
+            }
+
+            auto add_stop = [&](int stop, SkPM4f Fs, SkPM4f Bs) {
+                (ctx->fs[0])[stop] = Fs.r();
+                (ctx->fs[1])[stop] = Fs.g();
+                (ctx->fs[2])[stop] = Fs.b();
+                (ctx->fs[3])[stop] = Fs.a();
+                (ctx->bs[0])[stop] = Bs.r();
+                (ctx->bs[1])[stop] = Bs.g();
+                (ctx->bs[2])[stop] = Bs.b();
+                (ctx->bs[3])[stop] = Bs.a();
+            };
+            auto init_stop = [&](int stop, SkPM4f c_l, SkPM4f c_r) {
+                auto Fs = SkPM4f::From4f((c_r.to4f() - c_l.to4f()) * gapCount);
+                auto Bs = SkPM4f::From4f(c_l.to4f() - (Fs.to4f() * (stop / gapCount)));
+                add_stop(stop, Fs, Bs);
+            };
+
+            SkPM4f c_l = prepareColor(0);
            for (int i = 0; i < fColorCount - 1; i++) {
-                // Use multiply instead of accumulating error using repeated addition.
-                float  t_r = (i + 1) * dt;
                SkPM4f c_r = prepareColor(i + 1);
-                init_stop(t_l, t_r, c_l, c_r, &stopsArray[i]);
-
-                t_l = t_r;
+                init_stop(i, c_l, c_r);
                c_l = c_r;
            }

-            // Force the last stop.
-            stopsArray[fColorCount - 1].t = 1;
-            stopsArray[fColorCount - 1].f = SkPM4f::From4f(Sk4f{0});
-            stopsArray[fColorCount - 1].b = prepareColor(fColorCount - 1);
+            // Add the last stop.
+            add_stop(stopCount - 1, SkPM4f::FromPremulRGBA(0,0,0,0), c_l);

-            ctx->n = fColorCount;
-            ctx->stops = stopsArray;
+            ctx->stopCount = stopCount;
+
+            p->append(SkRasterPipeline::evenly_spaced_linear_gradient, ctx);
        } else {
            // Handle arbitrary stops.

+            struct Stop { float t; SkPM4f f, b; };
+            struct Ctx { size_t n; Stop* stops; SkPM4f start; };
+
+            auto* ctx = alloc->make<Ctx>();
+            ctx->start = prepareColor(0);
+
+            // For each stop we calculate a bias B and a scale factor F, such that
+            // for any t between stops n and n+1, the color we want is B[n] + F[n]*t.
+            auto init_stop = [](float t_l, float t_r, SkPM4f c_l, SkPM4f c_r, Stop *stop) {
+                auto F = SkPM4f::From4f((c_r.to4f() - c_l.to4f()) / (t_r - t_l));
+                auto B = SkPM4f::From4f(c_l.to4f() - (F.to4f() * t_l));
+                *stop = {t_l, F, B};
+            };
+
            // Remove the dummy stops inserted by SkGradientShaderBase::SkGradientShaderBase
            // because they are naturally handled by the search method.
            int firstStop;
@ -491,9 +515,8 @@ bool SkGradientShaderBase::onAppendStages(SkRasterPipeline* p,

            ctx->n = stopCount;
            ctx->stops = stopsArray;
+            p->append(SkRasterPipeline::linear_gradient, ctx);
        }
-
-        p->append(SkRasterPipeline::linear_gradient, ctx);
    }

    if (!premulGrad && !this->colorsAreOpaque()) {
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@ -1034,6 +1034,52 @@ STAGE(matrix_perspective) {
    g = G * rcp(Z);
 }

+STAGE(evenly_spaced_linear_gradient) {  // Gradient stage for evenly spaced stops: one table lookup per channel, then one mad.
+    struct Ctx {  // Must match the Ctx that SkGradientShaderBase::onAppendStages builds for this stage.
+        size_t stopCount;  // Number of stops; the builder stores the final stop with F = 0 and B = last color.
+        float* fs[4];  // Scale factors F, one table per channel in r,g,b,a order, indexed by stop.
+        float* bs[4];  // Biases B, same layout as fs.
+    };
+    // For t inside stop interval i the builder arranges that color = B[i] + F[i]*t.
+    auto c = (const Ctx*)ctx;
+    auto t = r;  // The gradient parameter is read from r. NOTE(review): presumably already clamped to [0,1] by an earlier stage -- confirm.
+    auto i = trunc_(t*(c->stopCount - 1));  // Interval index: there are stopCount-1 equal-width gaps, so scale t and truncate.
+    // AVX2 special case: with at most 8 stops each table fits in one 8-float vector, so a permute replaces the gathers.
+#if defined(JUMPER) && defined(__AVX2__)
+    if (c->stopCount <=8) {  // The builder pads every table to at least 8 floats, so the 8-wide loads below stay in bounds.
+        auto fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), i);  // Select F/B per lane using i as the permute index.
+        auto br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), i);
+        auto fg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), i);
+        auto bg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), i);
+        auto fb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), i);
+        auto bb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), i);
+        auto fa = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), i);
+        auto ba = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), i);
+        r = mad(t, fr, br);  // NOTE(review): mad(t, f, b) is presumably t*f + b, i.e. B + F*t -- confirm against its definition.
+        g = mad(t, fg, bg);
+        b = mad(t, fb, bb);
+        a = mad(t, fa, ba);
+
+    } else
+#endif  // JUMPER && __AVX2__
+    {  // General path (any stop count, any target): fetch F and B for each lane's index from the tables.
+        auto fr = gather(c->fs[0], i);  // NOTE(review): gather(p, i) presumably loads p[i] per lane -- confirm.
+        auto br = gather(c->bs[0], i);
+        auto fg = gather(c->fs[1], i);
+        auto bg = gather(c->bs[1], i);
+        auto fb = gather(c->fs[2], i);
+        auto bb = gather(c->bs[2], i);
+        auto fa = gather(c->fs[3], i);
+        auto ba = gather(c->bs[3], i);
+        // Same evaluation as the AVX2 branch; only the table lookup differs.
+        r = mad(t, fr, br);
+        g = mad(t, fg, bg);
+        b = mad(t, fb, bb);
+        a = mad(t, fa, ba);
+    }
+
+}
+
 STAGE(linear_gradient) {
    struct Stop { float pos; float f[4], b[4]; };
    struct Ctx { size_t n; Stop *stops; float start[4]; };