Reland "Create looping binary-search gradient colorizer."

This is a reland of e2fa96ba4a

Original change's description:
> Create looping binary-search gradient colorizer.
>
> This allows us to dramatically increase the number of gradient stops
> before falling back to sampling from a texture (which smears hardstops
> and shows artifacts in extreme edge cases). The analytic colorizer
> doesn't suffer from these artifacts and blurriness effects.
>
> In nanobench, this change comes at a performance penalty for some tests:
> http://go/paste/6302350793768960
>
> The texture path might have a bit of an unfair advantage here, if the
> gradient texture can just be uploaded once and reused from the cache
> repeatedly. Presumably the setup cost of texture generation and upload
> is fairly expensive, but nanobench is testing just the steady-state
> render performance. In comparison, the analytic colorizer doesn't have
> a large setup cost.
>
> Change-Id: I71baa539a2c7f9e311ef8125de4ede2fdbf0c2d0
> Bug: skia:8401
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/457499
> Auto-Submit: John Stiles <johnstiles@google.com>
> Commit-Queue: Michael Ludwig <michaelludwig@google.com>
> Reviewed-by: Michael Ludwig <michaelludwig@google.com>

Bug: skia:8401
Change-Id: I389f79909bc1424909481b06d70db285b55648fe
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/458277
Reviewed-by: Brian Osman <brianosman@google.com>
Auto-Submit: John Stiles <johnstiles@google.com>
Commit-Queue: John Stiles <johnstiles@google.com>
2021-10-11 20:06:09 -04:00 · 2021-10-11 20:06:09 -04:00 · c9f160b8dd
commit c9f160b8dd
parent 9a2adfec2c
1 changed files with 146 additions and 9 deletions
--- a/src/gpu/gradients/GrGradientShader.cpp
+++ b/src/gpu/gradients/GrGradientShader.cpp
@ -10,6 +10,7 @@
 #include "src/gpu/gradients/GrGradientBitmapCache.h"

 #include "include/gpu/GrRecordingContext.h"
+#include "src/core/SkMathPriv.h"
 #include "src/core/SkRuntimeEffectPriv.h"
 #include "src/gpu/GrCaps.h"
 #include "src/gpu/GrColor.h"
@ -128,7 +129,7 @@ static std::unique_ptr<GrFragmentProcessor> make_dual_interval_colorizer(const S
 // This works on ES2 hardware that doesn't support non-constant array indexes.
 // However, to keep code size under control, we are limited to a small number of stops.
 static constexpr int kMaxUnrolledColorCount    = 16;
-static constexpr int kMaxUnrolledIntervalCount = 8;
+static constexpr int kMaxUnrolledIntervalCount = kMaxUnrolledColorCount / 2;

 static std::unique_ptr<GrFragmentProcessor> make_unrolled_colorizer(int intervalCount,
                                                                    const SkPMColor4f* scale,
@ -238,6 +239,97 @@ static std::unique_ptr<GrFragmentProcessor> make_unrolled_colorizer(int interval
                          "bias", SkMakeSpan(bias, intervalCount));
 }

+// The "looping" colorizer uses a real loop to binary-search the array of gradient stops.
+static constexpr int kMaxLoopingColorCount    = 128;  // upper bound on the number of input color stops
+static constexpr int kMaxLoopingIntervalCount = kMaxLoopingColorCount / 2;  // upper bound on intervals; also sizes the effect cache below
+// Returns a GrSkSLFP that evaluates `t * scale[i] + bias[i]` for the interval `i` selected by comparing t (read from coord.x) against `thresholds`.
+static std::unique_ptr<GrFragmentProcessor> make_looping_colorizer(int intervalCount,  // entries in `scale`/`bias`; a multiple of 4 in [1, kMaxLoopingIntervalCount]
+                                                                   const SkPMColor4f* scale,  // per-interval multiplier applied to t
+                                                                   const SkPMColor4f* bias,  // per-interval offset added after scaling
+                                                                   const SkScalar* thresholds) {  // `intervalCount` scalars, uploaded in groups of four
+    SkASSERT(intervalCount >= 1 && intervalCount <= kMaxLoopingIntervalCount);
+    SkASSERT((intervalCount & 3) == 0);  // intervals are required to come in groups of four
+    int intervalChunks = intervalCount / 4;  // one half4 of thresholds per chunk
+    int cacheIndex = (size_t)intervalChunks - 1;  // NOTE(review): the size_t cast looks unnecessary, the result is stored in an int
+
+    struct EffectCacheEntry {
+        SkOnce once;  // guards the one-time construction of `effect`
+        sk_sp<SkRuntimeEffect> effect;  // runtime effect generated for one particular chunk count
+    };
+
+    static EffectCacheEntry effectCache[kMaxLoopingIntervalCount / 4];  // one slot per possible chunk count, indexed by (chunks - 1)
+    SkASSERT(cacheIndex >= 0 && cacheIndex < (int)SK_ARRAY_COUNT(effectCache));
+    EffectCacheEntry* cacheEntry = &effectCache[cacheIndex];
+
+    cacheEntry->once([intervalCount, intervalChunks, cacheEntry] {  // generates the SkSL text for this chunk count and stores the resulting effect
+        SkString sksl;
+
+        // Binary search for the interval that `t` falls within. We can precalculate the number of
+        // loop iterations we need, and we know `t` will always be in range, so we can just loop a
+        // fixed number of times and can be guaranteed to have found the proper element.
+        //
+        // Threshold values are stored in half4s to keep them compact, so the last two rounds of
+        // binary search are hand-unrolled to allow them to use swizzles.
+        //
+        // Note that this colorizer is also designed to handle the case of exactly 4 intervals (a
+        // single chunk). In this case, the binary search for-loop will optimize away entirely, as
+        // it can be proven to execute zero times. We also optimize away the calculation of `4 *
+        // chunk` near the end via an @if statement, as the result will always be in chunk 0.
+        int loopCount = SkNextLog2(intervalChunks);  // number of chunk-level search iterations baked into the shader
+        sksl.appendf(R"(
+        uniform half4 thresholds[%d];
+        uniform float4 scale[%d];
+        uniform float4 bias[%d];
+
+        half4 main(float2 coord) {
+            half t = half(coord.x);
+
+            // Choose a chunk from thresholds via binary search in a loop.
+            int low = 0;
+            int high = %d;
+            int chunk = %d;
+            for (int loop = 0; loop < %d; ++loop) {
+                if (t < thresholds[chunk].w) {
+                    high = chunk;
+                } else {
+                    low = chunk + 1;
+                }
+                chunk = (low + high) / 2;
+            }
+
+            // Choose the final position via explicit 4-way binary search.
+            int pos;
+            if (t < thresholds[chunk].y) {
+                pos = (t < thresholds[chunk].x) ? 0 : 1;
+            } else {
+                pos = (t < thresholds[chunk].z) ? 2 : 3;
+            }
+            @if (%d > 0) {
+                pos += 4 * chunk;
+            }
+            return t * scale[pos] + bias[pos];
+        }
+        )", /* thresholds: */ intervalChunks,
+            /* scale: */ intervalCount,
+            /* bias: */ intervalCount,
+            /* high: */ intervalChunks - 1,
+            /* chunk: */ (intervalChunks - 1) / 2,
+            /* loopCount: */ loopCount,
+            /* @if (loopCount > 0): */ loopCount);
+
+        auto result = SkRuntimeEffect::MakeForShader(std::move(sksl),
+                                                     SkRuntimeEffectPriv::ES3Options());
+        SkASSERTF(result.effect, "%s", result.errorText.c_str());  // NOTE(review): assert only; presumably a failed compile leaves a null effect in release builds - confirm
+        cacheEntry->effect = std::move(result.effect);
+    });
+
+    return GrSkSLFP::Make(cacheEntry->effect, "LoopingBinaryColorizer",
+                          /*inputFP=*/nullptr, GrSkSLFP::OptFlags::kNone,
+                          "thresholds", SkMakeSpan((const SkV4*)thresholds, intervalChunks),  // reinterprets the scalar array as `intervalChunks` groups of four
+                          "scale", SkMakeSpan(scale, intervalCount),
+                          "bias", SkMakeSpan(bias, intervalCount));
+}
+
 // Converts an input array of {colors, positions} into an array of {scales, biases, thresholds}.
 // The length of the result array may differ from the input due to hard-stops or empty intervals.
 int build_intervals(int inputLength,
@ -305,6 +397,37 @@ static std::unique_ptr<GrFragmentProcessor> make_unrolled_binary_colorizer(
    return make_unrolled_colorizer(intervalCount, scales, biases, thresholds1_7, thresholds9_13);
 }

+static std::unique_ptr<GrFragmentProcessor> make_looping_binary_colorizer(const SkPMColor4f* colors,  // color stops, passed through to build_intervals
+                                                                          const SkScalar* positions,  // stop positions, passed through to build_intervals
+                                                                          int count) {  // number of stops; more than kMaxLoopingColorCount yields nullptr
+    if (count > kMaxLoopingColorCount) {
+        // Definitely cannot represent this gradient configuration
+        return nullptr;
+    }
+
+    SkPMColor4f scales[kMaxLoopingIntervalCount];
+    SkPMColor4f biases[kMaxLoopingIntervalCount];
+    SkScalar thresholds[kMaxLoopingIntervalCount] = {};  // zero-initialized before build_intervals fills it in
+    int intervalCount = build_intervals(count, colors, positions,
+                                        kMaxLoopingIntervalCount, scales, biases, thresholds);
+    if (intervalCount <= 0) {  // no usable intervals were produced, so report failure to the caller
+        return nullptr;
+    }
+
+    // We round up the number of intervals to the next power of two. This reduces the number of
+    // unique shaders and doesn't require any additional GPU processing power, but this does waste a
+    // handful of uniforms.
+    int roundedSize = std::max(4, SkNextPow2(intervalCount));  // at least 4, because make_looping_colorizer requires a multiple of four
+    SkASSERT(roundedSize <= kMaxLoopingIntervalCount);
+    for (; intervalCount < roundedSize; ++intervalCount) {  // pad by repeating the last real interval into the extra slots
+        thresholds[intervalCount] = thresholds[intervalCount - 1];
+        scales[intervalCount] = scales[intervalCount - 1];
+        biases[intervalCount] = biases[intervalCount - 1];
+    }
+
+    return make_looping_colorizer(intervalCount, scales, biases, thresholds);  // intervalCount now equals roundedSize
+}
+
 // Analyze the shader's color stops and positions and chooses an appropriate colorizer to represent
 // the gradient.
 static std::unique_ptr<GrFragmentProcessor> make_colorizer(const SkPMColor4f* colors,
@ -376,15 +499,29 @@ static std::unique_ptr<GrFragmentProcessor> make_colorizer(const SkPMColor4f* co
        return nullptr;
    };

-    // Attempt to create an analytic colorizer.
-    if ((count <= kMaxUnrolledColorCount) && !intervalsExceedPrecisionLimit()) {
-        std::unique_ptr<GrFragmentProcessor> colorizer = makeDualIntervalColorizer();
-        if (colorizer) {
-            return colorizer;
+    if (caps->nonconstantArrayIndexSupport()) {
+        // Attempt to create an analytic colorizer that uses a binary-search loop.
+        if ((count <= kMaxLoopingColorCount) && !intervalsExceedPrecisionLimit()) {
+            std::unique_ptr<GrFragmentProcessor> colorizer = makeDualIntervalColorizer();
+            if (colorizer) {
+                return colorizer;
+            }
+            colorizer = make_looping_binary_colorizer(colors + offset, positions + offset, count);
+            if (colorizer) {
+                return colorizer;
+            }
        }
-        colorizer = make_unrolled_binary_colorizer(colors + offset, positions + offset, count);
-        if (colorizer) {
-            return colorizer;
+    } else {
+        // Attempt to create an analytic colorizer that conforms to ES2 loop limitations.
+        if ((count <= kMaxUnrolledColorCount) && !intervalsExceedPrecisionLimit()) {
+            std::unique_ptr<GrFragmentProcessor> colorizer = makeDualIntervalColorizer();
+            if (colorizer) {
+                return colorizer;
+            }
+            colorizer = make_unrolled_binary_colorizer(colors + offset, positions + offset, count);
+            if (colorizer) {
+                return colorizer;
+            }
        }
    }