Reland "Create looping binary-search gradient colorizer."

This is a reland of e2fa96ba4a

Original change's description:
> Create looping binary-search gradient colorizer.
>
> This allows us to dramatically increase the number of gradient stops
> before falling back to sampling from a texture (which smears hardstops
> and shows artifacts in extreme edge cases). The analytic colorizer
> doesn't suffer from these artifacts and blurriness effects.
>
> In nanobench, this change comes at a performance penalty for some tests:
> http://go/paste/6302350793768960
>
> The texture path might have a bit of an unfair advantage here, if the
> gradient texture can just be uploaded once and reused from the cache
> repeatedly.  Presumably the setup cost of texture generation and upload
> is fairly expensive, but nanobench is testing just the steady-state
> render performance. In comparison, the analytic colorizer doesn't have
> a large setup cost.
>
> Change-Id: I71baa539a2c7f9e311ef8125de4ede2fdbf0c2d0
> Bug: skia:8401
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/457499
> Auto-Submit: John Stiles <johnstiles@google.com>
> Commit-Queue: Michael Ludwig <michaelludwig@google.com>
> Reviewed-by: Michael Ludwig <michaelludwig@google.com>

Bug: skia:8401
Change-Id: I389f79909bc1424909481b06d70db285b55648fe
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/458277
Reviewed-by: Brian Osman <brianosman@google.com>
Auto-Submit: John Stiles <johnstiles@google.com>
Commit-Queue: John Stiles <johnstiles@google.com>
This commit is contained in:
John Stiles 2021-10-11 20:06:09 -04:00 committed by SkCQ
parent 9a2adfec2c
commit c9f160b8dd

View File

@ -10,6 +10,7 @@
#include "src/gpu/gradients/GrGradientBitmapCache.h"
#include "include/gpu/GrRecordingContext.h"
#include "src/core/SkMathPriv.h"
#include "src/core/SkRuntimeEffectPriv.h"
#include "src/gpu/GrCaps.h"
#include "src/gpu/GrColor.h"
@ -128,7 +129,7 @@ static std::unique_ptr<GrFragmentProcessor> make_dual_interval_colorizer(const S
// This works on ES2 hardware that doesn't support non-constant array indexes.
// However, to keep code size under control, we are limited to a small number of stops.
// Largest number of gradient colors for which the unrolled colorizer is attempted.
static constexpr int kMaxUnrolledColorCount = 16;
// Largest number of intervals the unrolled colorizer handles. Derived from the color limit
// (16 / 2 == 8) rather than spelled out separately, so the two limits cannot drift apart.
static constexpr int kMaxUnrolledIntervalCount = kMaxUnrolledColorCount / 2;
static std::unique_ptr<GrFragmentProcessor> make_unrolled_colorizer(int intervalCount,
const SkPMColor4f* scale,
@ -238,6 +239,97 @@ static std::unique_ptr<GrFragmentProcessor> make_unrolled_colorizer(int interval
"bias", SkMakeSpan(bias, intervalCount));
}
// The "looping" colorizer uses a real loop to binary-search the array of gradient stops.
static constexpr int kMaxLoopingColorCount = 128;
static constexpr int kMaxLoopingIntervalCount = kMaxLoopingColorCount / 2;
static std::unique_ptr<GrFragmentProcessor> make_looping_colorizer(int intervalCount,
const SkPMColor4f* scale,
const SkPMColor4f* bias,
const SkScalar* thresholds) {
SkASSERT(intervalCount >= 1 && intervalCount <= kMaxLoopingIntervalCount);
SkASSERT((intervalCount & 3) == 0); // intervals are required to come in groups of four
int intervalChunks = intervalCount / 4;
int cacheIndex = (size_t)intervalChunks - 1;
struct EffectCacheEntry {
SkOnce once;
sk_sp<SkRuntimeEffect> effect;
};
static EffectCacheEntry effectCache[kMaxLoopingIntervalCount / 4];
SkASSERT(cacheIndex >= 0 && cacheIndex < (int)SK_ARRAY_COUNT(effectCache));
EffectCacheEntry* cacheEntry = &effectCache[cacheIndex];
cacheEntry->once([intervalCount, intervalChunks, cacheEntry] {
SkString sksl;
// Binary search for the interval that `t` falls within. We can precalculate the number of
// loop iterations we need, and we know `t` will always be in range, so we can just loop a
// fixed number of times and can be guaranteed to have found the proper element.
//
// Threshold values are stored in half4s to keep them compact, so the last two rounds of
// binary search are hand-unrolled to allow them to use swizzles.
//
// Note that this colorizer is also designed to handle the case of exactly 4 intervals (a
// single chunk). In this case, the binary search for-loop will optimize away entirely, as
// it can be proven to execute zero times. We also optimize away the calculation of `4 *
// chunk` near the end via an @if statement, as the result will always be in chunk 0.
int loopCount = SkNextLog2(intervalChunks);
sksl.appendf(R"(
uniform half4 thresholds[%d];
uniform float4 scale[%d];
uniform float4 bias[%d];
half4 main(float2 coord) {
half t = half(coord.x);
// Choose a chunk from thresholds via binary search in a loop.
int low = 0;
int high = %d;
int chunk = %d;
for (int loop = 0; loop < %d; ++loop) {
if (t < thresholds[chunk].w) {
high = chunk;
} else {
low = chunk + 1;
}
chunk = (low + high) / 2;
}
// Choose the final position via explicit 4-way binary search.
int pos;
if (t < thresholds[chunk].y) {
pos = (t < thresholds[chunk].x) ? 0 : 1;
} else {
pos = (t < thresholds[chunk].z) ? 2 : 3;
}
@if (%d > 0) {
pos += 4 * chunk;
}
return t * scale[pos] + bias[pos];
}
)", /* thresholds: */ intervalChunks,
/* scale: */ intervalCount,
/* bias: */ intervalCount,
/* high: */ intervalChunks - 1,
/* chunk: */ (intervalChunks - 1) / 2,
/* loopCount: */ loopCount,
/* @if (loopCount > 0): */ loopCount);
auto result = SkRuntimeEffect::MakeForShader(std::move(sksl),
SkRuntimeEffectPriv::ES3Options());
SkASSERTF(result.effect, "%s", result.errorText.c_str());
cacheEntry->effect = std::move(result.effect);
});
return GrSkSLFP::Make(cacheEntry->effect, "LoopingBinaryColorizer",
/*inputFP=*/nullptr, GrSkSLFP::OptFlags::kNone,
"thresholds", SkMakeSpan((const SkV4*)thresholds, intervalChunks),
"scale", SkMakeSpan(scale, intervalCount),
"bias", SkMakeSpan(bias, intervalCount));
}
// Converts an input array of {colors, positions} into an array of {scales, biases, thresholds}.
// The length of the result array may differ from the input due to hard-stops or empty intervals.
int build_intervals(int inputLength,
@ -305,6 +397,37 @@ static std::unique_ptr<GrFragmentProcessor> make_unrolled_binary_colorizer(
return make_unrolled_colorizer(intervalCount, scales, biases, thresholds1_7, thresholds9_13);
}
// Converts `count` color stops into interval data and wraps it in the looping binary-search
// colorizer. Returns nullptr when there are more colors than the looping colorizer accepts or when
// build_intervals() reports no usable intervals.
static std::unique_ptr<GrFragmentProcessor> make_looping_binary_colorizer(const SkPMColor4f* colors,
                                                                          const SkScalar* positions,
                                                                          int count) {
    // Too many colors can never be represented, so give up before building anything.
    if (count > kMaxLoopingColorCount) {
        return nullptr;
    }

    SkPMColor4f scales[kMaxLoopingIntervalCount];
    SkPMColor4f biases[kMaxLoopingIntervalCount];
    SkScalar thresholds[kMaxLoopingIntervalCount] = {};
    const int builtCount = build_intervals(count, colors, positions,
                                           kMaxLoopingIntervalCount, scales, biases, thresholds);
    if (builtCount <= 0) {
        return nullptr;
    }

    // Pad the interval list up to a power of two (and to at least four entries). Doing so cuts
    // down the number of distinct shaders and adds no GPU work; the only cost is a few extra
    // uniforms. Every padding slot repeats the final real interval.
    const int paddedCount = std::max(4, SkNextPow2(builtCount));
    SkASSERT(paddedCount <= kMaxLoopingIntervalCount);

    const int lastBuilt = builtCount - 1;
    for (int slot = builtCount; slot < paddedCount; ++slot) {
        thresholds[slot] = thresholds[lastBuilt];
        scales[slot] = scales[lastBuilt];
        biases[slot] = biases[lastBuilt];
    }

    return make_looping_colorizer(paddedCount, scales, biases, thresholds);
}
// Analyze the shader's color stops and positions and chooses an appropriate colorizer to represent
// the gradient.
static std::unique_ptr<GrFragmentProcessor> make_colorizer(const SkPMColor4f* colors,
@ -376,15 +499,29 @@ static std::unique_ptr<GrFragmentProcessor> make_colorizer(const SkPMColor4f* co
return nullptr;
};
// Attempt to create an analytic colorizer.
if ((count <= kMaxUnrolledColorCount) && !intervalsExceedPrecisionLimit()) {
std::unique_ptr<GrFragmentProcessor> colorizer = makeDualIntervalColorizer();
if (colorizer) {
return colorizer;
if (caps->nonconstantArrayIndexSupport()) {
// Attempt to create an analytic colorizer that uses a binary-search loop.
if ((count <= kMaxLoopingColorCount) && !intervalsExceedPrecisionLimit()) {
std::unique_ptr<GrFragmentProcessor> colorizer = makeDualIntervalColorizer();
if (colorizer) {
return colorizer;
}
colorizer = make_looping_binary_colorizer(colors + offset, positions + offset, count);
if (colorizer) {
return colorizer;
}
}
colorizer = make_unrolled_binary_colorizer(colors + offset, positions + offset, count);
if (colorizer) {
return colorizer;
} else {
// Attempt to create an analytic colorizer that conforms to ES2 loop limitations.
if ((count <= kMaxUnrolledColorCount) && !intervalsExceedPrecisionLimit()) {
std::unique_ptr<GrFragmentProcessor> colorizer = makeDualIntervalColorizer();
if (colorizer) {
return colorizer;
}
colorizer = make_unrolled_binary_colorizer(colors + offset, positions + offset, count);
if (colorizer) {
return colorizer;
}
}
}