Evenly spaced gradient stage.

This is still an experiment at this point because I don't know how to do
this kind of thing on ARM.


Numbers from Skylake...
Before:
./out/Release/nanobench --config srgb \
--match gradient_linear_clamp_3color gradient_linear_clamp_hicolor -q                                                                        19:48:13
Timer overhead: 36.7ns
! -> high variance, ? -> moderate variance
    micros      bench
    439.92 ?    gradient_linear_clamp_3color    srgb
   2697.60      gradient_linear_clamp_hicolor   srgb
    437.28      gradient_linear_clamp_3color_4f srgb
   2700.50      gradient_linear_clamp_hicolor_4f        srgb


After:
   micros      bench
    382.35      gradient_linear_clamp_3color    srgb
    593.49      gradient_linear_clamp_hicolor   srgb
    382.36      gradient_linear_clamp_3color_4f srgb
    565.60      gradient_linear_clamp_hicolor_4f        srgb


Numbers on my Mac Trashcan are about even; there is no 
speedup or slowdown between master and this change.

Change-Id: I04402452e23c0888512362fd1d6d5436cea61719
Reviewed-on: https://skia-review.googlesource.com/15960
Commit-Queue: Herb Derby <herb@google.com>
Reviewed-by: Mike Klein <mtklein@chromium.org>
This commit is contained in:
herb 2017-05-11 16:54:23 -04:00 committed by Skia Commit-Bot
parent d95236dab0
commit 892501d09b
5 changed files with 3850 additions and 2750 deletions

View File

@ -96,6 +96,7 @@
M(bicubic_n3x) M(bicubic_n1x) M(bicubic_p1x) M(bicubic_p3x) \
M(bicubic_n3y) M(bicubic_n1y) M(bicubic_p1y) M(bicubic_p3y) \
M(save_xy) M(accumulate) \
M(evenly_spaced_linear_gradient) \
M(linear_gradient) \
M(linear_gradient_2stops) \
M(xy_to_polar_unit) \

View File

@ -5,6 +5,7 @@
* found in the LICENSE file.
*/
#include <algorithm>
#include "Sk4fLinearGradient.h"
#include "SkColorSpace_XYZ.h"
#include "SkGradientShaderPriv.h"
@ -406,51 +407,74 @@ bool SkGradientShaderBase::onAppendStages(SkRasterPipeline* p,
p->append(SkRasterPipeline::linear_gradient_2stops, f_and_b);
} else {
struct Stop { float t; SkPM4f f, b; };
struct Ctx { size_t n; Stop* stops; SkPM4f start; };
auto* ctx = alloc->make<Ctx>();
ctx->start = prepareColor(0);
// For each stop we calculate a bias B and a scale factor F, such that
// for any t between stops n and n+1, the color we want is B[n] + F[n]*t.
auto init_stop = [](float t_l, float t_r, SkPM4f c_l, SkPM4f c_r, Stop *stop) {
auto F = SkPM4f::From4f((c_r.to4f() - c_l.to4f()) / (t_r - t_l));
auto B = SkPM4f::From4f(c_l.to4f() - (F.to4f() * t_l));
*stop = {t_l, F, B};
};
if (fOrigPos == nullptr) {
// Handle evenly distributed stops.
float dt = 1.0f / (fColorCount - 1);
struct Ctx {
size_t stopCount;
float* fs[4];
float* bs[4];
};
auto* ctx = alloc->make<Ctx>();
int stopCount = fColorCount;
float gapCount = stopCount - 1;
// In the evenly distributed case, fColorCount is the number of stops. There are no
// dummy entries.
auto* stopsArray = alloc->makeArrayDefault<Stop>(fColorCount);
// dummy entries. So, there are fColorCount - 1 FBs.
for (int i = 0; i < 4; i++) {
float t_l = 0;
SkPM4f c_l = ctx->start;
// Pad up to 8 in case we hit the AVX2 special case.
ctx->fs[i] = alloc->makeArray<float>(std::max(stopCount, 8));
ctx->bs[i] = alloc->makeArray<float>(std::max(stopCount, 8));
}
auto add_stop = [&](int stop, SkPM4f Fs, SkPM4f Bs) {
(ctx->fs[0])[stop] = Fs.r();
(ctx->fs[1])[stop] = Fs.g();
(ctx->fs[2])[stop] = Fs.b();
(ctx->fs[3])[stop] = Fs.a();
(ctx->bs[0])[stop] = Bs.r();
(ctx->bs[1])[stop] = Bs.g();
(ctx->bs[2])[stop] = Bs.b();
(ctx->bs[3])[stop] = Bs.a();
};
auto init_stop = [&](int stop, SkPM4f c_l, SkPM4f c_r) {
auto Fs = SkPM4f::From4f((c_r.to4f() - c_l.to4f()) * gapCount);
auto Bs = SkPM4f::From4f(c_l.to4f() - (Fs.to4f() * (stop / gapCount)));
add_stop(stop, Fs, Bs);
};
SkPM4f c_l = prepareColor(0);
for (int i = 0; i < fColorCount - 1; i++) {
// Use multiply instead of accumulating error using repeated addition.
float t_r = (i + 1) * dt;
SkPM4f c_r = prepareColor(i + 1);
init_stop(t_l, t_r, c_l, c_r, &stopsArray[i]);
t_l = t_r;
init_stop(i, c_l, c_r);
c_l = c_r;
}
// Force the last stop.
stopsArray[fColorCount - 1].t = 1;
stopsArray[fColorCount - 1].f = SkPM4f::From4f(Sk4f{0});
stopsArray[fColorCount - 1].b = prepareColor(fColorCount - 1);
// Add the last stop.
add_stop(stopCount - 1, SkPM4f::FromPremulRGBA(0,0,0,0), c_l);
ctx->n = fColorCount;
ctx->stops = stopsArray;
ctx->stopCount = stopCount;
p->append(SkRasterPipeline::evenly_spaced_linear_gradient, ctx);
} else {
// Handle arbitrary stops.
struct Stop { float t; SkPM4f f, b; };
struct Ctx { size_t n; Stop* stops; SkPM4f start; };
auto* ctx = alloc->make<Ctx>();
ctx->start = prepareColor(0);
// For each stop we calculate a bias B and a scale factor F, such that
// for any t between stops n and n+1, the color we want is B[n] + F[n]*t.
auto init_stop = [](float t_l, float t_r, SkPM4f c_l, SkPM4f c_r, Stop *stop) {
auto F = SkPM4f::From4f((c_r.to4f() - c_l.to4f()) / (t_r - t_l));
auto B = SkPM4f::From4f(c_l.to4f() - (F.to4f() * t_l));
*stop = {t_l, F, B};
};
// Remove the dummy stops inserted by SkGradientShaderBase::SkGradientShaderBase
// because they are naturally handled by the search method.
int firstStop;
@ -491,9 +515,8 @@ bool SkGradientShaderBase::onAppendStages(SkRasterPipeline* p,
ctx->n = stopCount;
ctx->stops = stopsArray;
p->append(SkRasterPipeline::linear_gradient, ctx);
}
p->append(SkRasterPipeline::linear_gradient, ctx);
}
if (!premulGrad && !this->colorsAreOpaque()) {

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1034,6 +1034,52 @@ STAGE(matrix_perspective) {
g = G * rcp(Z);
}
// Gradient stage for stops that are evenly distributed.
//
// The gradient parameter t is read from r.  The stop index is trunc_(t * (stopCount - 1)),
// and each output channel is computed as mad(t, F[index], B[index]) using that stop's
// scale factor F and bias B (i.e. the color is B + F*t).
//
// NOTE(review): assumes t has already been brought into [0, 1] by an earlier stage so the
// index stays within the tables — TODO confirm.
STAGE(evenly_spaced_linear_gradient) {
    // Layout must match the context built by the code that appends this stage:
    // one table of scale factors (fs) and one of biases (bs) per channel,
    // in r, g, b, a order, each indexed by stop.
    struct Ctx {
        size_t stopCount;
        float* fs[4];
        float* bs[4];
    };
    auto gradient = (const Ctx*)ctx;

    auto t    = r;
    auto stop = trunc_(t*(gradient->stopCount - 1));

    // Per-channel scale factors and biases selected for each lane's stop.
    decltype(t) scaleR, scaleG, scaleB, scaleA,
                biasR,  biasG,  biasB,  biasA;

#if defined(JUMPER) && defined(__AVX2__)
    if (gradient->stopCount <=8) {
        // With at most 8 stops a whole table fits in one 256-bit register, so load it and
        // select per-lane entries with a permute rather than a gather.  The tables are
        // padded to at least 8 floats where the context is built, so the full-width
        // load is in bounds.
        scaleR = _mm256_permutevar8x32_ps(_mm256_loadu_ps(gradient->fs[0]), stop);
        biasR  = _mm256_permutevar8x32_ps(_mm256_loadu_ps(gradient->bs[0]), stop);
        scaleG = _mm256_permutevar8x32_ps(_mm256_loadu_ps(gradient->fs[1]), stop);
        biasG  = _mm256_permutevar8x32_ps(_mm256_loadu_ps(gradient->bs[1]), stop);
        scaleB = _mm256_permutevar8x32_ps(_mm256_loadu_ps(gradient->fs[2]), stop);
        biasB  = _mm256_permutevar8x32_ps(_mm256_loadu_ps(gradient->bs[2]), stop);
        scaleA = _mm256_permutevar8x32_ps(_mm256_loadu_ps(gradient->fs[3]), stop);
        biasA  = _mm256_permutevar8x32_ps(_mm256_loadu_ps(gradient->bs[3]), stop);
    } else
#endif
    {
        // General path: look each lane's entry up directly in the tables.
        scaleR = gather(gradient->fs[0], stop);
        biasR  = gather(gradient->bs[0], stop);
        scaleG = gather(gradient->fs[1], stop);
        biasG  = gather(gradient->bs[1], stop);
        scaleB = gather(gradient->fs[2], stop);
        biasB  = gather(gradient->bs[2], stop);
        scaleA = gather(gradient->fs[3], stop);
        biasA  = gather(gradient->bs[3], stop);
    }

    // Same evaluation for both lookup paths.
    r = mad(t, scaleR, biasR);
    g = mad(t, scaleG, biasG);
    b = mad(t, scaleB, biasB);
    a = mad(t, scaleA, biasA);
}
STAGE(linear_gradient) {
struct Stop { float pos; float f[4], b[4]; };
struct Ctx { size_t n; Stop *stops; float start[4]; };