jumper, parametric_{r,g,b,a}

I've tried a couple of ideas for approx_powf(): 1) accumulate integer powers of x, then 4th roots, then 16th roots 2) continue 1) all the way to 256th roots 3) decompose into pow2 and log2, exploiting IEEE float layout 4) slightly tune constants used in 3) 5) accumulate integer powers of x, then 3+4) with different tuning 6) follow a source online, basically 5 with finesse 7) a new source quoting and improving on the method in 6). 7) seems perfect, enough that maybe we can explore improving its speed at the cost of precision. Might be nice to get rid of those divides. If we allow a small tolerance (2-5) in our tests, we could use the very simple fast forms from 3) (e.g. PS 5). I wish I had some images to look at! Anything involving roots seems to be subverted by poor rsqrt precision. This change of course affects the pipelines created by the tests for exponential and full parametric gamma curves. What's less obvious is that it also means SkJumper can now for the first time run the pipeline created by the mixed gamma curves test. This means we now need to relax our tolerance for the table-based channel, just like we did when implementing table_{r,g,b,a}. This took me an embarrassingly long time to figure out. *face palm* Change-Id: I451ee3c970a0a4a4e285f8aa8f6ef709a654d247 Reviewed-on: https://skia-review.googlesource.com/13656 Commit-Queue: Mike Klein <mtklein@chromium.org> Reviewed-by: Matt Sarett <msarett@google.com> Reviewed-by: Herb Derby <herb@google.com>
2017-04-17 19:32:05 -04:00 · 2017-04-17 19:32:05 -04:00 · 44375176c0
commit 44375176c0
parent 8f2911f840
7 changed files with 4245 additions and 425 deletions
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@ -92,6 +92,7 @@ static K kConstants = {
    M(byte_tables)        \
    M(byte_tables_rgb)    \
    M(table_r) M(table_g) M(table_b) M(table_a) \
+    M(parametric_r) M(parametric_g) M(parametric_b) M(parametric_a) \
    M(load_a8)            \
    M(gather_a8)          \
    M(store_a8)           \
--- a/src/jumper/SkJumper.h
+++ b/src/jumper/SkJumper.h
@ -96,4 +96,9 @@ struct SkJumper_TableCtx {
    int          size;
 };

+// This should line up with the memory layout of SkColorSpaceTransferFn.
+struct SkJumper_ParametricTransferFunction {  // NOTE(review): SkColorSpaceTransferFn is not visible here; confirm the field order matches.
+    float G, A,B,C,D,E,F;  // Read by parametric() in SkJumper_stages.cpp: D is the branch threshold, G the exponent passed to approx_powf().
+};
+
 #endif//SkJumper_DEFINED
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@ -671,6 +671,40 @@ STAGE(table_g) { g = table(g, ctx); }
 STAGE(table_b) { b = table(b, ctx); }
 STAGE(table_a) { a = table(a, ctx); }

+// See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
+SI F approx_log2(F x) {  // Approximates log2(x) from x's bit pattern; assumes IEEE-754 single-precision values.
+    // e is a fair approximation of log2(x) in its own right...
+    F e = cast(bit_cast<U32>(x)) * C(1.0f / (1<<23)) - 127.0_f;  // Bits read as an integer, scaled by 2^-23, minus 127 (the float32 exponent bias).

+    // ... but using the mantissa to refine its error is _much_ better.
+    F m = bit_cast<F>((bit_cast<U32>(x) & 0x007fffff_i) | 0x3f000000_i);  // Low 23 bits of x under the exponent bits of 0.5f, so m is in [0.5, 1).
+    return e
+         + 2.774485010_f
+         - 1.498030302_f * m
+         - 1.725879990_f / (0.3520887068_f + m);  // Correction depends on m only; constants presumably come from the article linked above.
+}
+SI F approx_pow2(F x) {  // Approximates 2^x by constructing the result's float bit pattern directly.
+    F f = fract(x);  // fract(v) is v - floor_(v), defined in SkJumper_vectors.h.
+    return bit_cast<F>(round(C(1.0f * (1<<23)),  // NOTE(review): round() is defined elsewhere; presumably multiplies its two arguments and rounds to integer bits -- confirm.
+                x + 121.2740575_f
+                - 1.490129070_f * f
+                + 27.72802330_f / (4.84252568_f - f)));  // Rational correction in the fractional part f.
+}
+
+SI F approx_powf(F x, float g) {  // Approximates x^g via x^g == 2^(g * log2(x)); g is a plain float exponent shared by the whole F value.
+    return approx_pow2(approx_log2(x) * g);
+}
+
+SI F parametric(F v, const SkJumper_ParametricTransferFunction* ctx) {  // Applies the transfer function described by ctx to one channel's values.
+    F r = if_then_else(v <= ctx->D, mad(ctx->C, v, ctx->F)  // v <= D: mad(C, v, F), presumably C*v + F (mad is defined elsewhere).
+                                  , approx_powf(mad(ctx->A, v, ctx->B), ctx->G) + ctx->E);  // Otherwise: approximately (A*v + B)^G + E, under the same reading of mad.
+    return min(max(r, 0), 1.0_f);  // Clamp to [0,1], with argument order mattering to handle NaN.
+}
+STAGE(parametric_r) { r = parametric(r, ctx); }  // One stage per channel: each body reassigns only its own channel through parametric().
+STAGE(parametric_g) { g = parametric(g, ctx); }
+STAGE(parametric_b) { b = parametric(b, ctx); }
+STAGE(parametric_a) { a = parametric(a, ctx); }
+
 STAGE(load_a8) {
    auto ptr = *(const uint8_t**)ctx + x;

@ -954,7 +988,6 @@ STAGE(save_xy) {
    // Whether bilinear or bicubic, all sample points are at the same fractional offset (fx,fy).
    // They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid
    // surrounding (x,y) at (0.5,0.5) off-center.
-    auto fract = [](F v) { return v - floor_(v); };
    F fx = fract(r + 0.5_f),
      fy = fract(g + 0.5_f);

--- a/src/jumper/SkJumper_vectors.h
+++ b/src/jumper/SkJumper_vectors.h
@ -626,4 +626,6 @@ SI U16 bswap(U16 x) {
 #endif
 }

+SI F fract(F v) { return v - floor_(v); }  // Hoisted from a lambda in save_xy so approx_pow2 can share it.
+
 #endif//SkJumper_vectors_DEFINED
--- a/tests/ColorSpaceXformTest.cpp
+++ b/tests/ColorSpaceXformTest.cpp
@ -252,6 +252,9 @@ DEF_TEST(ColorSpaceXform_NonMatchingGamma, r) {
    gammas->fType[0] = SkGammas::Type::kValue_Type;
    gammas->fData[0].fValue = 1.2f;

+    // See ColorSpaceXform_TableGamma... we've decided to allow some tolerance
+    // for SkJumper's implementation of tables.
+    const int tolerance = 12;
    gammas->fType[1] = SkGammas::Type::kTable_Type;
    gammas->fData[1].fTable.fSize = tableSize;
    gammas->fData[1].fTable.fOffset = 0;
@ -260,7 +263,7 @@ DEF_TEST(ColorSpaceXform_NonMatchingGamma, r) {
    gammas->fData[2].fParamOffset = sizeof(float) * tableSize;

    test_identity_xform(r, gammas, true);
-    test_identity_xform_A2B(r, kNonStandard_SkGammaNamed, gammas);
+    test_identity_xform_A2B(r, kNonStandard_SkGammaNamed, gammas, tolerance);
 }

 DEF_TEST(ColorSpaceXform_A2BCLUT, r) {