jumper, parametric_{r,g,b,a}

I've tried several ideas for approx_powf():
   1) accumulate integer powers of x, then 4th roots, then 16th roots
   2) continue 1) all the way to 256th roots
   3) decompose into pow2 and log2, exploiting IEEE float layout
   4) slightly tune constants used in 3)
   5) accumulate integer powers of x, then 3+4) with different tuning
   6) follow a source online, basically 5 with finesse
   7) a new source quoting and improving on the method in 6).

7) seems perfect, enough that maybe we can explore improving its speed
at the cost of precision.  Might be nice to get rid of those divides.  If we
allow a small tolerance (2-5) in our tests, we could use the very simple
fast forms from 3) (e.g. PS 5).  I wish I had some images to look at!

Anything involving roots seems to be subverted by poor rsqrt precision.
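
For context, the bit trick behind 3) can be sketched in plain scalar C++.
This is illustration only, not the SkJumper code: the function names are
made up, the constants are the naive bias/scale values rather than the tuned
ones used in the stages, and it assumes x > 0.

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // x = 2^e * m with m in [1,2), so x's raw IEEE-754 bits, scaled by 2^-23
    // and rebased by the exponent bias 127, land near log2(x).
    static float rough_log2(float x) {
        uint32_t bits;
        std::memcpy(&bits, &x, sizeof(bits));
        return bits * (1.0f / (1 << 23)) - 127.0f;
    }

    // The inverse: add the bias back, scale up by 2^23, reinterpret as float.
    static float rough_pow2(float x) {
        uint32_t bits = (uint32_t)((x + 127.0f) * (1 << 23));
        float y;
        std::memcpy(&y, &bits, sizeof(y));
        return y;
    }

    // x^g = 2^(g * log2(x)), which is all approx_powf() really does.
    static float rough_powf(float x, float g) {
        return rough_pow2(rough_log2(x) * g);
    }

    int main() {
        for (float v = 0.1f; v <= 1.0f; v += 0.1f) {
            std::printf("%.2f: rough %.4f  exact %.4f\n",
                        v, rough_powf(v, 2.2f), std::pow(v, 2.2f));
        }
        return 0;
    }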

This change of course affects the pipelines created by the tests for
exponential and full parametric gamma curves.  What's less obvious is
that it also means SkJumper can now for the first time run the pipeline
created by the mixed gamma curves test.  This means we now need to relax
our tolerance for the table-based channel, just like we did when
implementing table_{r,g,b,a}.

This took me an embarrassingly long time to figure out.  *face palm*

Change-Id: I451ee3c970a0a4a4e285f8aa8f6ef709a654d247
Reviewed-on: https://skia-review.googlesource.com/13656
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Matt Sarett <msarett@google.com>
Reviewed-by: Herb Derby <herb@google.com>
Author: Mike Klein, 2017-04-17 19:32:05 -04:00 (committed by Skia Commit-Bot)
Parent: 8f2911f840
Commit: 44375176c0
7 changed files with 4245 additions and 425 deletions

@@ -92,6 +92,7 @@ static K kConstants = {
     M(byte_tables) \
     M(byte_tables_rgb) \
     M(table_r) M(table_g) M(table_b) M(table_a) \
+    M(parametric_r) M(parametric_g) M(parametric_b) M(parametric_a) \
     M(load_a8) \
     M(gather_a8) \
     M(store_a8) \

@@ -96,4 +96,9 @@ struct SkJumper_TableCtx {
     int size;
 };
 
+// This should line up with the memory layout of SkColorSpaceTransferFn.
+struct SkJumper_ParametricTransferFunction {
+    float G, A,B,C,D,E,F;
+};
+
 #endif//SkJumper_DEFINED
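
The seven floats here follow the usual piecewise parametric form, the same one
the new parametric() stage evaluates per channel further down.  A scalar
reference for illustration only; the mirror struct and the sRGB coefficients
below are just an example, not part of the commit:

    #include <algorithm>
    #include <cmath>

    struct ParametricTF { float G, A, B, C, D, E, F; };  // mirrors the layout above

    //   v <= D :  C*v + F          (linear toe)
    //   v >  D :  (A*v + B)^G + E  (power segment)
    // clamped to [0,1], matching what the SIMD stage does.
    static float eval_parametric(float v, const ParametricTF& tf) {
        float r = (v <= tf.D) ? tf.C * v + tf.F
                              : std::pow(tf.A * v + tf.B, tf.G) + tf.E;
        return std::min(std::max(r, 0.0f), 1.0f);
    }

    // sRGB's transfer function expressed with these coefficients:
    static const ParametricTF kSRGB = {
        2.4f, 1/1.055f, 0.055f/1.055f, 1/12.92f, 0.04045f, 0.0f, 0.0f,
    };

    // e.g. eval_parametric(0.5f, kSRGB) is roughly 0.214.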

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -671,6 +671,40 @@ STAGE(table_g) { g = table(g, ctx); }
 STAGE(table_b) { b = table(b, ctx); }
 STAGE(table_a) { a = table(a, ctx); }
 
+// See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
+SI F approx_log2(F x) {
+    // e is a fair approximation of log2(x) in its own right...
+    F e = cast(bit_cast<U32>(x)) * C(1.0f / (1<<23)) - 127.0_f;
+    // ... but using the mantissa to refine its error is _much_ better.
+    F m = bit_cast<F>((bit_cast<U32>(x) & 0x007fffff_i) | 0x3f000000_i);
+    return e
+         + 2.774485010_f
+         - 1.498030302_f * m
+         - 1.725879990_f / (0.3520887068_f + m);
+}
+
+SI F approx_pow2(F x) {
+    F f = fract(x);
+    return bit_cast<F>(round(C(1.0f * (1<<23)),
+                             x + 121.2740575_f
+                               - 1.490129070_f * f
+                               + 27.72802330_f / (4.84252568_f - f)));
+}
+
+SI F approx_powf(F x, float g) {
+    return approx_pow2(approx_log2(x) * g);
+}
+
+SI F parametric(F v, const SkJumper_ParametricTransferFunction* ctx) {
+    F r = if_then_else(v <= ctx->D, mad(ctx->C, v, ctx->F)
+                                  , approx_powf(mad(ctx->A, v, ctx->B), ctx->G) + ctx->E);
+    return min(max(r, 0), 1.0_f);  // Clamp to [0,1], with argument order mattering to handle NaN.
+}
+
+STAGE(parametric_r) { r = parametric(r, ctx); }
+STAGE(parametric_g) { g = parametric(g, ctx); }
+STAGE(parametric_b) { b = parametric(b, ctx); }
+STAGE(parametric_a) { a = parametric(a, ctx); }
 
 STAGE(load_a8) {
     auto ptr = *(const uint8_t**)ctx + x;
 
@@ -954,7 +988,6 @@ STAGE(save_xy) {
     // Whether bilinear or bicubic, all sample points are at the same fractional offset (fx,fy).
     // They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid
     // surrounding (x,y) at (0.5,0.5) off-center.
-    auto fract = [](F v) { return v - floor_(v); };
     F fx = fract(r + 0.5_f),
       fy = fract(g + 0.5_f);
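
If we do explore trading precision for speed as mentioned above, a small scalar
harness like this could put a number on the worst-case error.  Hypothetical and
standalone: max_error_in_bytes and the 2^16 sample count are made up for
illustration, and rough_powf refers to the sketch earlier in this message.

    #include <algorithm>
    #include <cmath>

    // Worst-case error of an approximate power function vs. std::pow over (0,1],
    // reported in 8-bit (0-255) units.
    template <typename ApproxPow>
    static float max_error_in_bytes(ApproxPow approx, float gamma) {
        float worst = 0.0f;
        for (int i = 1; i <= (1 << 16); i++) {   // skip 0; the bit trick assumes x > 0
            float x     = i * (1.0f / (1 << 16));
            float exact = std::pow(x, gamma);
            worst = std::max(worst, std::fabs(approx(x, gamma) - exact) * 255.0f);
        }
        return worst;
    }

    // e.g. max_error_in_bytes(rough_powf, 2.2f) reports how many 8-bit steps off we are.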

@@ -626,4 +626,6 @@ SI U16 bswap(U16 x) {
 #endif
 }
 
+SI F fract(F v) { return v - floor_(v); }
+
 #endif//SkJumper_vectors_DEFINED

@@ -252,6 +252,9 @@ DEF_TEST(ColorSpaceXform_NonMatchingGamma, r) {
     gammas->fType[0] = SkGammas::Type::kValue_Type;
     gammas->fData[0].fValue = 1.2f;
+    // See ColorSpaceXform_TableGamma... we've decided to allow some tolerance
+    // for SkJumper's implementation of tables.
+    const int tolerance = 12;
 
     gammas->fType[1] = SkGammas::Type::kTable_Type;
     gammas->fData[1].fTable.fSize = tableSize;
     gammas->fData[1].fTable.fOffset = 0;
@@ -260,7 +263,7 @@ DEF_TEST(ColorSpaceXform_NonMatchingGamma, r) {
     gammas->fData[2].fParamOffset = sizeof(float) * tableSize;
 
     test_identity_xform(r, gammas, true);
-    test_identity_xform_A2B(r, kNonStandard_SkGammaNamed, gammas);
+    test_identity_xform_A2B(r, kNonStandard_SkGammaNamed, gammas, tolerance);
 }
 
 DEF_TEST(ColorSpaceXform_A2BCLUT, r) {