Rework out-of-gamut handling in SkRasterPipeline

Instead of trying to carefully manage the in-gamut / out-of-gamut state
of the pipeline, let's do what a GPU would do, clamping to representable
range in any float -> integer conversion.

Most effects doing table lookups now clamp themselves internally, and
the store_foo() methods clamp when the destination is fixed point.  In
turn, the from_srgb() conversions and all future transfer function stages
no longer need to care about this stuff.
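
In code terms, the approach amounts to clamping at the moment of conversion.
A minimal scalar sketch of the idea (hypothetical names, not the real SIMD
stages; the actual helper this change adds is to_unorm() in the SkJumper
stages below):

    #include <algorithm>
    #include <cstdint>

    // Sketch of a GPU-style fixed-point store: every float -> integer
    // conversion clamps first, so earlier stages needn't track gamut state.
    static uint8_t to_unorm8(float v) {
        v = std::min(std::max(v, 0.0f), 1.0f);   // out-of-gamut values clamp here
        return (uint8_t)(v * 255.0f + 0.5f);     // then round to nearest
    }

    static uint32_t store_8888_px(float r, float g, float b, float a) {
        return  (uint32_t)to_unorm8(r)
             | ((uint32_t)to_unorm8(g) <<  8)
             | ((uint32_t)to_unorm8(b) << 16)
             | ((uint32_t)to_unorm8(a) << 24);
    }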

If I'm thinking right, the _lowp side of things need not change at all,
and that will soften the performance impact of this change.  Anything
that was fast to begin with was probably running a _lowp pipeline.

Bug: skia:7419

Change-Id: Id2e080ac240a97b900a1ac131c85d9e15f70af32
Reviewed-on: https://skia-review.googlesource.com/85740
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Brian Osman <brianosman@google.com>
Author: Mike Klein, 2017-12-15 09:55:03 -05:00 (committed by Skia Commit-Bot)
parent 5b92ce1b24
commit 37155d476c
8 changed files with 14924 additions and 13709 deletions


@@ -18,7 +18,6 @@ void SkRasterPipeline::reset() {
fStages = nullptr;
fNumStages = 0;
fSlotsNeeded = 1; // We always need one extra slot for just_return().
fClamped = true;
}
void SkRasterPipeline::append(StockStage stage, void* ctx) {
@@ -52,7 +51,6 @@ void SkRasterPipeline::extend(const SkRasterPipeline& src) {
fStages = &stages[src.fNumStages - 1];
fNumStages += src.fNumStages;
fSlotsNeeded += src.fSlotsNeeded - 1; // Don't double count just_returns().
fClamped = fClamped && src.fClamped;
}
void SkRasterPipeline::dump() const {
@@ -125,26 +123,13 @@ void SkRasterPipeline::append_constant_color(SkArenaAlloc* alloc, const float rg
#undef INC_WHITE
#undef INC_COLOR
// It's pretty easy to start with sound premultiplied linear floats, pack those
// to sRGB encoded bytes, then read them back to linear floats and find them not
// quite premultiplied, with a color channel just a smidge greater than the alpha
// channel. This can happen basically any time we have different transfer
// functions for alpha and colors... sRGB being the only one we draw into.
// This is an annoying problem with no known good solution. So apply the clamp hammer.
void SkRasterPipeline::append_from_srgb(SkAlphaType at) {
// TODO: we used to clamp to [0,a] here if at == kPremul, but don't anymore.
// These should no longer need to be special append() methods.
void SkRasterPipeline::append_from_srgb(SkAlphaType) {
this->unchecked_append(from_srgb, nullptr);
if (at == kPremul_SkAlphaType) {
this->append(SkRasterPipeline::clamp_a);
}
}
void SkRasterPipeline::append_from_srgb_dst(SkAlphaType at) {
void SkRasterPipeline::append_from_srgb_dst(SkAlphaType) {
this->unchecked_append(from_srgb_dst, nullptr);
if (at == kPremul_SkAlphaType) {
this->append(SkRasterPipeline::clamp_a_dst);
}
}
//static int gCounts[5] = { 0, 0, 0, 0, 0 };
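
For a concrete feel for the round-trip problem the removed comment above
describes, here is a standalone sketch (plain C++ with the standard piecewise
sRGB transfer functions, not Skia's actual from_srgb/to_srgb approximations):
color channels are sRGB-encoded while alpha is stored linearly, so their
quantization errors differ and a color channel can come back a smidge above
alpha.

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Standard sRGB transfer functions (per the sRGB spec).
    static float srgb_encode(float linear) {
        return linear <= 0.0031308f ? 12.92f * linear
                                    : 1.055f * std::pow(linear, 1.0f/2.4f) - 0.055f;
    }
    static float srgb_decode(float encoded) {
        return encoded <= 0.04045f ? encoded / 12.92f
                                   : std::pow((encoded + 0.055f) / 1.055f, 2.4f);
    }

    int main() {
        float r = 0.5f, a = 0.5f;                               // soundly premultiplied: r == a

        uint8_t r8 = (uint8_t)(srgb_encode(r) * 255.0f + 0.5f); // 188: color is sRGB-encoded
        uint8_t a8 = (uint8_t)(a * 255.0f + 0.5f);              // 128: alpha stays linear

        float r2 = srgb_decode(r8 / 255.0f);                    // ~0.5029
        float a2 = a8 / 255.0f;                                 // ~0.5020

        printf("r=%f a=%f, premultiplied? %s\n", r2, a2, r2 <= a2 ? "yes" : "no");
        // r2 > a2: no longer strictly premultiplied, which is what the old
        // clamp_a append used to paper over.
    }
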
@@ -189,15 +174,6 @@ void SkRasterPipeline::append_matrix(SkArenaAlloc* alloc, const SkMatrix& matrix
}
}
void SkRasterPipeline::clamp_if_unclamped(SkAlphaType alphaType) {
if (!fClamped) {
this->append(SkRasterPipeline::clamp_0);
this->append(alphaType == kPremul_SkAlphaType ? SkRasterPipeline::clamp_a
: SkRasterPipeline::clamp_1);
fClamped = true;
}
}
void SkRasterPipeline::append_seed_shader() {
static const float iota[] = {
0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,


@@ -150,11 +150,6 @@ public:
bool empty() const { return fStages == nullptr; }
// Used to track if we're handling values outside [0.0f, 1.0f],
// and to clamp back to [0.0f, 1.0f] if so.
void set_clamped(bool clamped) { fClamped = clamped; }
void clamp_if_unclamped(SkAlphaType);
private:
struct StageList {
StageList* prev;
@@ -169,7 +164,6 @@ private:
StageList* fStages;
int fNumStages;
int fSlotsNeeded;
bool fClamped;
};
template <size_t bytes>


@@ -258,10 +258,6 @@ void SkRasterPipelineBlitter::append_store(SkRasterPipeline* p) const {
p->append(SkRasterPipeline::dither, &fDitherRate);
}
if (fDst.info().colorType() != kRGBA_F16_SkColorType) {
p->clamp_if_unclamped(kPremul_SkAlphaType);
}
switch (fDst.info().colorType()) {
case kGray_8_SkColorType: p->append(SkRasterPipeline::luminance_to_alpha); // fallthru
case kAlpha_8_SkColorType: p->append(SkRasterPipeline::store_a8, &fDstPtr); break;
@@ -311,7 +307,6 @@ void SkRasterPipelineBlitter::blitRect(int x, int y, int w, int h) {
&& !fDst.colorSpace()
&& fDst.info().alphaType() != kUnpremul_SkAlphaType
&& fDitherRate == 0.0f) {
p.clamp_if_unclamped(kPremul_SkAlphaType);
auto stage = fDst.info().colorType() == kRGBA_8888_SkColorType
? SkRasterPipeline::srcover_rgba_8888
: SkRasterPipeline::srcover_bgra_8888;


@@ -113,13 +113,6 @@ public:
if (fFlags & kG_Flag) { g = ptr; ptr += 256; }
if (fFlags & kB_Flag) { b = ptr; }
// If our inputs are out of range, we'd attempt to read values outside our tables.
// We could finesse this with p->clamp_if_unclamped(kPremul_SkAlphaType) here, but
// this filter is already slow enough that I'd rather just be paranoid and safe.
p->append(SkRasterPipeline::clamp_0);
p->append(SkRasterPipeline::clamp_a);
p->set_clamped(true);
if (!shaderIsOpaque) {
p->append(SkRasterPipeline::unpremul);
}


@@ -40,17 +40,12 @@ void SkToSRGBColorFilter::onAppendStages(SkRasterPipeline* p,
}
// Step 2: Transform to sRGB gamut, without clamping.
// TODO: because...
float* gamut_transform = alloc->makeArrayDefault<float>(12);
if (append_gamut_transform_noclamp(p,
gamut_transform,
fSrcColorSpace.get(),
SkColorSpace::MakeSRGB().get())) {
bool needs_clamp_0, needs_clamp_1;
analyze_3x4_matrix(gamut_transform, &needs_clamp_0, &needs_clamp_1);
if (needs_clamp_0 || needs_clamp_1) {
p->set_clamped(false);
}
}
(void)append_gamut_transform_noclamp(p,
gamut_transform,
fSrcColorSpace.get(),
SkColorSpace::MakeSRGB().get());
// Step 3: Back to sRGB encoding.
p->append(SkRasterPipeline::to_srgb);

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -208,6 +208,19 @@ SI U32 ix_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, F x, F y) {
return trunc_(y)*ctx->stride + trunc_(x);
}
// We often have a nominally [0,1] float value we need to scale and convert to an integer,
// whether for a table lookup or to pack back down into bytes for storage.
//
// In practice, especially when dealing with interesting color spaces, that notionally
// [0,1] float may be out of [0,1] range. Unorms cannot represent that, so we must clamp.
//
// You can adjust the expected input to [0,bias] by tweaking that parameter.
SI U32 to_unorm(F v, F scale, F bias = 1.0f) {
// TODO: platform-specific implementations of to_unorm(), removing round() entirely?
// Any time we use round() we probably want to use to_unorm().
return round(min(max(0, v), bias), scale);
}
// Now finally, normal Stages!
STAGE(seed_shader, const float* iota) {
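
To see what the clamp buys at these conversion points, here is a scalar
sketch of the two helpers (hypothetical stand-ins for the SIMD round() and
to_unorm() above) fed an out-of-gamut value:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    // Scalar models of the vector helpers, for illustration only.
    static uint32_t round_(float v, float scale) { return (uint32_t)(v*scale + 0.5f); }
    static uint32_t to_unorm(float v, float scale, float bias = 1.0f) {
        return round_(std::min(std::max(0.0f, v), bias), scale);
    }

    int main() {
        float r = 1.25f;                    // an out-of-gamut value reaching a store stage
        assert(round_(r, 255.0f) == 319);   // old: 0x13F spills into the next channel's byte
                                            //      once OR'd into px in store_8888
        assert(to_unorm(r, 255) == 255);    // new: clamp to [0,1] first, then convert
        assert(to_unorm(-0.1f, 255) == 0);  // negative values clamp to 0
    }
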
@@ -486,18 +499,19 @@ STAGE(srcover_rgba_8888, const SkJumper_MemoryCtx* ctx) {
db = cast((dst >> 16) & 0xff);
da = cast((dst >> 24) );
// {dr,dg,db,da} are in [0,255]
// { r, g, b, a} are in [0, 1]
// { r, g, b, a} are in [0, 1] (but may be out of gamut)
r = mad(dr, inv(a), r*255.0f);
g = mad(dg, inv(a), g*255.0f);
b = mad(db, inv(a), b*255.0f);
a = mad(da, inv(a), a*255.0f);
// { r, g, b, a} are now in [0,255]
// { r, g, b, a} are now in [0,255] (but may be out of gamut)
dst = round(r, 1.0f)
| round(g, 1.0f) << 8
| round(b, 1.0f) << 16
| round(a, 1.0f) << 24;
// to_unorm() clamps back to gamut. Scaling by 1 since we're already 255-biased.
dst = to_unorm(r, 1, 255)
| to_unorm(g, 1, 255) << 8
| to_unorm(b, 1, 255) << 16
| to_unorm(a, 1, 255) << 24;
store(ptr, dst, tail);
}
@@ -510,18 +524,19 @@ STAGE(srcover_bgra_8888, const SkJumper_MemoryCtx* ctx) {
dr = cast((dst >> 16) & 0xff);
da = cast((dst >> 24) );
// {dr,dg,db,da} are in [0,255]
// { r, g, b, a} are in [0, 1]
// { r, g, b, a} are in [0, 1] (but may be out of gamut)
r = mad(dr, inv(a), r*255.0f);
g = mad(dg, inv(a), g*255.0f);
b = mad(db, inv(a), b*255.0f);
a = mad(da, inv(a), a*255.0f);
// { r, g, b, a} are now in [0,255]
// { r, g, b, a} are now in [0,255] (but may be out of gamut)
dst = round(b, 1.0f)
| round(g, 1.0f) << 8
| round(r, 1.0f) << 16
| round(a, 1.0f) << 24;
// to_unorm() clamps back to gamut. Scaling by 1 since we're already 255-biased.
dst = to_unorm(b, 1, 255)
| to_unorm(g, 1, 255) << 8
| to_unorm(r, 1, 255) << 16
| to_unorm(a, 1, 255) << 24;
store(ptr, dst, tail);
}
@@ -796,24 +811,24 @@ STAGE(byte_tables, const void* ctx) { // TODO: rename Tables SkJumper_ByteTable
struct Tables { const uint8_t *r, *g, *b, *a; };
auto tables = (const Tables*)ctx;
r = from_byte(gather(tables->r, round(r, 255.0f)));
g = from_byte(gather(tables->g, round(g, 255.0f)));
b = from_byte(gather(tables->b, round(b, 255.0f)));
a = from_byte(gather(tables->a, round(a, 255.0f)));
r = from_byte(gather(tables->r, to_unorm(r, 255)));
g = from_byte(gather(tables->g, to_unorm(g, 255)));
b = from_byte(gather(tables->b, to_unorm(b, 255)));
a = from_byte(gather(tables->a, to_unorm(a, 255)));
}
STAGE(byte_tables_rgb, const void* ctx) { // TODO: rename Tables SkJumper_ByteTablesRGBCtx
struct Tables { const uint8_t *r, *g, *b; int n; };
auto tables = (const Tables*)ctx;
F scale = tables->n - 1;
r = from_byte(gather(tables->r, round(r, scale)));
g = from_byte(gather(tables->g, round(g, scale)));
b = from_byte(gather(tables->b, round(b, scale)));
int scale = tables->n - 1;
r = from_byte(gather(tables->r, to_unorm(r, scale)));
g = from_byte(gather(tables->g, to_unorm(g, scale)));
b = from_byte(gather(tables->b, to_unorm(b, scale)));
}
SI F table(F v, const SkJumper_TableCtx* ctx) {
return gather(ctx->table, round(v, ctx->size - 1));
return gather(ctx->table, to_unorm(v, ctx->size - 1));
}
STAGE(table_r, const SkJumper_TableCtx* ctx) { r = table(r, ctx); }
STAGE(table_g, const SkJumper_TableCtx* ctx) { g = table(g, ctx); }
@@ -881,7 +896,7 @@ STAGE(gather_a8, const SkJumper_GatherCtx* ctx) {
STAGE(store_a8, const SkJumper_MemoryCtx* ctx) {
auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy);
U8 packed = pack(pack(round(a, 255.0f)));
U8 packed = pack(pack(to_unorm(a, 255)));
store(ptr, packed, tail);
}
@@ -925,9 +940,9 @@ STAGE(gather_565, const SkJumper_GatherCtx* ctx) {
STAGE(store_565, const SkJumper_MemoryCtx* ctx) {
auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
U16 px = pack( round(r, 31.0f) << 11
| round(g, 63.0f) << 5
| round(b, 31.0f) );
U16 px = pack( to_unorm(r, 31) << 11
| to_unorm(g, 63) << 5
| to_unorm(b, 31) );
store(ptr, px, tail);
}
@@ -946,10 +961,10 @@ STAGE(gather_4444, const SkJumper_GatherCtx* ctx) {
}
STAGE(store_4444, const SkJumper_MemoryCtx* ctx) {
auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
U16 px = pack( round(r, 15.0f) << 12
| round(g, 15.0f) << 8
| round(b, 15.0f) << 4
| round(a, 15.0f) );
U16 px = pack( to_unorm(r, 15) << 12
| to_unorm(g, 15) << 8
| to_unorm(b, 15) << 4
| to_unorm(a, 15) );
store(ptr, px, tail);
}
@@ -969,10 +984,10 @@ STAGE(gather_8888, const SkJumper_GatherCtx* ctx) {
STAGE(store_8888, const SkJumper_MemoryCtx* ctx) {
auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
U32 px = round(r, 255.0f)
| round(g, 255.0f) << 8
| round(b, 255.0f) << 16
| round(a, 255.0f) << 24;
U32 px = to_unorm(r, 255)
| to_unorm(g, 255) << 8
| to_unorm(b, 255) << 16
| to_unorm(a, 255) << 24;
store(ptr, px, tail);
}
@@ -992,10 +1007,10 @@ STAGE(gather_bgra, const SkJumper_GatherCtx* ctx) {
STAGE(store_bgra, const SkJumper_MemoryCtx* ctx) {
auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
U32 px = round(b, 255.0f)
| round(g, 255.0f) << 8
| round(r, 255.0f) << 16
| round(a, 255.0f) << 24;
U32 px = to_unorm(b, 255)
| to_unorm(g, 255) << 8
| to_unorm(r, 255) << 16
| to_unorm(a, 255) << 24;
store(ptr, px, tail);
}
@@ -1064,10 +1079,10 @@ STAGE(store_u16_be, const SkJumper_MemoryCtx* ctx) {
STAGE(store_u16_be, const SkJumper_MemoryCtx* ctx) {
auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,dy);
U16 R = bswap(pack(round(r, 65535.0f))),
G = bswap(pack(round(g, 65535.0f))),
B = bswap(pack(round(b, 65535.0f))),
A = bswap(pack(round(a, 65535.0f)));
U16 R = bswap(pack(to_unorm(r, 65535))),
G = bswap(pack(to_unorm(g, 65535))),
B = bswap(pack(to_unorm(b, 65535))),
A = bswap(pack(to_unorm(a, 65535)));
store4(ptr,tail, R,G,B,A);
}