Reland "Add ConvertPixels versions of PremulAlphaRoundTrip"

This is a reland of b61804e94c

Original change's description:
> Add ConvertPixels versions of PremulAlphaRoundTrip
>
> Prior to the force_highp trick, the GrConvertPixels version failed, just
> like the GPU would do if we disabled the canvas2D fast path. With the
> highp trick, all tests pass.
>
> Bug: skia:12592
> Change-Id: I63ad2fd3b67863b6a736316e7c7b3b9bd2ee8970
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/467516
> Commit-Queue: Brian Osman <brianosman@google.com>
> Reviewed-by: Brian Salomon <bsalomon@google.com>

Bug: skia:12592
Change-Id: I4c83c8d20959ab13cc493748a22fff5133706a77
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/468458
Reviewed-by: Brian Salomon <bsalomon@google.com>
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Brian Osman <brianosman@google.com>
2021-11-03 15:19:28 -04:00 · 2021-11-03 15:19:28 -04:00 · 1b61d866a6
commit 1b61d866a6
parent fa26a656cf
2 changed files with 160 additions and 13 deletions
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@ -3076,16 +3076,46 @@ static void start_pipeline(const size_t x0,     const size_t y0,

 // ~~~~~~ Commonly used helper functions ~~~~~~ //

+/**
+ * Helpers to do properly rounded division (by 255). The ideal answer we want to compute is slow,
+ * thanks to a division by a non-power of two:
+ *   [1]  (v + 127) / 255
+ *
+ * There is a two-step process that computes the correct answer for all inputs:
+ *   [2]  (v + 128 + ((v + 128) >> 8)) >> 8
+ *
+ * There is also a single-iteration approximation, but it's wrong (+-1) ~25% of the time:
+ *   [3]  (v + 255) >> 8
+ *
+ * We offer two different implementations here, depending on the requirements of the calling stage.
+ */
+
+/**
+ * div255 favors speed over accuracy. It uses formula [2] on NEON (where we can compute it as fast
+ * as [3]), and uses [3] elsewhere.
+ */
 SI U16 div255(U16 v) {
-#if 0
-    return (v+127)/255;  // The ideal rounding divide by 255.
-#elif 1 && defined(JUMPER_IS_NEON)
-    // With NEON we can compute (v+127)/255 as (v + ((v+128)>>8) + 128)>>8
-    // just as fast as we can do the approximation below, so might as well be correct!
-    // First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up.
+#if defined(JUMPER_IS_NEON)
+    // With NEON we can compute [2] just as fast as [3], so let's be correct.
+    // First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up:
    return vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8);
 #else
-    return (v+255)/256;  // A good approximation of (v+127)/255.
+    // Otherwise, use [3], which is never wrong by more than 1:
+    return (v+255)/256;
+#endif
+}
+
+/**
+ * div255_accurate guarantees the right answer on all platforms, at the expense of performance.
+ */
+SI U16 div255_accurate(U16 v) {
+#if defined(JUMPER_IS_NEON)
+    // Our NEON implementation of div255 is already correct for all inputs:
+    return div255(v);
+#else
+    // This is [2] (the same formulation as NEON), but written without the benefit of intrinsics:
+    v += 128;
+    return (v+(v/256))/256;
 #endif
 }

@ -3315,14 +3345,14 @@ STAGE_PP(clamp_gamut, Ctx::None) {
 }

 STAGE_PP(premul, Ctx::None) {
-    r = div255(r * a);
-    g = div255(g * a);
-    b = div255(b * a);
+    r = div255_accurate(r * a);
+    g = div255_accurate(g * a);
+    b = div255_accurate(b * a);
 }
 STAGE_PP(premul_dst, Ctx::None) {
-    dr = div255(dr * da);
-    dg = div255(dg * da);
-    db = div255(db * da);
+    dr = div255_accurate(dr * da);
+    dg = div255_accurate(dg * da);
+    db = div255_accurate(db * da);
 }

 STAGE_PP(force_opaque    , Ctx::None) {  a = 255; }
--- a/tests/PremulAlphaRoundTripTest.cpp
+++ b/tests/PremulAlphaRoundTripTest.cpp
@ -9,6 +9,8 @@
 #include "include/core/SkCanvas.h"
 #include "include/core/SkSurface.h"
 #include "include/gpu/GrDirectContext.h"
+#include "src/core/SkConvertPixels.h"
+#include "src/gpu/GrPixmap.h"
 #include "tests/Test.h"
 #include "tools/ToolUtils.h"

@ -105,3 +107,118 @@ DEF_GPUTEST_FOR_RENDERING_CONTEXTS(PremulAlphaRoundTrip_Gpu, reporter, ctxInfo)
                                                      SkBudgeted::kNo, info));
    test_premul_alpha_roundtrip(reporter, surf.get());
 }
+
+DEF_TEST(PremulAlphaRoundTripGrConvertPixels, reporter) {
+    // Code that does the same thing as above, but using GrConvertPixels. This simulates what
+    // happens if you run the above on a machine with a GPU that doesn't have a valid PM/UPM
+    // conversion pair of FPs.
+    const SkImageInfo upmInfo =
+            SkImageInfo::Make(256, 256, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
+    const SkImageInfo pmInfo =
+            SkImageInfo::Make(256, 256, kRGBA_8888_SkColorType, kPremul_SkAlphaType);
+
+    GrPixmap src = GrPixmap::Allocate(upmInfo);
+    uint32_t* srcPixels = (uint32_t*)src.addr();
+    for (int y = 0; y < 256; ++y) {
+        for (int x = 0; x < 256; ++x) {
+            srcPixels[y * 256 + x] = pack_unpremul_rgba(SkColorSetARGB(y, x, x, x));
+        }
+    }
+
+    GrPixmap surf = GrPixmap::Allocate(pmInfo);
+    GrConvertPixels(surf, src);
+
+    GrPixmap read1 = GrPixmap::Allocate(upmInfo);
+    GrConvertPixels(read1, surf);
+
+    GrPixmap surf2 = GrPixmap::Allocate(pmInfo);
+    GrConvertPixels(surf2, read1);
+
+    GrPixmap read2 = GrPixmap::Allocate(upmInfo);
+    GrConvertPixels(read2, surf2);
+
+    auto get_pixel = [](const GrPixmap& pm, int x, int y) {
+        const uint32_t* addr = (const uint32_t*)pm.addr();
+        return addr[y * 256 + x];
+    };
+    auto dump_pixel_history = [&](int x, int y) {
+        SkDebugf("Pixel history for (%d, %d):\n", x, y);
+        SkDebugf("Src : %08x\n", get_pixel(src, x, y));
+        SkDebugf(" -> : %08x\n", get_pixel(surf, x, y));
+        SkDebugf(" <- : %08x\n", get_pixel(read1, x, y));
+        SkDebugf(" -> : %08x\n", get_pixel(surf2, x, y));
+        SkDebugf(" <- : %08x\n", get_pixel(read2, x, y));
+    };
+
+    bool success = true;
+    for (int y = 0; y < 256 && success; ++y) {
+        const uint32_t* pixels1 = (const uint32_t*) read1.addr();
+        const uint32_t* pixels2 = (const uint32_t*) read2.addr();
+        for (int x = 0; x < 256 && success; ++x) {
+            uint32_t c1 = pixels1[y * 256 + x],
+                     c2 = pixels2[y * 256 + x];
+            // If this ever fails, it's helpful to see where it goes wrong.
+            if (c1 != c2) {
+                dump_pixel_history(x, y);
+            }
+            REPORTER_ASSERT(reporter, success = c1 == c2);
+        }
+    }
+}
+
+DEF_TEST(PremulAlphaRoundTripSkConvertPixels, reporter) {
+    // ... and now using SkConvertPixels, just for completeness
+    const SkImageInfo upmInfo =
+            SkImageInfo::Make(256, 256, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
+    const SkImageInfo pmInfo =
+            SkImageInfo::Make(256, 256, kRGBA_8888_SkColorType, kPremul_SkAlphaType);
+
+    SkBitmap src; src.allocPixels(upmInfo);
+    uint32_t* srcPixels = src.getAddr32(0, 0);
+    for (int y = 0; y < 256; ++y) {
+        for (int x = 0; x < 256; ++x) {
+            srcPixels[y * 256 + x] = pack_unpremul_rgba(SkColorSetARGB(y, x, x, x));
+        }
+    }
+
+    auto convert = [](const SkBitmap& dst, const SkBitmap& src){
+        SkAssertResult(SkConvertPixels(dst.info(), dst.getAddr(0, 0), dst.rowBytes(),
+                                       src.info(), src.getAddr(0, 0), src.rowBytes()));
+    };
+
+    SkBitmap surf; surf.allocPixels(pmInfo);
+    convert(surf, src);
+
+    SkBitmap read1; read1.allocPixels(upmInfo);
+    convert(read1, surf);
+
+    SkBitmap surf2; surf2.allocPixels(pmInfo);
+    convert(surf2, read1);
+
+    SkBitmap read2; read2.allocPixels(upmInfo);
+    convert(read2, surf2);
+
+    auto dump_pixel_history = [&](int x, int y) {
+        SkDebugf("Pixel history for (%d, %d):\n", x, y);
+        SkDebugf("Src : %08x\n", *src.getAddr32(x, y));
+        SkDebugf(" -> : %08x\n", *surf.getAddr32(x, y));
+        SkDebugf(" <- : %08x\n", *read1.getAddr32(x, y));
+        SkDebugf(" -> : %08x\n", *surf2.getAddr32(x, y));
+        SkDebugf(" <- : %08x\n", *read2.getAddr32(x, y));
+    };
+
+    bool success = true;
+    for (int y = 0; y < 256 && success; ++y) {
+        const uint32_t* pixels1 = read1.getAddr32(0, 0);
+        const uint32_t* pixels2 = read2.getAddr32(0, 0);
+        for (int x = 0; x < 256 && success; ++x) {
+            uint32_t c1 = pixels1[y * 256 + x],
+                     c2 = pixels2[y * 256 + x];
+            // If this ever fails, it's helpful to see where it goes wrong.
+            if (c1 != c2) {
+                dump_pixel_history(x, y);
+            }
+            REPORTER_ASSERT(reporter, success = c1 == c2);
+        }
+    }
+}