diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h index 4ba3f772e8..7948143a4c 100644 --- a/src/opts/SkRasterPipeline_opts.h +++ b/src/opts/SkRasterPipeline_opts.h @@ -3076,16 +3076,46 @@ static void start_pipeline(const size_t x0, const size_t y0, // ~~~~~~ Commonly used helper functions ~~~~~~ // +/** + * Helpers to to properly rounded division (by 255). The ideal answer we want to compute is slow, + * thanks to a division by a non-power of two: + * [1] (v + 127) / 255 + * + * There is a two-step process that computes the correct answer for all inputs: + * [2] (v + 128 + ((v + 128) >> 8)) >> 8 + * + * There is also a single iteration approximation, but it's wrong (+-1) ~25% of the time: + * [3] (v + 255) >> 8; + * + * We offer two different implementations here, depending on the requirements of the calling stage. + */ + +/** + * div255 favors speed over accuracy. It uses formula [2] on NEON (where we can compute it as fast + * as [3]), and uses [3] elsewhere. + */ SI U16 div255(U16 v) { -#if 0 - return (v+127)/255; // The ideal rounding divide by 255. -#elif 1 && defined(JUMPER_IS_NEON) - // With NEON we can compute (v+127)/255 as (v + ((v+128)>>8) + 128)>>8 - // just as fast as we can do the approximation below, so might as well be correct! - // First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up. +#if defined(JUMPER_IS_NEON) + // With NEON we can compute [2] just as fast as [3], so let's be correct. + // First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up: return vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8); #else - return (v+255)/256; // A good approximation of (v+127)/255. + // Otherwise, use [3], which is never wrong by more than 1: + return (v+255)/256; +#endif +} + +/** + * div255_accurate guarantees the right answer on all platforms, at the expense of performance. + */ +SI U16 div255_accurate(U16 v) { +#if defined(JUMPER_IS_NEON) + // Our NEON implementation of div255 is already correct for all inputs: + return div255(v); +#else + // This is [2] (the same formulation as NEON), but written without the benefit of intrinsics: + v += 128; + return (v+(v/256))/256; #endif } @@ -3315,14 +3345,14 @@ STAGE_PP(clamp_gamut, Ctx::None) { } STAGE_PP(premul, Ctx::None) { - r = div255(r * a); - g = div255(g * a); - b = div255(b * a); + r = div255_accurate(r * a); + g = div255_accurate(g * a); + b = div255_accurate(b * a); } STAGE_PP(premul_dst, Ctx::None) { - dr = div255(dr * da); - dg = div255(dg * da); - db = div255(db * da); + dr = div255_accurate(dr * da); + dg = div255_accurate(dg * da); + db = div255_accurate(db * da); } STAGE_PP(force_opaque , Ctx::None) { a = 255; } diff --git a/tests/PremulAlphaRoundTripTest.cpp b/tests/PremulAlphaRoundTripTest.cpp index 38ec104801..61542f527f 100644 --- a/tests/PremulAlphaRoundTripTest.cpp +++ b/tests/PremulAlphaRoundTripTest.cpp @@ -9,6 +9,8 @@ #include "include/core/SkCanvas.h" #include "include/core/SkSurface.h" #include "include/gpu/GrDirectContext.h" +#include "src/core/SkConvertPixels.h" +#include "src/gpu/GrPixmap.h" #include "tests/Test.h" #include "tools/ToolUtils.h" @@ -105,3 +107,118 @@ DEF_GPUTEST_FOR_RENDERING_CONTEXTS(PremulAlphaRoundTrip_Gpu, reporter, ctxInfo) SkBudgeted::kNo, info)); test_premul_alpha_roundtrip(reporter, surf.get()); } + +DEF_TEST(PremulAlphaRoundTripGrConvertPixels, reporter) { + // Code that does the same thing as above, but using GrConvertPixels. This simulates what + // happens if you run the above on a machine with a GPU that doesn't have a valid PM/UPM + // conversion pair of FPs. + const SkImageInfo upmInfo = + SkImageInfo::Make(256, 256, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType); + const SkImageInfo pmInfo = + SkImageInfo::Make(256, 256, kRGBA_8888_SkColorType, kPremul_SkAlphaType); + + GrPixmap src = GrPixmap::Allocate(upmInfo); + uint32_t* srcPixels = (uint32_t*)src.addr(); + for (int y = 0; y < 256; ++y) { + for (int x = 0; x < 256; ++x) { + srcPixels[y * 256 + x] = pack_unpremul_rgba(SkColorSetARGB(y, x, x, x)); + } + } + + GrPixmap surf = GrPixmap::Allocate(pmInfo); + GrConvertPixels(surf, src); + + GrPixmap read1 = GrPixmap::Allocate(upmInfo); + GrConvertPixels(read1, surf); + + GrPixmap surf2 = GrPixmap::Allocate(pmInfo); + GrConvertPixels(surf2, read1); + + GrPixmap read2 = GrPixmap::Allocate(upmInfo); + GrConvertPixels(read2, surf2); + + auto get_pixel = [](const GrPixmap& pm, int x, int y) { + const uint32_t* addr = (const uint32_t*)pm.addr(); + return addr[y * 256 + x]; + }; + auto dump_pixel_history = [&](int x, int y) { + SkDebugf("Pixel history for (%d, %d):\n", x, y); + SkDebugf("Src : %08x\n", get_pixel(src, x, y)); + SkDebugf(" -> : %08x\n", get_pixel(surf, x, y)); + SkDebugf(" <- : %08x\n", get_pixel(read1, x, y)); + SkDebugf(" -> : %08x\n", get_pixel(surf2, x, y)); + SkDebugf(" <- : %08x\n", get_pixel(read2, x, y)); + }; + + bool success = true; + for (int y = 0; y < 256 && success; ++y) { + const uint32_t* pixels1 = (const uint32_t*) read1.addr(); + const uint32_t* pixels2 = (const uint32_t*) read2.addr(); + for (int x = 0; x < 256 && success; ++x) { + uint32_t c1 = pixels1[y * 256 + x], + c2 = pixels2[y * 256 + x]; + // If this ever fails, it's helpful to see where it goes wrong. + if (c1 != c2) { + dump_pixel_history(x, y); + } + REPORTER_ASSERT(reporter, success = c1 == c2); + } + } +} + +DEF_TEST(PremulAlphaRoundTripSkConvertPixels, reporter) { + // ... and now using SkConvertPixels, just for completeness + const SkImageInfo upmInfo = + SkImageInfo::Make(256, 256, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType); + const SkImageInfo pmInfo = + SkImageInfo::Make(256, 256, kRGBA_8888_SkColorType, kPremul_SkAlphaType); + + SkBitmap src; src.allocPixels(upmInfo); + uint32_t* srcPixels = src.getAddr32(0, 0); + for (int y = 0; y < 256; ++y) { + for (int x = 0; x < 256; ++x) { + srcPixels[y * 256 + x] = pack_unpremul_rgba(SkColorSetARGB(y, x, x, x)); + } + } + + auto convert = [](const SkBitmap& dst, const SkBitmap& src){ + SkAssertResult(SkConvertPixels(dst.info(), dst.getAddr(0, 0), dst.rowBytes(), + src.info(), src.getAddr(0, 0), src.rowBytes())); + }; + + SkBitmap surf; surf.allocPixels(pmInfo); + convert(surf, src); + + SkBitmap read1; read1.allocPixels(upmInfo); + convert(read1, surf); + + SkBitmap surf2; surf2.allocPixels(pmInfo); + convert(surf2, read1); + + SkBitmap read2; read2.allocPixels(upmInfo); + convert(read2, surf2); + + auto dump_pixel_history = [&](int x, int y) { + SkDebugf("Pixel history for (%d, %d):\n", x, y); + SkDebugf("Src : %08x\n", *src.getAddr32(x, y)); + SkDebugf(" -> : %08x\n", *surf.getAddr32(x, y)); + SkDebugf(" <- : %08x\n", *read1.getAddr32(x, y)); + SkDebugf(" -> : %08x\n", *surf2.getAddr32(x, y)); + SkDebugf(" <- : %08x\n", *read2.getAddr32(x, y)); + }; + + bool success = true; + for (int y = 0; y < 256 && success; ++y) { + const uint32_t* pixels1 = read1.getAddr32(0, 0); + const uint32_t* pixels2 = read2.getAddr32(0, 0); + for (int x = 0; x < 256 && success; ++x) { + uint32_t c1 = pixels1[y * 256 + x], + c2 = pixels2[y * 256 + x]; + // If this ever fails, it's helpful to see where it goes wrong. + if (c1 != c2) { + dump_pixel_history(x, y); + } + REPORTER_ASSERT(reporter, success = c1 == c2); + } + } +}