Reland "Add ConvertPixels versions of PremulAlphaRoundTrip"

This is a reland of b61804e94c

Original change's description:
> Add ConvertPixels versions of PremulAlphaRoundTrip
>
> Prior to the force_highp trick, the GrConvertPixels version failed, just
> like the GPU would do if we disabled the canvas2D fast path. With the
> highp trick, all tests pass.
>
> Bug: skia:12592
> Change-Id: I63ad2fd3b67863b6a736316e7c7b3b9bd2ee8970
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/467516
> Commit-Queue: Brian Osman <brianosman@google.com>
> Reviewed-by: Brian Salomon <bsalomon@google.com>

Bug: skia:12592
Change-Id: I4c83c8d20959ab13cc493748a22fff5133706a77
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/468458
Reviewed-by: Brian Salomon <bsalomon@google.com>
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Brian Osman <brianosman@google.com>
This commit is contained in:
Brian Osman 2021-11-03 15:19:28 -04:00 committed by SkCQ
parent fa26a656cf
commit 1b61d866a6
2 changed files with 160 additions and 13 deletions

View File

@ -3076,16 +3076,46 @@ static void start_pipeline(const size_t x0, const size_t y0,
// ~~~~~~ Commonly used helper functions ~~~~~~ //
/**
* Helpers to do properly rounded division (by 255). The ideal answer we want to compute is slow,
* thanks to a division by a non-power of two:
* [1] (v + 127) / 255
*
* There is a two-step process that computes the correct answer for all inputs:
* [2] (v + 128 + ((v + 128) >> 8)) >> 8
*
* There is also a single iteration approximation, but it's wrong (+-1) ~25% of the time:
* [3] (v + 255) >> 8;
*
* We offer two different implementations here, depending on the requirements of the calling stage.
*/
/**
* div255 favors speed over accuracy. It uses formula [2] on NEON (where we can compute it as fast
* as [3]), and uses [3] elsewhere.
*/
// NOTE(review): this span appears to come from a rendered diff that interleaves removed and
// added lines without +/- markers: there are two NEON conditionals and two fallback returns,
// and the #if/#endif pairs do not balance as shown. Presumably the "#if 0 / #elif 1 && ..."
// lines and the first "(v+255)/256" return are the removed version — TODO confirm against the
// real source file before editing the code itself.
SI U16 div255(U16 v) {
#if 0
return (v+127)/255; // The ideal rounding divide by 255.
#elif 1 && defined(JUMPER_IS_NEON)
// With NEON we can compute (v+127)/255 as (v + ((v+128)>>8) + 128)>>8
// just as fast as we can do the approximation below, so might as well be correct!
// First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up.
#if defined(JUMPER_IS_NEON)
// With NEON we can compute [2] just as fast as [3], so let's be correct.
// First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up:
// (vrsraq_n_u16 is the rounding-shift-right-and-accumulate step, vrshrq_n_u16 the final
// rounding shift right.)
return vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8);
#else
return (v+255)/256; // A good approximation of (v+127)/255.
// Otherwise, use [3], which is never wrong by more than 1:
return (v+255)/256;
#endif
}
/**
 * div255_accurate trades speed for correctness: it yields the properly rounded v/255 on every
 * platform, not just on NEON.
 */
SI U16 div255_accurate(U16 v) {
#if !defined(JUMPER_IS_NEON)
    // Portable spelling of the two-step formula [2] that the NEON path gets from intrinsics:
    // bias by 128, fold the high byte back in, then drop the low byte.
    v += 128;
    const U16 highByte = v/256;
    return (v + highByte)/256;
#else
    // On NEON, div255 already returns the exact answer for all inputs, so just forward to it.
    return div255(v);
#endif
}
@ -3315,14 +3345,14 @@ STAGE_PP(clamp_gamut, Ctx::None) {
}
// Scales each source color lane (r, g, b) by the source alpha lane a, dividing the product by
// 255 through one of the div255 helpers.
// NOTE(review): both the div255 and div255_accurate assignments are present; presumably the
// first three are the lines removed by this change and the last three their replacements —
// TODO confirm against the real source file.
STAGE_PP(premul, Ctx::None) {
r = div255(r * a);
g = div255(g * a);
b = div255(b * a);
r = div255_accurate(r * a);
g = div255_accurate(g * a);
b = div255_accurate(b * a);
}
// Scales each destination color lane (dr, dg, db) by the destination alpha lane da, dividing
// the product by 255 through one of the div255 helpers.
// NOTE(review): both the div255 and div255_accurate assignments are present; presumably the
// first three are the lines removed by this change and the last three their replacements —
// TODO confirm against the real source file.
STAGE_PP(premul_dst, Ctx::None) {
dr = div255(dr * da);
dg = div255(dg * da);
db = div255(db * da);
dr = div255_accurate(dr * da);
dg = div255_accurate(dg * da);
db = div255_accurate(db * da);
}
// Overwrites the alpha lane with 255; no other lane is assigned.
STAGE_PP(force_opaque, Ctx::None) {
    a = 255;
}

View File

@ -9,6 +9,8 @@
#include "include/core/SkCanvas.h"
#include "include/core/SkSurface.h"
#include "include/gpu/GrDirectContext.h"
#include "src/core/SkConvertPixels.h"
#include "src/gpu/GrPixmap.h"
#include "tests/Test.h"
#include "tools/ToolUtils.h"
@ -105,3 +107,118 @@ DEF_GPUTEST_FOR_RENDERING_CONTEXTS(PremulAlphaRoundTrip_Gpu, reporter, ctxInfo)
SkBudgeted::kNo, info));
test_premul_alpha_roundtrip(reporter, surf.get());
}
DEF_TEST(PremulAlphaRoundTripGrConvertPixels, reporter) {
    // Performs the same round trip as the tests above, but does every conversion with
    // GrConvertPixels. This simulates running them on a machine whose GPU doesn't have a valid
    // PM/UPM conversion pair of FPs.
    static constexpr int kDim = 256;
    const SkImageInfo pmInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kPremul_SkAlphaType);
    const SkImageInfo upmInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);

    // Unpremul source: the row index is passed as the alpha argument and the column index as
    // all three color arguments.
    GrPixmap src = GrPixmap::Allocate(upmInfo);
    uint32_t* nextSrcPixel = static_cast<uint32_t*>(src.addr());
    for (int row = 0; row < kDim; ++row) {
        for (int col = 0; col < kDim; ++col) {
            *nextSrcPixel++ = pack_unpremul_rgba(SkColorSetARGB(row, col, col, col));
        }
    }

    // Allocates a pixmap described by `info` and fills it by converting `from`.
    auto converted = [](const SkImageInfo& info, const GrPixmap& from) {
        GrPixmap to = GrPixmap::Allocate(info);
        GrConvertPixels(to, from);
        return to;
    };

    // unpremul -> premul -> unpremul, twice over.
    GrPixmap surf  = converted(pmInfo, src);
    GrPixmap read1 = converted(upmInfo, surf);
    GrPixmap surf2 = converted(pmInfo, read1);
    GrPixmap read2 = converted(upmInfo, surf2);

    auto get_pixel = [](const GrPixmap& pm, int x, int y) {
        return static_cast<const uint32_t*>(pm.addr())[y * kDim + x];
    };

    // Prints the value one pixel had at every stage of the round trip.
    auto dump_pixel_history = [&](int x, int y) {
        SkDebugf("Pixel history for (%d, %d):\n", x, y);
        SkDebugf("Src : %08x\n", get_pixel(src, x, y));
        SkDebugf(" -> : %08x\n", get_pixel(surf, x, y));
        SkDebugf(" <- : %08x\n", get_pixel(read1, x, y));
        SkDebugf(" -> : %08x\n", get_pixel(surf2, x, y));
        SkDebugf(" <- : %08x\n", get_pixel(read2, x, y));
    };

    // The two unpremul read-backs must match exactly. Walk the pixels in row-major order and
    // stop at the first mismatch.
    const uint32_t* pixels1 = static_cast<const uint32_t*>(read1.addr());
    const uint32_t* pixels2 = static_cast<const uint32_t*>(read2.addr());
    bool success = true;
    for (int i = 0; i < kDim * kDim && success; ++i) {
        const uint32_t c1 = pixels1[i];
        const uint32_t c2 = pixels2[i];
        // If this ever fails, it's helpful to see where it goes wrong.
        if (c1 != c2) {
            dump_pixel_history(i % kDim, i / kDim);
        }
        REPORTER_ASSERT(reporter, success = c1 == c2);
    }
}
DEF_TEST(PremulAlphaRoundTripSkConvertPixels, reporter) {
    // The same unpremul/premul round trip, this time driven by SkConvertPixels, just for
    // completeness.
    static constexpr int kDim = 256;
    const SkImageInfo pmInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kPremul_SkAlphaType);
    const SkImageInfo upmInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);

    // Unpremul source: the row index is passed as the alpha argument and the column index as
    // all three color arguments.
    SkBitmap src;
    src.allocPixels(upmInfo);
    uint32_t* nextSrcPixel = src.getAddr32(0, 0);
    for (int row = 0; row < kDim; ++row) {
        for (int col = 0; col < kDim; ++col) {
            *nextSrcPixel++ = pack_unpremul_rgba(SkColorSetARGB(row, col, col, col));
        }
    }

    auto convert = [](const SkBitmap& dst, const SkBitmap& src){
        SkAssertResult(SkConvertPixels(dst.info(), dst.getAddr(0, 0), dst.rowBytes(),
                                       src.info(), src.getAddr(0, 0), src.rowBytes()));
    };

    // Allocates a bitmap described by `info` and fills it by converting `from`.
    auto converted = [&convert](const SkImageInfo& info, const SkBitmap& from) {
        SkBitmap to;
        to.allocPixels(info);
        convert(to, from);
        return to;
    };

    // unpremul -> premul -> unpremul, twice over.
    SkBitmap surf  = converted(pmInfo, src);
    SkBitmap read1 = converted(upmInfo, surf);
    SkBitmap surf2 = converted(pmInfo, read1);
    SkBitmap read2 = converted(upmInfo, surf2);

    // Prints the value one pixel had at every stage of the round trip.
    auto dump_pixel_history = [&](int x, int y) {
        SkDebugf("Pixel history for (%d, %d):\n", x, y);
        SkDebugf("Src : %08x\n", *src.getAddr32(x, y));
        SkDebugf(" -> : %08x\n", *surf.getAddr32(x, y));
        SkDebugf(" <- : %08x\n", *read1.getAddr32(x, y));
        SkDebugf(" -> : %08x\n", *surf2.getAddr32(x, y));
        SkDebugf(" <- : %08x\n", *read2.getAddr32(x, y));
    };

    // The two unpremul read-backs must match exactly. Walk the pixels in row-major order and
    // stop at the first mismatch.
    const uint32_t* pixels1 = read1.getAddr32(0, 0);
    const uint32_t* pixels2 = read2.getAddr32(0, 0);
    bool success = true;
    for (int i = 0; i < kDim * kDim && success; ++i) {
        const uint32_t c1 = pixels1[i];
        const uint32_t c2 = pixels2[i];
        // If this ever fails, it's helpful to see where it goes wrong.
        if (c1 != c2) {
            dump_pixel_history(i % kDim, i / kDim);
        }
        REPORTER_ASSERT(reporter, success = c1 == c2);
    }
}