Refactor swizzle names and types.

  - Plant a flag to say "pretend all the inputs are RGBA".
    This is how libpng thinks.
    This is the opposite of what the implementation had been doing,
    so I've rearranged everything to reflect the new orientation.

  - Rewrite the names to be less mysterious looking.  No more Xs.

  - Make the src type uniformly const void*, to allow for 888 (RGB) srcs.

This should be performance and pixel neutral.  (Please revert if it's not.)

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1626463002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review URL: https://codereview.chromium.org/1626463002
This commit is contained in:
mtklein 2016-01-22 07:42:53 -08:00 committed by Commit bot
parent 6c9cd55f00
commit 8bf7b79cf9
8 changed files with 112 additions and 107 deletions

View File

@ -10,7 +10,7 @@
class SwizzleBench : public Benchmark {
public:
SwizzleBench(const char* name, SkOpts::Swizzle_8888_8888 fn) : fName(name), fFn(fn) {}
SwizzleBench(const char* name, SkOpts::Swizzle_8888 fn) : fName(name), fFn(fn) {}
bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
const char* onGetName() override { return fName; }
@ -23,10 +23,10 @@ public:
}
private:
const char* fName;
SkOpts::Swizzle_8888_8888 fFn;
SkOpts::Swizzle_8888 fFn;
};
DEF_BENCH(return new SwizzleBench("SkOpts::premul_xxxa", SkOpts::premul_xxxa));
DEF_BENCH(return new SwizzleBench("SkOpts::swaprb_xxxa", SkOpts::swaprb_xxxa));
DEF_BENCH(return new SwizzleBench("SkOpts::premul_swaprb_xxxa", SkOpts::premul_swaprb_xxxa));
DEF_BENCH(return new SwizzleBench("SkOpts::RGBA_to_rgbA", SkOpts::RGBA_to_rgbA));
DEF_BENCH(return new SwizzleBench("SkOpts::RGBA_to_bgrA", SkOpts::RGBA_to_bgrA));
DEF_BENCH(return new SwizzleBench("SkOpts::RGBA_to_BGRA", SkOpts::RGBA_to_BGRA));

View File

@ -333,7 +333,7 @@ static void fast_swizzle_bgra_to_n32_unpremul(
// These swizzles trust that the alpha value is already 0xFF.
#ifdef SK_PMCOLOR_IS_RGBA
SkOpts::swaprb_xxxa((uint32_t*) dst, (const uint32_t*) (src + offset), width);
SkOpts::RGBA_to_BGRA((uint32_t*) dst, src + offset, width);
#else
memcpy(dst, src + offset, width * bpp);
#endif
@ -361,9 +361,9 @@ static void fast_swizzle_bgra_to_n32_premul(
SkASSERT(deltaSrc == bpp);
#ifdef SK_PMCOLOR_IS_RGBA
SkOpts::premul_swaprb_xxxa((uint32_t*) dst, (const uint32_t*) (src + offset), width);
SkOpts::RGBA_to_bgrA((uint32_t*) dst, src + offset, width);
#else
SkOpts::premul_xxxa((uint32_t*) dst, (const uint32_t*) (src + offset), width);
SkOpts::RGBA_to_rgbA((uint32_t*) dst, src + offset, width);
#endif
}
@ -419,9 +419,9 @@ static void fast_swizzle_rgba_to_n32_premul(
SkASSERT(deltaSrc == bpp);
#ifdef SK_PMCOLOR_IS_RGBA
SkOpts::premul_xxxa((uint32_t*) dst, (const uint32_t*) (src + offset), width);
SkOpts::RGBA_to_rgbA((uint32_t*) dst, src + offset, width);
#else
SkOpts::premul_swaprb_xxxa((uint32_t*) dst, (const uint32_t*) (src + offset), width);
SkOpts::RGBA_to_bgrA((uint32_t*) dst, src + offset, width);
#endif
}
@ -450,7 +450,7 @@ static void fast_swizzle_rgba_to_n32_unpremul(
#ifdef SK_PMCOLOR_IS_RGBA
memcpy(dst, src + offset, width * bpp);
#else
SkOpts::swaprb_xxxa((uint32_t*) dst, (const uint32_t*) (src + offset), width);
SkOpts::RGBA_to_BGRA((uint32_t*) dst, src + offset, width);
#endif
}

View File

@ -81,9 +81,9 @@ namespace SkOpts {
decltype(matrix_scale_translate) matrix_scale_translate = sk_default::matrix_scale_translate;
decltype(matrix_affine) matrix_affine = sk_default::matrix_affine;
decltype( premul_xxxa) premul_xxxa = sk_default:: premul_xxxa;
decltype( swaprb_xxxa) swaprb_xxxa = sk_default:: swaprb_xxxa;
decltype(premul_swaprb_xxxa) premul_swaprb_xxxa = sk_default::premul_swaprb_xxxa;
decltype(RGBA_to_BGRA) RGBA_to_BGRA = sk_default::RGBA_to_BGRA;
decltype(RGBA_to_rgbA) RGBA_to_rgbA = sk_default::RGBA_to_rgbA;
decltype(RGBA_to_bgrA) RGBA_to_bgrA = sk_default::RGBA_to_bgrA;
// Each Init_foo() is defined in src/opts/SkOpts_foo.cpp.
void Init_ssse3();

View File

@ -58,10 +58,11 @@ namespace SkOpts {
extern SkMatrix::MapPtsProc matrix_translate, matrix_scale_translate, matrix_affine;
typedef void (*Swizzle_8888_8888)(uint32_t[], const uint32_t[], int);
extern Swizzle_8888_8888 premul_xxxa, // BGRA -> bgrA or RGBA -> rgbA
swaprb_xxxa, // BGRA -> RGBA or RGBA -> BGRA
premul_swaprb_xxxa; // BGRA -> rgbA or RGBA -> bgrA
// Swizzle input into some sort of 8888 pixel, {premul,unpremul} x {rgba,bgra}.
typedef void (*Swizzle_8888)(uint32_t*, const void*, int);
extern Swizzle_8888 RGBA_to_BGRA, // i.e. just swap RB
RGBA_to_rgbA, // i.e. just premultiply
RGBA_to_bgrA; // i.e. swap RB and premultiply
}
#endif//SkOpts_DEFINED

View File

@ -49,8 +49,8 @@ namespace SkOpts {
matrix_scale_translate = sk_neon::matrix_scale_translate;
matrix_affine = sk_neon::matrix_affine;
premul_xxxa = sk_neon::premul_xxxa;
premul_swaprb_xxxa = sk_neon::premul_swaprb_xxxa;
swaprb_xxxa = sk_neon::swaprb_xxxa;
RGBA_to_BGRA = sk_neon::RGBA_to_BGRA;
RGBA_to_rgbA = sk_neon::RGBA_to_rgbA;
RGBA_to_bgrA = sk_neon::RGBA_to_bgrA;
}
}

View File

@ -18,8 +18,8 @@ namespace SkOpts {
blit_mask_d32_a8 = sk_ssse3::blit_mask_d32_a8;
color_cube_filter_span = sk_ssse3::color_cube_filter_span;
premul_xxxa = sk_ssse3::premul_xxxa;
premul_swaprb_xxxa = sk_ssse3::premul_swaprb_xxxa;
swaprb_xxxa = sk_ssse3::swaprb_xxxa;
RGBA_to_BGRA = sk_ssse3::RGBA_to_BGRA;
RGBA_to_rgbA = sk_ssse3::RGBA_to_rgbA;
RGBA_to_bgrA = sk_ssse3::RGBA_to_bgrA;
}
}

View File

@ -12,18 +12,33 @@
namespace SK_OPTS_NS {
// The variable names in these functions pretend the input is RGBA.
// They work fine with both RGBA and BGRA.
static void premul_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) {
static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
auto src = (const uint32_t*)vsrc;
for (int i = 0; i < count; i++) {
uint8_t a = src[i] >> 24,
r = src[i] >> 16,
b = src[i] >> 16,
g = src[i] >> 8,
b = src[i] >> 0;
r = (r*a+127)/255;
g = (g*a+127)/255;
r = src[i] >> 0;
b = (b*a+127)/255;
g = (g*a+127)/255;
r = (r*a+127)/255;
dst[i] = (uint32_t)a << 24
| (uint32_t)b << 16
| (uint32_t)g << 8
| (uint32_t)r << 0;
}
}
static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
auto src = (const uint32_t*)vsrc;
for (int i = 0; i < count; i++) {
uint8_t a = src[i] >> 24,
b = src[i] >> 16,
g = src[i] >> 8,
r = src[i] >> 0;
b = (b*a+127)/255;
g = (g*a+127)/255;
r = (r*a+127)/255;
dst[i] = (uint32_t)a << 24
| (uint32_t)r << 16
| (uint32_t)g << 8
@ -31,32 +46,17 @@ static void premul_xxxa_portable(uint32_t dst[], const uint32_t src[], int count
}
}
static void premul_swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) {
static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) {
auto src = (const uint32_t*)vsrc;
for (int i = 0; i < count; i++) {
uint8_t a = src[i] >> 24,
r = src[i] >> 16,
b = src[i] >> 16,
g = src[i] >> 8,
b = src[i] >> 0;
r = (r*a+127)/255;
g = (g*a+127)/255;
b = (b*a+127)/255;
r = src[i] >> 0;
dst[i] = (uint32_t)a << 24
| (uint32_t)b << 16
| (uint32_t)r << 16
| (uint32_t)g << 8
| (uint32_t)r << 0;
}
}
static void swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) {
for (int i = 0; i < count; i++) {
uint8_t a = src[i] >> 24,
r = src[i] >> 16,
g = src[i] >> 8,
b = src[i] >> 0;
dst[i] = (uint32_t)a << 24
| (uint32_t)b << 16
| (uint32_t)g << 8
| (uint32_t)r << 0;
| (uint32_t)b << 0;
}
}
@ -92,30 +92,31 @@ static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
}
template <bool kSwapRB>
static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) {
static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
auto src = (const uint32_t*)vsrc;
while (count >= 8) {
// Load 8 pixels.
uint8x8x4_t bgra = vld4_u8((const uint8_t*) src);
uint8x8_t a = bgra.val[3],
r = bgra.val[2],
b = bgra.val[2],
g = bgra.val[1],
b = bgra.val[0];
r = bgra.val[0];
// Premultiply.
r = scale(r, a);
g = scale(g, a);
b = scale(b, a);
g = scale(g, a);
r = scale(r, a);
// Store 8 premultiplied pixels.
if (kSwapRB) {
bgra.val[2] = b;
bgra.val[1] = g;
bgra.val[0] = r;
} else {
bgra.val[2] = r;
bgra.val[1] = g;
bgra.val[0] = b;
} else {
bgra.val[2] = b;
bgra.val[1] = g;
bgra.val[0] = r;
}
vst4_u8((uint8_t*) dst, bgra);
src += 8;
@ -124,19 +125,20 @@ static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int
}
// Call portable code to finish up the tail of [0,8) pixels.
auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable;
auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
proc(dst, src, count);
}
static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {
premul_xxxa_should_swaprb<false>(dst, src, count);
static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
premul_should_swapRB<false>(dst, src, count);
}
static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
premul_xxxa_should_swaprb<true>(dst, src, count);
static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
premul_should_swapRB<true>(dst, src, count);
}
static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
auto src = (const uint32_t*)vsrc;
while (count >= 16) {
// Load 16 pixels.
uint8x16x4_t bgra = vld4q_u8((const uint8_t*) src);
@ -165,13 +167,14 @@ static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
count -= 8;
}
swaprb_xxxa_portable(dst, src, count);
RGBA_to_BGRA_portable(dst, src, count);
}
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
template <bool kSwapRB>
static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) {
static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
auto src = (const uint32_t*)vsrc;
auto premul8 = [](__m128i* lo, __m128i* hi) {
const __m128i zeros = _mm_setzero_si128();
@ -185,27 +188,27 @@ static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int
}
// Swizzle the pixels to 8-bit planar.
*lo = _mm_shuffle_epi8(*lo, planar); // bbbbgggg rrrraaaa
*hi = _mm_shuffle_epi8(*hi, planar); // BBBBGGGG RRRRAAAA
__m128i bg = _mm_unpacklo_epi32(*lo, *hi), // bbbbBBBB ggggGGGG
ra = _mm_unpackhi_epi32(*lo, *hi); // rrrrRRRR aaaaAAAA
*lo = _mm_shuffle_epi8(*lo, planar); // rrrrgggg bbbbaaaa
*hi = _mm_shuffle_epi8(*hi, planar); // RRRRGGGG BBBBAAAA
__m128i rg = _mm_unpacklo_epi32(*lo, *hi), // rrrrRRRR ggggGGGG
ba = _mm_unpackhi_epi32(*lo, *hi); // bbbbBBBB aaaaAAAA
// Unpack to 16-bit planar.
__m128i b = _mm_unpacklo_epi8(bg, zeros), // b_b_b_b_ B_B_B_B_
g = _mm_unpackhi_epi8(bg, zeros), // g_g_g_g_ G_G_G_G_
r = _mm_unpacklo_epi8(ra, zeros), // r_r_r_r_ R_R_R_R_
a = _mm_unpackhi_epi8(ra, zeros); // a_a_a_a_ A_A_A_A_
__m128i r = _mm_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_R_R_R_
g = _mm_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_G_G_G_
b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_
a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_
// Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257);
g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257);
r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257);
g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257);
b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257);
// Repack into interlaced pixels.
bg = _mm_or_si128(b, _mm_slli_epi16(g, 8)); // bgbgbgbg BGBGBGBG
ra = _mm_or_si128(r, _mm_slli_epi16(a, 8)); // rararara RARARARA
*lo = _mm_unpacklo_epi16(bg, ra); // bgrabgra bgrabgra
*hi = _mm_unpackhi_epi16(bg, ra); // BRGABGRA BGRABGRA
rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG
ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BABABABA
*lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rgbargba
*hi = _mm_unpackhi_epi16(rg, ba); // RGBARGBA RGBARGBA
};
while (count >= 8) {
@ -236,46 +239,47 @@ static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int
}
// Call portable code to finish up the tail of [0,4) pixels.
auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable;
auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
proc(dst, src, count);
}
static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {
premul_xxxa_should_swaprb<false>(dst, src, count);
static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
premul_should_swapRB<false>(dst, src, count);
}
static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
premul_xxxa_should_swaprb<true>(dst, src, count);
static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
premul_should_swapRB<true>(dst, src, count);
}
static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
auto src = (const uint32_t*)vsrc;
const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
while (count >= 4) {
__m128i bgra = _mm_loadu_si128((const __m128i*) src);
__m128i rgba = _mm_shuffle_epi8(bgra, swapRB);
_mm_storeu_si128((__m128i*) dst, rgba);
__m128i rgba = _mm_loadu_si128((const __m128i*) src);
__m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
_mm_storeu_si128((__m128i*) dst, bgra);
src += 4;
dst += 4;
count -= 4;
}
swaprb_xxxa_portable(dst, src, count);
RGBA_to_BGRA_portable(dst, src, count);
}
#else
static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {
premul_xxxa_portable(dst, src, count);
static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
RGBA_to_rgbA_portable(dst, src, count);
}
static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
premul_swaprb_xxxa_portable(dst, src, count);
static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
RGBA_to_bgrA_portable(dst, src, count);
}
static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
swaprb_xxxa_portable(dst, src, count);
static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
RGBA_to_BGRA_portable(dst, src, count);
}
#endif

View File

@ -132,28 +132,28 @@ DEF_TEST(SwizzleOpts, r) {
// forall c, c*255 == c, c*0 == 0
for (int c = 0; c <= 255; c++) {
src = (255<<24) | c;
SkOpts::premul_xxxa(&dst, &src, 1);
SkOpts::RGBA_to_rgbA(&dst, &src, 1);
REPORTER_ASSERT(r, dst == src);
SkOpts::premul_swaprb_xxxa(&dst, &src, 1);
SkOpts::RGBA_to_bgrA(&dst, &src, 1);
REPORTER_ASSERT(r, dst == (uint32_t)((255<<24) | (c<<16)));
src = (0<<24) | c;
SkOpts::premul_xxxa(&dst, &src, 1);
SkOpts::RGBA_to_rgbA(&dst, &src, 1);
REPORTER_ASSERT(r, dst == 0);
SkOpts::premul_swaprb_xxxa(&dst, &src, 1);
SkOpts::RGBA_to_bgrA(&dst, &src, 1);
REPORTER_ASSERT(r, dst == 0);
}
// check a totally arbitrary color
src = 0xFACEB004;
SkOpts::premul_xxxa(&dst, &src, 1);
SkOpts::RGBA_to_rgbA(&dst, &src, 1);
REPORTER_ASSERT(r, dst == 0xFACAAD04);
// swap red and blue
SkOpts::swaprb_xxxa(&dst, &src, 1);
SkOpts::RGBA_to_BGRA(&dst, &src, 1);
REPORTER_ASSERT(r, dst == 0xFA04B0CE);
// all together now
SkOpts::premul_swaprb_xxxa(&dst, &src, 1);
SkOpts::RGBA_to_bgrA(&dst, &src, 1);
REPORTER_ASSERT(r, dst == 0xFA04ADCA);
}