diff --git a/bench/SwizzleBench.cpp b/bench/SwizzleBench.cpp index 13b2003f9a..922c276dbc 100644 --- a/bench/SwizzleBench.cpp +++ b/bench/SwizzleBench.cpp @@ -10,7 +10,7 @@ class SwizzleBench : public Benchmark { public: - SwizzleBench(const char* name, SkOpts::Swizzle_8888_8888 fn) : fName(name), fFn(fn) {} + SwizzleBench(const char* name, SkOpts::Swizzle_8888 fn) : fName(name), fFn(fn) {} bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; } const char* onGetName() override { return fName; } @@ -23,10 +23,10 @@ public: } private: const char* fName; - SkOpts::Swizzle_8888_8888 fFn; + SkOpts::Swizzle_8888 fFn; }; -DEF_BENCH(return new SwizzleBench("SkOpts::premul_xxxa", SkOpts::premul_xxxa)); -DEF_BENCH(return new SwizzleBench("SkOpts::swaprb_xxxa", SkOpts::swaprb_xxxa)); -DEF_BENCH(return new SwizzleBench("SkOpts::premul_swaprb_xxxa", SkOpts::premul_swaprb_xxxa)); +DEF_BENCH(return new SwizzleBench("SkOpts::RGBA_to_rgbA", SkOpts::RGBA_to_rgbA)); +DEF_BENCH(return new SwizzleBench("SkOpts::RGBA_to_bgrA", SkOpts::RGBA_to_bgrA)); +DEF_BENCH(return new SwizzleBench("SkOpts::RGBA_to_BGRA", SkOpts::RGBA_to_BGRA)); diff --git a/src/codec/SkSwizzler.cpp b/src/codec/SkSwizzler.cpp index 24cb65fe28..f84b83e23a 100644 --- a/src/codec/SkSwizzler.cpp +++ b/src/codec/SkSwizzler.cpp @@ -333,7 +333,7 @@ static void fast_swizzle_bgra_to_n32_unpremul( // These swizzles trust that the alpha value is already 0xFF. 
#ifdef SK_PMCOLOR_IS_RGBA - SkOpts::swaprb_xxxa((uint32_t*) dst, (const uint32_t*) (src + offset), width); + SkOpts::RGBA_to_BGRA((uint32_t*) dst, src + offset, width); #else memcpy(dst, src + offset, width * bpp); #endif @@ -361,9 +361,9 @@ static void fast_swizzle_bgra_to_n32_premul( SkASSERT(deltaSrc == bpp); #ifdef SK_PMCOLOR_IS_RGBA - SkOpts::premul_swaprb_xxxa((uint32_t*) dst, (const uint32_t*) (src + offset), width); + SkOpts::RGBA_to_bgrA((uint32_t*) dst, src + offset, width); #else - SkOpts::premul_xxxa((uint32_t*) dst, (const uint32_t*) (src + offset), width); + SkOpts::RGBA_to_rgbA((uint32_t*) dst, src + offset, width); #endif } @@ -419,9 +419,9 @@ static void fast_swizzle_rgba_to_n32_premul( SkASSERT(deltaSrc == bpp); #ifdef SK_PMCOLOR_IS_RGBA - SkOpts::premul_xxxa((uint32_t*) dst, (const uint32_t*) (src + offset), width); + SkOpts::RGBA_to_rgbA((uint32_t*) dst, src + offset, width); #else - SkOpts::premul_swaprb_xxxa((uint32_t*) dst, (const uint32_t*) (src + offset), width); + SkOpts::RGBA_to_bgrA((uint32_t*) dst, src + offset, width); #endif } @@ -450,7 +450,7 @@ static void fast_swizzle_rgba_to_n32_unpremul( #ifdef SK_PMCOLOR_IS_RGBA memcpy(dst, src + offset, width * bpp); #else - SkOpts::swaprb_xxxa((uint32_t*) dst, (const uint32_t*) (src + offset), width); + SkOpts::RGBA_to_BGRA((uint32_t*) dst, src + offset, width); #endif } diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp index 5f1a36c1be..c078d8590d 100644 --- a/src/core/SkOpts.cpp +++ b/src/core/SkOpts.cpp @@ -81,9 +81,9 @@ namespace SkOpts { decltype(matrix_scale_translate) matrix_scale_translate = sk_default::matrix_scale_translate; decltype(matrix_affine) matrix_affine = sk_default::matrix_affine; - decltype( premul_xxxa) premul_xxxa = sk_default:: premul_xxxa; - decltype( swaprb_xxxa) swaprb_xxxa = sk_default:: swaprb_xxxa; - decltype(premul_swaprb_xxxa) premul_swaprb_xxxa = sk_default::premul_swaprb_xxxa; + decltype(RGBA_to_BGRA) RGBA_to_BGRA = sk_default::RGBA_to_BGRA; + 
decltype(RGBA_to_rgbA) RGBA_to_rgbA = sk_default::RGBA_to_rgbA; + decltype(RGBA_to_bgrA) RGBA_to_bgrA = sk_default::RGBA_to_bgrA; // Each Init_foo() is defined in src/opts/SkOpts_foo.cpp. void Init_ssse3(); diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h index 85e38fe139..a622c1acdc 100644 --- a/src/core/SkOpts.h +++ b/src/core/SkOpts.h @@ -58,10 +58,11 @@ namespace SkOpts { extern SkMatrix::MapPtsProc matrix_translate, matrix_scale_translate, matrix_affine; - typedef void (*Swizzle_8888_8888)(uint32_t[], const uint32_t[], int); - extern Swizzle_8888_8888 premul_xxxa, // BGRA -> bgrA or RGBA -> rgbA - swaprb_xxxa, // BGRA -> RGBA or RGBA -> BGRA - premul_swaprb_xxxa; // BGRA -> rgbA or RGBA -> bgrA + // Swizzle input into some sort of 8888 pixel, {premul,unpremul} x {rgba,bgra}. + typedef void (*Swizzle_8888)(uint32_t*, const void*, int); + extern Swizzle_8888 RGBA_to_BGRA, // i.e. just swap RB + RGBA_to_rgbA, // i.e. just premultiply + RGBA_to_bgrA; // i.e. swap RB and premultiply } #endif//SkOpts_DEFINED diff --git a/src/opts/SkOpts_neon.cpp b/src/opts/SkOpts_neon.cpp index 3a07ebb765..97af416f5d 100644 --- a/src/opts/SkOpts_neon.cpp +++ b/src/opts/SkOpts_neon.cpp @@ -49,8 +49,8 @@ namespace SkOpts { matrix_scale_translate = sk_neon::matrix_scale_translate; matrix_affine = sk_neon::matrix_affine; - premul_xxxa = sk_neon::premul_xxxa; - premul_swaprb_xxxa = sk_neon::premul_swaprb_xxxa; - swaprb_xxxa = sk_neon::swaprb_xxxa; + RGBA_to_BGRA = sk_neon::RGBA_to_BGRA; + RGBA_to_rgbA = sk_neon::RGBA_to_rgbA; + RGBA_to_bgrA = sk_neon::RGBA_to_bgrA; } } diff --git a/src/opts/SkOpts_ssse3.cpp b/src/opts/SkOpts_ssse3.cpp index 5378377d1e..96e8493bfc 100644 --- a/src/opts/SkOpts_ssse3.cpp +++ b/src/opts/SkOpts_ssse3.cpp @@ -18,8 +18,8 @@ namespace SkOpts { blit_mask_d32_a8 = sk_ssse3::blit_mask_d32_a8; color_cube_filter_span = sk_ssse3::color_cube_filter_span; - premul_xxxa = sk_ssse3::premul_xxxa; - premul_swaprb_xxxa = sk_ssse3::premul_swaprb_xxxa; - swaprb_xxxa = 
sk_ssse3::swaprb_xxxa; + RGBA_to_BGRA = sk_ssse3::RGBA_to_BGRA; + RGBA_to_rgbA = sk_ssse3::RGBA_to_rgbA; + RGBA_to_bgrA = sk_ssse3::RGBA_to_bgrA; } } diff --git a/src/opts/SkSwizzler_opts.h b/src/opts/SkSwizzler_opts.h index b0cf4cad53..8d1be84df2 100644 --- a/src/opts/SkSwizzler_opts.h +++ b/src/opts/SkSwizzler_opts.h @@ -12,18 +12,33 @@ namespace SK_OPTS_NS { -// These variable names in these functions just pretend the input is BGRA. -// They work fine with both RGBA and BGRA. - -static void premul_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) { +static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) { + auto src = (const uint32_t*)vsrc; for (int i = 0; i < count; i++) { uint8_t a = src[i] >> 24, - r = src[i] >> 16, + b = src[i] >> 16, g = src[i] >> 8, - b = src[i] >> 0; - r = (r*a+127)/255; - g = (g*a+127)/255; + r = src[i] >> 0; b = (b*a+127)/255; + g = (g*a+127)/255; + r = (r*a+127)/255; + dst[i] = (uint32_t)a << 24 + | (uint32_t)b << 16 + | (uint32_t)g << 8 + | (uint32_t)r << 0; + } +} + +static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) { + auto src = (const uint32_t*)vsrc; + for (int i = 0; i < count; i++) { + uint8_t a = src[i] >> 24, + b = src[i] >> 16, + g = src[i] >> 8, + r = src[i] >> 0; + b = (b*a+127)/255; + g = (g*a+127)/255; + r = (r*a+127)/255; dst[i] = (uint32_t)a << 24 | (uint32_t)r << 16 | (uint32_t)g << 8 @@ -31,32 +46,17 @@ static void premul_xxxa_portable(uint32_t dst[], const uint32_t src[], int count } } -static void premul_swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) { +static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) { + auto src = (const uint32_t*)vsrc; for (int i = 0; i < count; i++) { uint8_t a = src[i] >> 24, - r = src[i] >> 16, + b = src[i] >> 16, g = src[i] >> 8, - b = src[i] >> 0; - r = (r*a+127)/255; - g = (g*a+127)/255; - b = (b*a+127)/255; + r = src[i] >> 0; dst[i] = (uint32_t)a << 24 - | (uint32_t)b << 16 + 
| (uint32_t)r << 16 | (uint32_t)g << 8 - | (uint32_t)r << 0; - } -} - -static void swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) { - for (int i = 0; i < count; i++) { - uint8_t a = src[i] >> 24, - r = src[i] >> 16, - g = src[i] >> 8, - b = src[i] >> 0; - dst[i] = (uint32_t)a << 24 - | (uint32_t)b << 16 - | (uint32_t)g << 8 - | (uint32_t)r << 0; + | (uint32_t)b << 0; } } @@ -92,30 +92,31 @@ static uint8x8_t scale(uint8x8_t x, uint8x8_t y) { } template <bool kSwapRB> -static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { +static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { + auto src = (const uint32_t*)vsrc; while (count >= 8) { // Load 8 pixels. uint8x8x4_t bgra = vld4_u8((const uint8_t*) src); uint8x8_t a = bgra.val[3], - r = bgra.val[2], + b = bgra.val[2], g = bgra.val[1], - b = bgra.val[0]; + r = bgra.val[0]; // Premultiply. - r = scale(r, a); - g = scale(g, a); b = scale(b, a); + g = scale(g, a); + r = scale(r, a); // Store 8 premultiplied pixels. if (kSwapRB) { - bgra.val[2] = b; - bgra.val[1] = g; - bgra.val[0] = r; - } else { bgra.val[2] = r; bgra.val[1] = g; bgra.val[0] = b; + } else { + bgra.val[2] = b; + bgra.val[1] = g; + bgra.val[0] = r; } vst4_u8((uint8_t*) dst, bgra); src += 8; @@ -124,19 +125,20 @@ static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int } // Call portable code to finish up the tail of [0,8) pixels. - auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable; + auto proc = kSwapRB ? 
RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; proc(dst, src, count); } -static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { - premul_xxxa_should_swaprb<false>(dst, src, count); +static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { + premul_should_swapRB<false>(dst, src, count); } -static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { - premul_xxxa_should_swaprb<true>(dst, src, count); +static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { + premul_should_swapRB<true>(dst, src, count); } -static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { +static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) { + auto src = (const uint32_t*)vsrc; while (count >= 16) { // Load 16 pixels. uint8x16x4_t bgra = vld4q_u8((const uint8_t*) src); @@ -165,13 +167,14 @@ static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { count -= 8; } - swaprb_xxxa_portable(dst, src, count); + RGBA_to_BGRA_portable(dst, src, count); } #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 template <bool kSwapRB> -static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { +static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { + auto src = (const uint32_t*)vsrc; auto premul8 = [](__m128i* lo, __m128i* hi) { const __m128i zeros = _mm_setzero_si128(); @@ -185,27 +188,27 @@ static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int } // Swizzle the pixels to 8-bit planar. 
- *lo = _mm_shuffle_epi8(*lo, planar); // bbbbgggg rrrraaaa - *hi = _mm_shuffle_epi8(*hi, planar); // BBBBGGGG RRRRAAAA - __m128i bg = _mm_unpacklo_epi32(*lo, *hi), // bbbbBBBB ggggGGGG - ra = _mm_unpackhi_epi32(*lo, *hi); // rrrrRRRR aaaaAAAA + *lo = _mm_shuffle_epi8(*lo, planar); // rrrrgggg bbbbaaaa + *hi = _mm_shuffle_epi8(*hi, planar); // RRRRGGGG BBBBAAAA + __m128i rg = _mm_unpacklo_epi32(*lo, *hi), // rrrrRRRR ggggGGGG + ba = _mm_unpackhi_epi32(*lo, *hi); // bbbbBBBB aaaaAAAA // Unpack to 16-bit planar. - __m128i b = _mm_unpacklo_epi8(bg, zeros), // b_b_b_b_ B_B_B_B_ - g = _mm_unpackhi_epi8(bg, zeros), // g_g_g_g_ G_G_G_G_ - r = _mm_unpacklo_epi8(ra, zeros), // r_r_r_r_ R_R_R_R_ - a = _mm_unpackhi_epi8(ra, zeros); // a_a_a_a_ A_A_A_A_ + __m128i r = _mm_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_R_R_R_ + g = _mm_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_G_G_G_ + b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_ + a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_ // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. - b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257); - g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257); r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257); + g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257); + b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257); // Repack into interlaced pixels. 
- bg = _mm_or_si128(b, _mm_slli_epi16(g, 8)); // bgbgbgbg BGBGBGBG - ra = _mm_or_si128(r, _mm_slli_epi16(a, 8)); // rararara RARARARA - *lo = _mm_unpacklo_epi16(bg, ra); // bgrabgra bgrabgra - *hi = _mm_unpackhi_epi16(bg, ra); // BRGABGRA BGRABGRA + rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG + ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BABABABA + *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rgbargba + *hi = _mm_unpackhi_epi16(rg, ba); // RGBARGBA RGBARGBA }; while (count >= 8) { @@ -236,46 +239,47 @@ static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int } // Call portable code to finish up the tail of [0,4) pixels. - auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable; + auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; proc(dst, src, count); } -static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { - premul_xxxa_should_swaprb<false>(dst, src, count); +static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { + premul_should_swapRB<false>(dst, src, count); } -static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { - premul_xxxa_should_swaprb<true>(dst, src, count); +static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { + premul_should_swapRB<true>(dst, src, count); } -static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { +static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) { + auto src = (const uint32_t*)vsrc; const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15); while (count >= 4) { - __m128i bgra = _mm_loadu_si128((const __m128i*) src); - __m128i rgba = _mm_shuffle_epi8(bgra, swapRB); - _mm_storeu_si128((__m128i*) dst, rgba); + __m128i rgba = _mm_loadu_si128((const __m128i*) src); + __m128i bgra = _mm_shuffle_epi8(rgba, swapRB); + _mm_storeu_si128((__m128i*) dst, bgra); src += 4; dst += 4; count -= 4; } - swaprb_xxxa_portable(dst, src, count); + 
RGBA_to_BGRA_portable(dst, src, count); } #else -static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { - premul_xxxa_portable(dst, src, count); +static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { + RGBA_to_rgbA_portable(dst, src, count); } -static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { - premul_swaprb_xxxa_portable(dst, src, count); +static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { + RGBA_to_bgrA_portable(dst, src, count); } -static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { - swaprb_xxxa_portable(dst, src, count); +static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) { + RGBA_to_BGRA_portable(dst, src, count); } #endif diff --git a/tests/SwizzlerTest.cpp b/tests/SwizzlerTest.cpp index f67cfeef7a..e1626d52f1 100644 --- a/tests/SwizzlerTest.cpp +++ b/tests/SwizzlerTest.cpp @@ -132,28 +132,28 @@ DEF_TEST(SwizzleOpts, r) { // forall c, c*255 == c, c*0 == 0 for (int c = 0; c <= 255; c++) { src = (255<<24) | c; - SkOpts::premul_xxxa(&dst, &src, 1); + SkOpts::RGBA_to_rgbA(&dst, &src, 1); REPORTER_ASSERT(r, dst == src); - SkOpts::premul_swaprb_xxxa(&dst, &src, 1); + SkOpts::RGBA_to_bgrA(&dst, &src, 1); REPORTER_ASSERT(r, dst == (uint32_t)((255<<24) | (c<<16))); src = (0<<24) | c; - SkOpts::premul_xxxa(&dst, &src, 1); + SkOpts::RGBA_to_rgbA(&dst, &src, 1); REPORTER_ASSERT(r, dst == 0); - SkOpts::premul_swaprb_xxxa(&dst, &src, 1); + SkOpts::RGBA_to_bgrA(&dst, &src, 1); REPORTER_ASSERT(r, dst == 0); } // check a totally arbitrary color src = 0xFACEB004; - SkOpts::premul_xxxa(&dst, &src, 1); + SkOpts::RGBA_to_rgbA(&dst, &src, 1); REPORTER_ASSERT(r, dst == 0xFACAAD04); // swap red and blue - SkOpts::swaprb_xxxa(&dst, &src, 1); + SkOpts::RGBA_to_BGRA(&dst, &src, 1); REPORTER_ASSERT(r, dst == 0xFA04B0CE); // all together now - SkOpts::premul_swaprb_xxxa(&dst, &src, 1); + SkOpts::RGBA_to_bgrA(&dst, &src, 1); REPORTER_ASSERT(r, dst == 
0xFA04ADCA); }