Optimize the CMYK -> RGBA (or BGRA) transform for JPEG decodes

Swizzle Bench Runtime (relative to before this change)
Nexus 6P     0.14x
Dell Venue 8 0.12x

CMYK JPEG Decode Runtime (relative to before this change)
Nexus 6P     0.81x
Dell Venue 8 0.85x

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1676773003
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review URL: https://codereview.chromium.org/1676773003
This commit is contained in:
msarett 2016-02-08 13:26:25 -08:00 committed by Commit bot
parent 3125565804
commit c5c322d8ec
7 changed files with 227 additions and 32 deletions

View File

@ -35,3 +35,5 @@ DEF_BENCH(return new SwizzleBench("SkOpts::RGB_to_BGR1", SkOpts::RGB_to_BGR1));
DEF_BENCH(return new SwizzleBench("SkOpts::gray_to_RGB1", SkOpts::gray_to_RGB1));
DEF_BENCH(return new SwizzleBench("SkOpts::grayA_to_RGBA", SkOpts::grayA_to_RGBA));
DEF_BENCH(return new SwizzleBench("SkOpts::grayA_to_rgbA", SkOpts::grayA_to_rgbA));
// Inverted-CMYK swizzles: one bench per output channel order (RGB1 and BGR1).
DEF_BENCH(return new SwizzleBench("SkOpts::inverted_CMYK_to_RGB1", SkOpts::inverted_CMYK_to_RGB1));
DEF_BENCH(return new SwizzleBench("SkOpts::inverted_CMYK_to_BGR1", SkOpts::inverted_CMYK_to_BGR1));

View File

@ -592,6 +592,21 @@ static void swizzle_cmyk_to_n32(
}
}
// Fast path for converting a row of inverted-CMYK pixels to native 32-bit color.
//
// Hands the whole row to the SkOpts routine whose output channel order matches
// the build's SkPMColor layout (RGBA when SK_PMCOLOR_IS_RGBA is defined, BGRA
// otherwise).  `bpp`, `deltaSrc` and `ctable` are only part of the shared
// swizzle-proc signature; apart from the assertion they are not used here.
static void fast_swizzle_cmyk_to_n32(
        void* dst, const uint8_t* src, int width, int bpp, int deltaSrc, int offset,
        const SkPMColor ctable[]) {
    // Sampling callers must use the regular swizzle: when no sampling is in
    // effect the source stride equals the pixel size.
    SkASSERT(deltaSrc == bpp);

    uint32_t* const      dstPixels = (uint32_t*) dst;
    const uint8_t* const firstCmyk = src + offset;

#ifdef SK_PMCOLOR_IS_RGBA
    SkOpts::inverted_CMYK_to_RGB1(dstPixels, firstCmyk, width);
#else
    SkOpts::inverted_CMYK_to_BGR1(dstPixels, firstCmyk, width);
#endif
}
static void swizzle_cmyk_to_565(
void* SK_RESTRICT dstRow, const uint8_t* SK_RESTRICT src, int dstWidth,
int bpp, int deltaSrc, int offset, const SkPMColor ctable[]) {
@ -811,6 +826,7 @@ SkSwizzler* SkSwizzler::CreateSwizzler(SkSwizzler::SrcConfig sc,
break;
case kRGB_565_SkColorType:
proc = &swizzle_rgb_to_565;
break;
default:
break;
}
@ -844,6 +860,7 @@ SkSwizzler* SkSwizzler::CreateSwizzler(SkSwizzler::SrcConfig sc,
switch (dstInfo.colorType()) {
case kN32_SkColorType:
proc = &swizzle_cmyk_to_n32;
fastProc = &fast_swizzle_cmyk_to_n32;
break;
case kRGB_565_SkColorType:
proc = &swizzle_cmyk_to_565;

View File

@ -79,14 +79,16 @@ namespace SkOpts {
decltype(matrix_scale_translate) matrix_scale_translate = sk_default::matrix_scale_translate;
decltype(matrix_affine) matrix_affine = sk_default::matrix_affine;
decltype(RGBA_to_BGRA) RGBA_to_BGRA = sk_default::RGBA_to_BGRA;
decltype(RGBA_to_rgbA) RGBA_to_rgbA = sk_default::RGBA_to_rgbA;
decltype(RGBA_to_bgrA) RGBA_to_bgrA = sk_default::RGBA_to_bgrA;
decltype(RGB_to_RGB1) RGB_to_RGB1 = sk_default::RGB_to_RGB1;
decltype(RGB_to_BGR1) RGB_to_BGR1 = sk_default::RGB_to_BGR1;
decltype(gray_to_RGB1) gray_to_RGB1 = sk_default::gray_to_RGB1;
decltype(grayA_to_RGBA) grayA_to_RGBA = sk_default::grayA_to_RGBA;
decltype(grayA_to_rgbA) grayA_to_rgbA = sk_default::grayA_to_rgbA;
// Swizzle entry points, each initially bound to its sk_default implementation.
// NOTE(review): the Init_*() routines declared below presumably rebind these to
// CPU-specific versions — confirm against src/opts/SkOpts_*.cpp.
decltype(RGBA_to_BGRA) RGBA_to_BGRA = sk_default::RGBA_to_BGRA;
decltype(RGBA_to_rgbA) RGBA_to_rgbA = sk_default::RGBA_to_rgbA;
decltype(RGBA_to_bgrA) RGBA_to_bgrA = sk_default::RGBA_to_bgrA;
decltype(RGB_to_RGB1) RGB_to_RGB1 = sk_default::RGB_to_RGB1;
decltype(RGB_to_BGR1) RGB_to_BGR1 = sk_default::RGB_to_BGR1;
decltype(gray_to_RGB1) gray_to_RGB1 = sk_default::gray_to_RGB1;
decltype(grayA_to_RGBA) grayA_to_RGBA = sk_default::grayA_to_RGBA;
decltype(grayA_to_rgbA) grayA_to_rgbA = sk_default::grayA_to_rgbA;
decltype(inverted_CMYK_to_RGB1) inverted_CMYK_to_RGB1 = sk_default::inverted_CMYK_to_RGB1;
decltype(inverted_CMYK_to_BGR1) inverted_CMYK_to_BGR1 = sk_default::inverted_CMYK_to_BGR1;
// Each Init_foo() is defined in src/opts/SkOpts_foo.cpp.
void Init_ssse3();

View File

@ -57,14 +57,16 @@ namespace SkOpts {
// Swizzle input into some sort of 8888 pixel, {premul,unpremul} x {rgba,bgra}.
typedef void (*Swizzle_8888)(uint32_t*, const void*, int);
extern Swizzle_8888 RGBA_to_BGRA, // i.e. just swap RB
RGBA_to_rgbA, // i.e. just premultiply
RGBA_to_bgrA, // i.e. swap RB and premultiply
RGB_to_RGB1, // i.e. insert an opaque alpha
RGB_to_BGR1, // i.e. swap RB and insert an opaque alpha
gray_to_RGB1, // i.e. expand to color channels + an opaque alpha
grayA_to_RGBA, // i.e. expand to color channels
grayA_to_rgbA; // i.e. expand to color channels and premultiply
extern Swizzle_8888 RGBA_to_BGRA, // i.e. just swap RB
RGBA_to_rgbA, // i.e. just premultiply
RGBA_to_bgrA, // i.e. swap RB and premultiply
RGB_to_RGB1, // i.e. insert an opaque alpha
RGB_to_BGR1, // i.e. swap RB and insert an opaque alpha
gray_to_RGB1, // i.e. expand to color channels + an opaque alpha
grayA_to_RGBA, // i.e. expand to color channels
grayA_to_rgbA, // i.e. expand to color channels and premultiply
inverted_CMYK_to_RGB1, // i.e. scale c, m, y by k to get r, g, b and insert an opaque alpha
inverted_CMYK_to_BGR1; // i.e. same conversion, but swap RB in the output
}
#endif//SkOpts_DEFINED

View File

@ -47,13 +47,15 @@ namespace SkOpts {
matrix_scale_translate = sk_neon::matrix_scale_translate;
matrix_affine = sk_neon::matrix_affine;
RGBA_to_BGRA = sk_neon::RGBA_to_BGRA;
RGBA_to_rgbA = sk_neon::RGBA_to_rgbA;
RGBA_to_bgrA = sk_neon::RGBA_to_bgrA;
RGB_to_RGB1 = sk_neon::RGB_to_RGB1;
RGB_to_BGR1 = sk_neon::RGB_to_BGR1;
gray_to_RGB1 = sk_neon::gray_to_RGB1;
grayA_to_RGBA = sk_neon::grayA_to_RGBA;
grayA_to_rgbA = sk_neon::grayA_to_rgbA;
RGBA_to_BGRA = sk_neon::RGBA_to_BGRA;
RGBA_to_rgbA = sk_neon::RGBA_to_rgbA;
RGBA_to_bgrA = sk_neon::RGBA_to_bgrA;
RGB_to_RGB1 = sk_neon::RGB_to_RGB1;
RGB_to_BGR1 = sk_neon::RGB_to_BGR1;
gray_to_RGB1 = sk_neon::gray_to_RGB1;
grayA_to_RGBA = sk_neon::grayA_to_RGBA;
grayA_to_rgbA = sk_neon::grayA_to_rgbA;
inverted_CMYK_to_RGB1 = sk_neon::inverted_CMYK_to_RGB1;
inverted_CMYK_to_BGR1 = sk_neon::inverted_CMYK_to_BGR1;
}
}

View File

@ -18,13 +18,15 @@ namespace SkOpts {
blit_mask_d32_a8 = sk_ssse3::blit_mask_d32_a8;
color_cube_filter_span = sk_ssse3::color_cube_filter_span;
RGBA_to_BGRA = sk_ssse3::RGBA_to_BGRA;
RGBA_to_rgbA = sk_ssse3::RGBA_to_rgbA;
RGBA_to_bgrA = sk_ssse3::RGBA_to_bgrA;
RGB_to_RGB1 = sk_ssse3::RGB_to_RGB1;
RGB_to_BGR1 = sk_ssse3::RGB_to_BGR1;
gray_to_RGB1 = sk_ssse3::gray_to_RGB1;
grayA_to_RGBA = sk_ssse3::grayA_to_RGBA;
grayA_to_rgbA = sk_ssse3::grayA_to_rgbA;
RGBA_to_BGRA = sk_ssse3::RGBA_to_BGRA;
RGBA_to_rgbA = sk_ssse3::RGBA_to_rgbA;
RGBA_to_bgrA = sk_ssse3::RGBA_to_bgrA;
RGB_to_RGB1 = sk_ssse3::RGB_to_RGB1;
RGB_to_BGR1 = sk_ssse3::RGB_to_BGR1;
gray_to_RGB1 = sk_ssse3::gray_to_RGB1;
grayA_to_RGBA = sk_ssse3::grayA_to_RGBA;
grayA_to_rgbA = sk_ssse3::grayA_to_rgbA;
inverted_CMYK_to_RGB1 = sk_ssse3::inverted_CMYK_to_RGB1;
inverted_CMYK_to_BGR1 = sk_ssse3::inverted_CMYK_to_BGR1;
}
}

View File

@ -125,6 +125,41 @@ static void grayA_to_rgbA_portable(uint32_t dst[], const void* vsrc, int count)
}
}
// Portable conversion of `count` inverted-CMYK pixels into opaque RGBA words.
//
// Every source pixel is read as one 32-bit value holding c in bits 0-7, m in
// bits 8-15, y in bits 16-23 and k in bits 24-31.  Each color channel is scaled
// by k with rounding, (channel * k + 127) / 255, and stored with r in the low
// byte, then g, then b, and 0xFF as alpha in the top byte.
// See comments in SkSwizzler.cpp for details on the conversion formula.
static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* cmyk = (const uint32_t*)vsrc;

    // Rounded product of two 8-bit values treated as fractions of 255.
    auto mul_div255 = [](uint32_t channel, uint32_t black) -> uint32_t {
        return (channel * black + 127) / 255;
    };

    for (int px = 0; px < count; ++px) {
        const uint32_t word    = cmyk[px];
        const uint32_t black   =  word >> 24;
        const uint32_t yellow  = (word >> 16) & 0xFF;
        const uint32_t magenta = (word >>  8) & 0xFF;
        const uint32_t cyan    =  word        & 0xFF;

        const uint32_t red   = mul_div255(cyan,    black);
        const uint32_t green = mul_div255(magenta, black);
        const uint32_t blue  = mul_div255(yellow,  black);

        dst[px] = 0xFF000000u | (blue << 16) | (green << 8) | red;
    }
}
// Portable conversion of `count` inverted-CMYK pixels into opaque BGRA words.
//
// Source pixels are read as 32-bit values with c, m, y, k in successive bytes
// from least to most significant.  Each color channel is scaled by k with
// rounding, (channel * k + 127) / 255; the output stores b in the low byte,
// then g, then r, with 0xFF as alpha in the top byte.
static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* cmyk = (const uint32_t*)vsrc;

    // Rounded product of two 8-bit values treated as fractions of 255.
    auto mul_div255 = [](uint32_t channel, uint32_t black) -> uint32_t {
        return (channel * black + 127) / 255;
    };

    for (int px = 0; px < count; ++px) {
        const uint32_t word    = cmyk[px];
        const uint32_t black   =  word >> 24;
        const uint32_t yellow  = (word >> 16) & 0xFF;
        const uint32_t magenta = (word >>  8) & 0xFF;
        const uint32_t cyan    =  word        & 0xFF;

        const uint32_t red   = mul_div255(cyan,    black);
        const uint32_t green = mul_div255(magenta, black);
        const uint32_t blue  = mul_div255(yellow,  black);

        dst[px] = 0xFF000000u | (red << 16) | (green << 8) | blue;
    }
}
#if defined(SK_ARM_HAS_NEON)
// Rounded divide by 255, (x + 127) / 255
@ -401,6 +436,54 @@ static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
expand_grayA<true>(dst, src, count);
}
// Output channel order selected at compile time by inverted_cmyk_to<>.
enum Format { kRGB1, kBGR1 };
// NEON conversion of `count` inverted-CMYK pixels at `vsrc` into opaque 32-bit
// pixels at `dst`, in the channel order given by `format`.
// Full groups of 8 pixels are handled with vector loads/stores; the remaining
// (fewer than 8) pixels are passed to the matching portable routine.
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
auto src = (const uint32_t*)vsrc;
while (count >= 8) {
// Load 8 cmyk pixels.
// vld4_u8 de-interleaves, so each val[n] holds one channel of all 8 pixels.
uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);
uint8x8_t k = pixels.val[3],
y = pixels.val[2],
m = pixels.val[1],
c = pixels.val[0];
// Scale to r, g, b.
// NOTE(review): scale() is defined outside this view; presumably the rounded
// (x * k + 127) / 255 product used by the portable path — confirm.
uint8x8_t b = scale(y, k);
uint8x8_t g = scale(m, k);
uint8x8_t r = scale(c, k);
// Store 8 rgba pixels.
// The two branches differ only in which of r and b lands in byte 0 vs byte 2;
// alpha is always 0xFF.
if (kBGR1 == format) {
pixels.val[3] = vdup_n_u8(0xFF);
pixels.val[2] = r;
pixels.val[1] = g;
pixels.val[0] = b;
} else {
pixels.val[3] = vdup_n_u8(0xFF);
pixels.val[2] = b;
pixels.val[1] = g;
pixels.val[0] = r;
}
vst4_u8((uint8_t*) dst, pixels);
src += 8;
dst += 8;
count -= 8;
}
// Finish the tail with the scalar implementation for the same output format.
auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
proc(dst, src, count);
}
static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
inverted_cmyk_to<kRGB1>(dst, src, count);
}
static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
inverted_cmyk_to<kBGR1>(dst, src, count);
}
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
// Scale a byte by another.
@ -631,6 +714,83 @@ static void grayA_to_rgbA(uint32_t dst[], const void* vsrc, int count) {
grayA_to_rgbA_portable(dst, src, count);
}
// Output channel order selected at compile time by inverted_cmyk_to<>.
enum Format { kRGB1, kBGR1 };
// SSSE3 conversion of `count` inverted-CMYK pixels at `vsrc` into opaque 32-bit
// pixels at `dst`, in the channel order given by `format`.
// Pixels are processed 8 at a time, then at most one group of 4, and the
// remaining (fewer than 4) pixels are passed to the matching portable routine.
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
auto src = (const uint32_t*)vsrc;
// Converts 8 pixels in place: *lo holds the first 4 pixels, *hi the next 4.
auto convert8 = [](__m128i* lo, __m128i* hi) {
const __m128i zeros = _mm_setzero_si128();
__m128i planar;
// The kBGR1 mask gathers the y bytes first and the c bytes third, i.e. it swaps
// the c and y planes relative to the kRGB1 mask.  The lane comments and the
// names c/y/r/b below describe the kRGB1 case; for kBGR1 the values named r and
// b are exchanged, so the same repack emits b, g, r, 1.
if (kBGR1 == format) {
planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
} else {
planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
}
// Swizzle the pixels to 8-bit planar.
*lo = _mm_shuffle_epi8(*lo, planar); // ccccmmmm yyyykkkk
*hi = _mm_shuffle_epi8(*hi, planar); // CCCCMMMM YYYYKKKK
__m128i cm = _mm_unpacklo_epi32(*lo, *hi), // ccccCCCC mmmmMMMM
yk = _mm_unpackhi_epi32(*lo, *hi); // yyyyYYYY kkkkKKKK
// Unpack to 16-bit planar.
__m128i c = _mm_unpacklo_epi8(cm, zeros), // c_c_c_c_ C_C_C_C_
m = _mm_unpackhi_epi8(cm, zeros), // m_m_m_m_ M_M_M_M_
y = _mm_unpacklo_epi8(yk, zeros), // y_y_y_y_ Y_Y_Y_Y_
k = _mm_unpackhi_epi8(yk, zeros); // k_k_k_k_ K_K_K_K_
// Scale to r, g, b.
// NOTE(review): scale() is defined outside this view; presumably the rounded
// (x * k + 127) / 255 product used by the portable path — confirm.
__m128i r = scale(c, k),
g = scale(m, k),
b = scale(y, k);
// Repack into interlaced pixels.
__m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)), // rgrgrgrg RGRGRGRG
ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00)); // b1b1b1b1 B1B1B1B1
*lo = _mm_unpacklo_epi16(rg, ba); // rgb1rgb1 rgb1rgb1
*hi = _mm_unpackhi_epi16(rg, ba); // RGB1RGB1 RGB1RGB1
};
// Main loop: two unaligned 16-byte loads/stores cover 8 pixels per iteration.
while (count >= 8) {
__m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
hi = _mm_loadu_si128((const __m128i*) (src + 4));
convert8(&lo, &hi);
_mm_storeu_si128((__m128i*) (dst + 0), lo);
_mm_storeu_si128((__m128i*) (dst + 4), hi);
src += 8;
dst += 8;
count -= 8;
}
// One group of 4: run convert8 with a zeroed upper half and store only lo.
if (count >= 4) {
__m128i lo = _mm_loadu_si128((const __m128i*) src),
hi = _mm_setzero_si128();
convert8(&lo, &hi);
_mm_storeu_si128((__m128i*) dst, lo);
src += 4;
dst += 4;
count -= 4;
}
// Finish the tail with the scalar implementation for the same output format.
auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
proc(dst, src, count);
}
static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
inverted_cmyk_to<kRGB1>(dst, src, count);
}
static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
inverted_cmyk_to<kBGR1>(dst, src, count);
}
#else
static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
@ -665,6 +825,14 @@ static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
grayA_to_rgbA_portable(dst, src, count);
}
// Fallback entry point when no SIMD path is compiled in: forwards straight to
// the portable RGB1 conversion.
static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    auto convert = inverted_CMYK_to_RGB1_portable;
    convert(dst, src, count);
}
// Fallback entry point when no SIMD path is compiled in: forwards straight to
// the portable BGR1 conversion.
static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    auto convert = inverted_CMYK_to_BGR1_portable;
    convert(dst, src, count);
}
#endif
}