NEON optimizations for gray -> RGBA (or BGRA) conversions

Swizzle Bench Runtime
Nexus 6P 0.32x
Nexus 9  0.89x

PNG Decode Time (for a test set of gray-encoded PNGs)
Nexus 6P 0.88x
Nexus 9  0.91x

BUG=skia:4767
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1656383002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review URL: https://codereview.chromium.org/1656383002
This commit is contained in:
msarett 2016-02-02 12:59:45 -08:00 committed by Commit bot
parent c92159c825
commit 2eff71c9b5
7 changed files with 79 additions and 1 deletions

View File

@ -32,3 +32,4 @@ DEF_BENCH(return new SwizzleBench("SkOpts::RGBA_to_bgrA", SkOpts::RGBA_to_bgrA))
DEF_BENCH(return new SwizzleBench("SkOpts::RGBA_to_BGRA", SkOpts::RGBA_to_BGRA));
DEF_BENCH(return new SwizzleBench("SkOpts::RGB_to_RGB1", SkOpts::RGB_to_RGB1));
DEF_BENCH(return new SwizzleBench("SkOpts::RGB_to_BGR1", SkOpts::RGB_to_BGR1));
DEF_BENCH(return new SwizzleBench("SkOpts::gray_to_RGB1", SkOpts::gray_to_RGB1));

View File

@ -270,6 +270,19 @@ static void swizzle_gray_to_n32(
}
}
static void fast_swizzle_gray_to_n32(
        void* dst, const uint8_t* src, int width, int bpp, int deltaSrc, int offset,
        const SkPMColor ctable[]) {
    // Fast path for expanding a row of 8-bit gray into 32-bit pixels.
    // It is only valid when the row is not being sampled, in which case the
    // source stride between pixels (deltaSrc) is exactly the pixel size (bpp).
    SkASSERT(deltaSrc == bpp);

    // Gray replicates one value into every color channel, so the RGBA and
    // BGRA orderings produce identical output and a single routine serves both.
    const uint8_t* grayRow = src + offset;
    uint32_t* dstPixels = static_cast<uint32_t*>(dst);
    SkOpts::gray_to_RGB1(dstPixels, grayRow, width);
}
static void swizzle_gray_to_565(
void* SK_RESTRICT dstRow, const uint8_t* SK_RESTRICT src, int dstWidth,
int bytesPerPixel, int deltaSrc, int offset, const SkPMColor ctable[]) {
@ -639,6 +652,7 @@ SkSwizzler* SkSwizzler::CreateSwizzler(SkSwizzler::SrcConfig sc,
switch (dstInfo.colorType()) {
case kN32_SkColorType:
proc = &swizzle_gray_to_n32;
fastProc = &fast_swizzle_gray_to_n32;
break;
case kGray_8_SkColorType:
proc = &sample1;

View File

@ -84,6 +84,7 @@ namespace SkOpts {
decltype(RGBA_to_bgrA) RGBA_to_bgrA = sk_default::RGBA_to_bgrA;
decltype(RGB_to_RGB1) RGB_to_RGB1 = sk_default::RGB_to_RGB1;
decltype(RGB_to_BGR1) RGB_to_BGR1 = sk_default::RGB_to_BGR1;
decltype(gray_to_RGB1) gray_to_RGB1 = sk_default::gray_to_RGB1;
// Each Init_foo() is defined in src/opts/SkOpts_foo.cpp.
void Init_ssse3();

View File

@ -61,7 +61,8 @@ namespace SkOpts {
RGBA_to_rgbA, // i.e. just premultiply
RGBA_to_bgrA, // i.e. swap RB and premultiply
RGB_to_RGB1, // i.e. insert an opaque alpha
RGB_to_BGR1; // i.e. swap RB and insert an opaque alpha
RGB_to_BGR1, // i.e. swap RB and insert an opaque alpha
gray_to_RGB1; // i.e. set color channels to same value + an opaque alpha
}
#endif//SkOpts_DEFINED

View File

@ -52,5 +52,6 @@ namespace SkOpts {
RGBA_to_bgrA = sk_neon::RGBA_to_bgrA;
RGB_to_RGB1 = sk_neon::RGB_to_RGB1;
RGB_to_BGR1 = sk_neon::RGB_to_BGR1;
gray_to_RGB1 = sk_neon::gray_to_RGB1;
}
}

View File

@ -23,5 +23,6 @@ namespace SkOpts {
RGBA_to_bgrA = sk_ssse3::RGBA_to_bgrA;
RGB_to_RGB1 = sk_ssse3::RGB_to_RGB1;
RGB_to_BGR1 = sk_ssse3::RGB_to_BGR1;
gray_to_RGB1 = sk_ssse3::gray_to_RGB1;
}
}

View File

@ -88,6 +88,16 @@ static void RGB_to_BGR1_portable(uint32_t dst[], const void* vsrc, int count) {
}
}
// Scalar reference implementation: expands `count` 8-bit gray samples read
// from `vsrc` into 32-bit pixels written to `dst`. Each output word carries the
// gray value in its three low bytes and 0xFF (opaque alpha) in the top byte.
static void gray_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* gray = static_cast<const uint8_t*>(vsrc);
    for (int i = 0; i < count; i++) {
        const uint32_t g = gray[i];
        // g <= 0xFF, so multiplying by 0x010101 copies it into bits 0-7, 8-15
        // and 16-23 without any carry between the bytes.
        dst[i] = 0xFF000000u | (g * 0x010101u);
    }
}
#if defined(SK_ARM_HAS_NEON)
// Rounded divide by 255, (x + 127) / 255
@ -260,6 +270,47 @@ static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
insert_alpha_should_swaprb<true>(dst, src, count);
}
static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
const uint8_t* src = (const uint8_t*) vsrc;
while (count >= 16) {
// Load 16 pixels.
uint8x16_t gray = vld1q_u8(src);
// Set each of the color channels.
uint8x16x4_t rgba;
rgba.val[0] = gray;
rgba.val[1] = gray;
rgba.val[2] = gray;
rgba.val[3] = vdupq_n_u8(0xFF);
// Store 16 pixels.
vst4q_u8((uint8_t*) dst, rgba);
src += 16;
dst += 16;
count -= 16;
}
if (count >= 8) {
// Load 8 pixels.
uint8x8_t gray = vld1_u8(src);
// Set each of the color channels.
uint8x8x4_t rgba;
rgba.val[0] = gray;
rgba.val[1] = gray;
rgba.val[2] = gray;
rgba.val[3] = vdup_n_u8(0xFF);
// Store 8 pixels.
vst4_u8((uint8_t*) dst, rgba);
src += 8;
dst += 8;
count -= 8;
}
gray_to_RGB1_portable(dst, src, count);
}
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
template <bool kSwapRB>
@ -401,6 +452,10 @@ static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
insert_alpha_should_swaprb<true>(dst, src, count);
}
// gray_to_RGB1 for the SSSE3 build branch: this branch has no vectorized gray
// expansion of its own and simply forwards to the portable scalar routine.
static void gray_to_RGB1(uint32_t dst[], const void* src, int count) {
gray_to_RGB1_portable(dst, src, count);
}
#else
static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
@ -423,6 +478,10 @@ static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
RGB_to_BGR1_portable(dst, src, count);
}
// gray_to_RGB1 for the final preprocessor branch (presumably the build with no
// NEON or SSSE3 support -- confirm against the full file): forwards to the
// portable scalar routine.
static void gray_to_RGB1(uint32_t dst[], const void* src, int count) {
gray_to_RGB1_portable(dst, src, count);
}
#endif
}