SSE optimizations for GrayAlpha -> RGBA/BGRA Premul/Unpremul

Swizzle Runtime (Dell Venue 8)
Unpremul 0.17x
Premul   0.20x

PNG Decode Runtime on GrayAlpha Encoded PNGs (Dell Venue 8)
Unpremul Regular  0.91x
Unpremul ZeroInit 0.92x
Premul   Regular  0.84x
Premul   ZeroInit 0.85x

BUG=skia:4767
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1666853002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review URL: https://codereview.chromium.org/1666853002
This commit is contained in:
msarett 2016-02-03 15:28:35 -08:00 committed by Commit bot
parent de3a726ad3
commit 095742419d
2 changed files with 61 additions and 8 deletions

View File

@ -33,3 +33,5 @@ DEF_BENCH(return new SwizzleBench("SkOpts::RGBA_to_BGRA", SkOpts::RGBA_to_BGRA))
DEF_BENCH(return new SwizzleBench("SkOpts::RGB_to_RGB1", SkOpts::RGB_to_RGB1)); DEF_BENCH(return new SwizzleBench("SkOpts::RGB_to_RGB1", SkOpts::RGB_to_RGB1));
DEF_BENCH(return new SwizzleBench("SkOpts::RGB_to_BGR1", SkOpts::RGB_to_BGR1)); DEF_BENCH(return new SwizzleBench("SkOpts::RGB_to_BGR1", SkOpts::RGB_to_BGR1));
DEF_BENCH(return new SwizzleBench("SkOpts::gray_to_RGB1", SkOpts::gray_to_RGB1)); DEF_BENCH(return new SwizzleBench("SkOpts::gray_to_RGB1", SkOpts::gray_to_RGB1));
// SwizzleBench cases for the two gray+alpha swizzles, SkOpts::grayA_to_RGBA and SkOpts::grayA_to_rgbA.
DEF_BENCH(return new SwizzleBench("SkOpts::grayA_to_RGBA", SkOpts::grayA_to_RGBA));
DEF_BENCH(return new SwizzleBench("SkOpts::grayA_to_rgbA", SkOpts::grayA_to_rgbA));

View File

@ -403,14 +403,22 @@ static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
// Multiply two byte-sized values and divide by 255 with rounding, lane by lane.
// Each 16-bit lane of x and y must hold a value no larger than 255.
static __m128i scale(__m128i x, __m128i y) {
    // Per-lane product; at most 255*255, which still fits an unsigned 16-bit lane.
    __m128i product = _mm_mullo_epi16(x, y);
    // (p+127)/255 == ((p+128)*257)>>16 for 0 <= p <= 255*255.
    __m128i biased = _mm_add_epi16(product, _mm_set1_epi16(128));
    // Keeping the high 16 bits of the unsigned multiply performs the >>16.
    return _mm_mulhi_epu16(biased, _mm_set1_epi16(257));
}
template <bool kSwapRB> template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
auto src = (const uint32_t*)vsrc; auto src = (const uint32_t*)vsrc;
auto premul8 = [](__m128i* lo, __m128i* hi) { auto premul8 = [](__m128i* lo, __m128i* hi) {
const __m128i zeros = _mm_setzero_si128(); const __m128i zeros = _mm_setzero_si128();
const __m128i _128 = _mm_set1_epi16(128);
const __m128i _257 = _mm_set1_epi16(257);
__m128i planar; __m128i planar;
if (kSwapRB) { if (kSwapRB) {
planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
@ -430,10 +438,10 @@ static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_ b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_
a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_ a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_
// Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. // Premultiply!
r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257); r = scale(r, a);
g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257); g = scale(g, a);
b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257); b = scale(b, a);
// Repack into interlaced pixels. // Repack into interlaced pixels.
rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG
@ -572,11 +580,54 @@ static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
gray_to_RGB1_portable(dst, src, count); gray_to_RGB1_portable(dst, src, count);
} }
// Expands (gray, alpha) byte pairs from the source into 32-bit pixels whose bytes,
// in memory order, are gray, gray, gray, alpha. Alpha is copied through unchanged.
// Eight pixels are converted per loop iteration; whatever is left over is passed to
// grayA_to_RGBA_portable.
// NOTE(review): lines below that show two copies of the same statement look like the
// old and new columns of a side-by-side diff merged onto one line.
static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) { static void grayA_to_RGBA(uint32_t dst[], const void* vsrc, int count) {
const uint8_t* src = (const uint8_t*) vsrc;
while (count >= 8) {
// Unaligned load of 16 bytes: 8 pixels, 2 bytes each.
__m128i ga = _mm_loadu_si128((const __m128i*) src);
// Keep the low byte of every 16-bit lane (gray) and duplicate it into the high byte.
__m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
_mm_slli_epi16(ga, 8));
// Interleave 16-bit lanes of gg and ga so each 32-bit pixel is gray,gray,gray,alpha.
__m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
__m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
// Two unaligned stores of 4 pixels each.
_mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
_mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);
// Advance past the 8 pixels just written: 16 source bytes, 8 destination words.
src += 8*2;
dst += 8;
count -= 8;
}
// Fewer than 8 pixels remain at this point.
grayA_to_RGBA_portable(dst, src, count); grayA_to_RGBA_portable(dst, src, count);
} }
// Expands (gray, alpha) byte pairs from the source into premultiplied 32-bit pixels:
// gray is first scaled by alpha via scale(), then written three times, followed by
// the unmodified alpha byte. Eight pixels are converted per loop iteration; whatever
// is left over is passed to grayA_to_rgbA_portable.
// NOTE(review): lines below that show two copies of the same statement look like the
// old and new columns of a side-by-side diff merged onto one line.
static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) { static void grayA_to_rgbA(uint32_t dst[], const void* vsrc, int count) {
const uint8_t* src = (const uint8_t*) vsrc;
while (count >= 8) {
// Unaligned load of 16 bytes: 8 pixels, 2 bytes each.
__m128i grayA = _mm_loadu_si128((const __m128i*) src);
// Split each 16-bit lane: low byte is gray, high byte is alpha, both zero-extended.
__m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
__m128i a0 = _mm_srli_epi16(grayA, 8);
// Premultiply
g0 = scale(g0, a0);
// Build the (gray, gray) and (gray, alpha) halves of each output pixel.
__m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
__m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));
// Interleave 16-bit lanes so each 32-bit pixel is gray,gray,gray,alpha.
__m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
__m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
// Two unaligned stores of 4 pixels each.
_mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
_mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);
// Advance past the 8 pixels just written: 16 source bytes, 8 destination words.
src += 8*2;
dst += 8;
count -= 8;
}
// Fewer than 8 pixels remain at this point.
grayA_to_rgbA_portable(dst, src, count); grayA_to_rgbA_portable(dst, src, count);
} }