diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp index 17cd20a1a2..7b9c043835 100644 --- a/src/opts/SkBlitRow_opts_SSE2.cpp +++ b/src/opts/SkBlitRow_opts_SSE2.cpp @@ -207,74 +207,14 @@ void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, count--; } - uint32_t src_scale = SkAlpha255To256(alpha); - const __m128i *s = reinterpret_cast(src); __m128i *d = reinterpret_cast<__m128i*>(dst); - __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); - __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); - __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) while (count >= 4) { // Load 4 pixels each of src and dest. __m128i src_pixel = _mm_loadu_si128(s); __m128i dst_pixel = _mm_load_si128(d); - // Get red and blue pixels into lower byte of each word. - __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); - __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); - - // Get alpha and green into lower byte of each word. - __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); - __m128i src_ag = _mm_srli_epi16(src_pixel, 8); - - // Put per-pixel alpha in low byte of each word. - // After the following two statements, the dst_alpha looks like - // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3) - __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); - dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); - - // dst_alpha = dst_alpha * src_scale - // Because src_scales are in the higher byte of each word and - // we use mulhi here, the resulting alpha values are already - // in the right place and don't need to be divided by 256. - // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3) - dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide); - - // Subtract alphas from 256, to get 1..256 - dst_alpha = _mm_sub_epi16(c_256, dst_alpha); - - // Multiply red and blue by dst pixel alpha. - dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); - // Multiply alpha and green by dst pixel alpha. - dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); - - // Multiply red and blue by global alpha. - // (4 x (0, rs.h, 0, bs.h)) - // where rs.h stands for the higher byte of r * src_scale, - // and bs.h the higher byte of b * src_scale. - // Again, because we use mulhi, the resuling red and blue - // values are already in the right place and don't need to - // be divided by 256. - src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); - // Multiply alpha and green by global alpha. - // (4 x (0, as.h, 0, gs.h)) - src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); - - // Divide by 256. - dst_rb = _mm_srli_epi16(dst_rb, 8); - - // Mask out low bits (goodies already in the right place; no need to divide) - dst_ag = _mm_andnot_si128(rb_mask, dst_ag); - // Shift alpha and green to higher byte of each word. - // (4 x (as.h, 0, gs.h, 0)) - src_ag = _mm_slli_epi16(src_ag, 8); - - // Combine back into RGBA. - dst_pixel = _mm_or_si128(dst_rb, dst_ag); - src_pixel = _mm_or_si128(src_rb, src_ag); - - // Add two pixels into result. - __m128i result = _mm_add_epi8(src_pixel, dst_pixel); + __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha); _mm_store_si128(d, result); s++; d++; @@ -367,73 +307,24 @@ void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr, count--; } __m128i *d = reinterpret_cast<__m128i*>(dst); - __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); - __m128i c_256 = _mm_set1_epi16(256); - __m128i c_1 = _mm_set1_epi16(1); __m128i src_pixel = _mm_set1_epi32(color); while (count >= 4) { - // Load 4 pixels each of src and dest. + // Load 4 dst pixels __m128i dst_pixel = _mm_load_si128(d); - //set the aphla value - __m128i src_scale_wide = _mm_cvtsi32_si128(*reinterpret_cast(mask)); - src_scale_wide = _mm_unpacklo_epi8(src_scale_wide, - _mm_setzero_si128()); - src_scale_wide = _mm_unpacklo_epi16(src_scale_wide, src_scale_wide); + // Set the alpha value + __m128i alpha_wide = _mm_cvtsi32_si128(*reinterpret_cast(mask)); + alpha_wide = _mm_unpacklo_epi8(alpha_wide, _mm_setzero_si128()); + alpha_wide = _mm_unpacklo_epi16(alpha_wide, _mm_setzero_si128()); - //call SkAlpha255To256() - src_scale_wide = _mm_add_epi16(src_scale_wide, c_1); - - // Get red and blue pixels into lower byte of each word. - __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); - __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); - - // Get alpha and green into lower byte of each word. - __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); - __m128i src_ag = _mm_srli_epi16(src_pixel, 8); - - // Put per-pixel alpha in low byte of each word. - __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); - dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); - - // dst_alpha = dst_alpha * src_scale - dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); - - // Divide by 256. - dst_alpha = _mm_srli_epi16(dst_alpha, 8); - - // Subtract alphas from 256, to get 1..256 - dst_alpha = _mm_sub_epi16(c_256, dst_alpha); - // Multiply red and blue by dst pixel alpha. - dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); - // Multiply alpha and green by dst pixel alpha. - dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); - - // Multiply red and blue by global alpha. - src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); - // Multiply alpha and green by global alpha. - src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); - // Divide by 256. - dst_rb = _mm_srli_epi16(dst_rb, 8); - src_rb = _mm_srli_epi16(src_rb, 8); - - // Mask out low bits (goodies already in the right place; no need to divide) - dst_ag = _mm_andnot_si128(rb_mask, dst_ag); - src_ag = _mm_andnot_si128(rb_mask, src_ag); - - // Combine back into RGBA. - dst_pixel = _mm_or_si128(dst_rb, dst_ag); - __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag); - - // Add two pixels into result. - __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel); + __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha_wide); _mm_store_si128(d, result); - // load the next 4 pixel + // Load the next 4 dst pixels and alphas mask = mask + 4; d++; count -= 4; } - dst = reinterpret_cast(d); + dst = reinterpret_cast(d); } while (count > 0) { *dst= SkBlendARGB32(color, *dst, *mask); diff --git a/src/opts/SkColor_opts_SSE2.h b/src/opts/SkColor_opts_SSE2.h index c1e41f02ae..c52fc1e876 100644 --- a/src/opts/SkColor_opts_SSE2.h +++ b/src/opts/SkColor_opts_SSE2.h @@ -203,5 +203,34 @@ static inline __m128i SkPixel32ToPixel16_ToU16_SSE2(const __m128i& src_pixel1, return d_pixel; } +// Portable version SkBlendARGB32 is in SkColorPriv.h. +static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst, + const __m128i& aa) { + __m128i src_scale = SkAlpha255To256_SSE2(aa); + // SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale)) + __m128i dst_scale = SkGetPackedA32_SSE2(src); + dst_scale = _mm_mullo_epi16(dst_scale, src_scale); + dst_scale = _mm_srli_epi16(dst_scale, 8); + dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale); + + __m128i result = SkAlphaMulQ_SSE2(src, src_scale); + return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale)); +} + +// Fast path for SkBlendARGB32_SSE2 with a constant alpha factor. +static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst, + const unsigned aa) { + unsigned alpha = SkAlpha255To256(aa); + __m128i src_scale = _mm_set1_epi32(alpha); + // SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale)) + __m128i dst_scale = SkGetPackedA32_SSE2(src); + dst_scale = _mm_mullo_epi16(dst_scale, src_scale); + dst_scale = _mm_srli_epi16(dst_scale, 8); + dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale); + + __m128i result = SkAlphaMulQ_SSE2(src, alpha); + return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale)); +} + #undef ASSERT_EQ #endif // SkColor_opts_SSE2_DEFINED