Add SkBlendARGB32_SSE2() to clean up code
Related nanobench results: before: maxrss loops min median mean max stddev samples config bench 10M 2 31.9µs 32.4µs 33.3µs 38.7µs 6% █▄▂▂▂▁▂▁▁▁ 8888 bitmap_BGRA_8888_A_scale_bicubic 10M 13 43.8µs 51.8µs 49.6µs 57.9µs 11% ▁▁▁▁▂▆▇▆▅█ 8888 bitmap_BGRA_8888_A_scale_bilerp 10M 13 23.7µs 24.3µs 26µs 32.7µs 13% ▅█▆▁▁▁▁▂▁▁ 8888 bitmap_Index_8_A 10M 4 1.68µs 1.7µs 4.09µs 25.4µs 183% █▁▁▁▁▁▁▁▁▁ 8888 text_16_AA_88 10M 144 1.76µs 1.77µs 1.78µs 1.81µs 1% █▂▇▂▅▁▁▁▁▁ 8888 text_16_AA_FF 10M 10 4.7µs 5.34µs 5.61µs 8.63µs 21% █▂▂▃▂▁▁▁▁▄ 8888 rotated_rects_aa_alternating_transparent_and_opaque_src 10M 50 4.44µs 4.47µs 4.5µs 4.71µs 2% █▅▃▂▂▂▁▁▁▁ 8888 rotated_rects_aa_changing_opaque_src 10M 51 4.39µs 4.78µs 5.21µs 6.62µs 17% ▁▆▆▇▁▁█▁▂▂ 8888 rotated_rects_aa_same_opaque_src 10M 50 4.47µs 5.79µs 5.43µs 6.14µs 11% ▄▂▁▃▇▇▆▇▇█ 8888 rotated_rects_aa_alternating_transparent_and_opaque_srcover 10M 30 4.35µs 6.06µs 5.84µs 7.63µs 16% ▅▅▅▄▅▅▄█▁▁ 8888 rotated_rects_aa_changing_transparent_srcover 10M 44 4.31µs 4.51µs 4.76µs 6.25µs 13% ▄▂▂▁█▃▁▃▁▁ 8888 rotated_rects_aa_changing_opaque_srcover 10M 46 4.36µs 4.42µs 4.75µs 6.19µs 14% ▆█▃▁▁▁▁▁▁▁ 8888 rotated_rects_aa_same_transparent_srcover 10M 47 4.29µs 4.35µs 4.44µs 5.15µs 6% ▃▂▂▁▁█▁▁▁▁ 8888 rotated_rects_aa_same_opaque_srcover 10M 3 39.1µs 39.2µs 50.7µs 153µs 71% █▁▁▁▁▁▁▁▁▁ 8888 rectori 10M 1 2.3ms 2.31ms 2.35ms 2.74ms 6% ▁▁▁▁▁▁▁▁█▂ 8888 maskcolor 10M 1 2.33ms 2.34ms 2.53ms 3.14ms 11% ▁▁▁▁▁▁▅█▄▄ 8888 maskopaque 10M 11 15µs 15.3µs 15.7µs 18.3µs 7% ▅▃▂▂▁▁▁▁█▁ 8888 rrects_3_stroke_4 10M 46 3.99µs 4.07µs 4.14µs 4.54µs 4% █▅▅▃▂▂▁▁▁▁ 8888 rrects_3 10M 16 15.6µs 15.9µs 16.1µs 17.5µs 4% █▄▃▂▂▂▁▂▁▁ 8888 ovals_3_stroke_4 10M 40 5.09µs 5.18µs 5.23µs 5.67µs 3% █▅▃▂▂▁▃▁▁▁ 8888 ovals_3 10M 231 1.92µs 1.93µs 1.94µs 2µs 1% █▃▂▁▃▁▁▁▁▁ 8888 zeroradroundrect 10M 924 3.88µs 3.93µs 4.11µs 4.95µs 9% ▁█▆▃▁▁▁▁▁▁ 8888 arbroundrect 10M 8 8.11µs 8.47µs 8.48µs 8.85µs 3% █▅▇▄▄▂▁▄▄▆ 8888 merge_large 10M 14 6.71µs 6.92µs 6.96µs 7.46µs 3% ▃▆▁█▃▃▃▂▂▁ 8888 merge_small 11M 2 225µs 227µs 229µs 233µs 1% ███▃▇▂▃▁▃▂ 8888 displacement_full_large 16M 1 381µs 401µs 401µs 421µs 3% ▅▅▅█▆▄▄▃▃▁ 8888 displacement_alpha_large 19M 1 507µs 508µs 509µs 512µs 0% █▃▂▆▂▂▃▂▃▁ 8888 displacement_zero_large 19M 19 9µs 9.11µs 9.15µs 9.67µs 2% ▄▂▂▂█▂▁▁▁▂ 8888 displacement_full_small 19M 5 54.2µs 54.5µs 54.9µs 58µs 2% █▃▂▂▁▁▃▁▁▁ 8888 blurroundrect_WH[100x100]_cr[90] 20M 1 229µs 230µs 231µs 240µs 2% █▄▃▂▂▁▁▁▁▂ 8888 GM_varied_text_clipped_no_lcd 20M 1 267µs 269µs 270µs 279µs 1% █▄▃▂▂▂▂▂▁▁ 8888 GM_varied_text_ignorable_clip_no_lcd 22M 1 1.95ms 1.97ms 2.03ms 2.46ms 8% ▁▁▁▁▁▁▁▂█▃ 8888 GM_convex_poly_clip after: maxrss loops min median mean max stddev samples config bench 10M 2 31.5µs 32.3µs 32.8µs 37.2µs 5% █▄▃▂▂▂▁▁▁▁ 8888 bitmap_BGRA_8888_A_scale_bicubic 10M 13 43.9µs 44µs 44.1µs 44.9µs 1% █▂▁▁▁▆▁▁▁▂ 8888 bitmap_BGRA_8888_A_scale_bilerp 10M 19 22.7µs 23.3µs 25.6µs 32.4µs 14% ▁▁▁▁▁▅▆▁▅█ 8888 bitmap_Index_8_A 10M 5 1.79µs 1.97µs 3.85µs 21.1µs 158% █▁▁▁▁▁▁▁▁▁ 8888 text_16_AA_88 10M 141 1.83µs 1.83µs 1.85µs 1.93µs 2% ▅▁▁█▁▁▁▁▁▁ 8888 text_16_AA_FF 10M 10 4.65µs 4.92µs 5.06µs 6.56µs 11% █▃▃▂▂▂▁▁▁▁ 8888 rotated_rects_aa_alternating_transparent_and_opaque_src 10M 51 4.35µs 4.48µs 4.83µs 6.68µs 17% ▂▁▁▁▁▁▁▂▆█ 8888 rotated_rects_aa_changing_opaque_src 10M 51 4.38µs 4.79µs 4.85µs 5.84µs 11% ▁█▁▃▃▁▄▁▄▇ 8888 rotated_rects_aa_same_opaque_src 10M 32 5.58µs 6.24µs 6.1µs 6.39µs 5% █▂█▆▁▇▄▅▇▇ 8888 rotated_rects_aa_alternating_transparent_and_opaque_srcover 10M 42 4.28µs 5.59µs 5.11µs 6.01µs 15% ▂▂█▇█▂▁▆▁▇ 8888 rotated_rects_aa_changing_transparent_srcover 10M 48 4.24µs 4.33µs 4.58µs 6.46µs 15% ▁▁▁▁▁█▃▂▁▁ 8888 rotated_rects_aa_changing_opaque_srcover 10M 48 4.28µs 4.3µs 4.4µs 5.12µs 6% ▂▂▁▁▁▁▁▁▁█ 8888 rotated_rects_aa_same_transparent_srcover 10M 46 4.24µs 4.29µs 4.66µs 7.11µs 20% ▁▁▁▁▁▁▁▁▃█ 8888 rotated_rects_aa_same_opaque_srcover 10M 3 39.3µs 39.4µs 51.4µs 154µs 70% █▁▁▁▁▁▁▁▁▁ 8888 rectori 10M 1 2.32ms 2.43ms 2.53ms 3.14ms 11% ▁▁▁▁▂▄█▃▅▁ 8888 maskcolor 10M 1 2.33ms 2.37ms 2.54ms 3.21ms 12% ▁▁▁▁▁▂█▅▆▁ 8888 maskopaque 10M 10 15.3µs 15.6µs 15.8µs 17.2µs 4% █▅▃▂▂▂▁▁▁▁ 8888 rrects_3_stroke_4 10M 46 4.03µs 4.09µs 4.15µs 4.47µs 4% █▄▆▂▂▂▁▁▁▁ 8888 rrects_3 10M 15 15.9µs 16.2µs 16.3µs 17.8µs 4% █▄▃▂▂▂▁▁▁▁ 8888 ovals_3_stroke_4 10M 40 5.14µs 5.26µs 5.29µs 5.72µs 3% █▅▃▂▂▁▂▂▁▁ 8888 ovals_3 10M 222 1.91µs 1.99µs 2.21µs 2.91µs 19% ▂▁▁▁▁▁▂▇▇█ 8888 zeroradroundrect 10M 462 3.9µs 3.96µs 4.23µs 5.22µs 12% ▆▄█▁▂▁▁▁▁▁ 8888 arbroundrect 10M 8 8.2µs 8.59µs 8.62µs 8.97µs 3% ▆▄█▄▅▃▁▆▄█ 8888 merge_large 10M 14 6.73µs 6.88µs 6.86µs 7.08µs 2% ▄█▁▂▄▂▅▄▂▅ 8888 merge_small 11M 2 221µs 234µs 237µs 263µs 5% ▄▃▃▃▄▃▂▁▇█ 8888 displacement_full_large 16M 1 387µs 416µs 427µs 471µs 7% ▇█▁▃▃▁▃▃▇▆ 8888 displacement_alpha_large 19M 1 512µs 521µs 528µs 594µs 5% █▂▂▂▁▁▂▃▁▁ 8888 displacement_zero_large 19M 18 9.06µs 9.12µs 9.13µs 9.23µs 1% █▃▃▃▄▃▆▁▅▅ 8888 displacement_full_small 19M 5 55.6µs 55.9µs 56.5µs 59.5µs 2% █▃▂▁▁▁▁▁▅▁ 8888 blurroundrect_WH[100x100]_cr[90] 20M 1 229µs 233µs 235µs 254µs 3% █▄▃▂▂▁▁▂▁▁ 8888 GM_varied_text_clipped_no_lcd 20M 1 270µs 271µs 272µs 278µs 1% █▄▃▂▂▂▁▂▁▇ 8888 GM_varied_text_ignorable_clip_no_lcd 22M 1 1.96ms 2ms 2.06ms 2.45ms 7% ▂▂▁▁▁▁▁▃█▄ 8888 GM_convex_poly_clip BUG=skia: Review URL: https://codereview.chromium.org/754733002
This commit is contained in:
parent
551051c049
commit
2253aa9393
@ -207,74 +207,14 @@ void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
|
||||
count--;
|
||||
}
|
||||
|
||||
uint32_t src_scale = SkAlpha255To256(alpha);
|
||||
|
||||
const __m128i *s = reinterpret_cast<const __m128i*>(src);
|
||||
__m128i *d = reinterpret_cast<__m128i*>(dst);
|
||||
__m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
|
||||
__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
|
||||
__m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)
|
||||
while (count >= 4) {
|
||||
// Load 4 pixels each of src and dest.
|
||||
__m128i src_pixel = _mm_loadu_si128(s);
|
||||
__m128i dst_pixel = _mm_load_si128(d);
|
||||
|
||||
// Get red and blue pixels into lower byte of each word.
|
||||
__m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
|
||||
__m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
|
||||
|
||||
// Get alpha and green into lower byte of each word.
|
||||
__m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
|
||||
__m128i src_ag = _mm_srli_epi16(src_pixel, 8);
|
||||
|
||||
// Put per-pixel alpha in low byte of each word.
|
||||
// After the following two statements, the dst_alpha looks like
|
||||
// (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
|
||||
__m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
|
||||
dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
|
||||
|
||||
// dst_alpha = dst_alpha * src_scale
|
||||
// Because src_scales are in the higher byte of each word and
|
||||
// we use mulhi here, the resulting alpha values are already
|
||||
// in the right place and don't need to be divided by 256.
|
||||
// (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
|
||||
dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
|
||||
|
||||
// Subtract alphas from 256, to get 1..256
|
||||
dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
|
||||
|
||||
// Multiply red and blue by dst pixel alpha.
|
||||
dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
|
||||
// Multiply alpha and green by dst pixel alpha.
|
||||
dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
|
||||
|
||||
// Multiply red and blue by global alpha.
|
||||
// (4 x (0, rs.h, 0, bs.h))
|
||||
// where rs.h stands for the higher byte of r * src_scale,
|
||||
// and bs.h the higher byte of b * src_scale.
|
||||
// Again, because we use mulhi, the resuling red and blue
|
||||
// values are already in the right place and don't need to
|
||||
// be divided by 256.
|
||||
src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
|
||||
// Multiply alpha and green by global alpha.
|
||||
// (4 x (0, as.h, 0, gs.h))
|
||||
src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
|
||||
|
||||
// Divide by 256.
|
||||
dst_rb = _mm_srli_epi16(dst_rb, 8);
|
||||
|
||||
// Mask out low bits (goodies already in the right place; no need to divide)
|
||||
dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
|
||||
// Shift alpha and green to higher byte of each word.
|
||||
// (4 x (as.h, 0, gs.h, 0))
|
||||
src_ag = _mm_slli_epi16(src_ag, 8);
|
||||
|
||||
// Combine back into RGBA.
|
||||
dst_pixel = _mm_or_si128(dst_rb, dst_ag);
|
||||
src_pixel = _mm_or_si128(src_rb, src_ag);
|
||||
|
||||
// Add two pixels into result.
|
||||
__m128i result = _mm_add_epi8(src_pixel, dst_pixel);
|
||||
__m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
|
||||
_mm_store_si128(d, result);
|
||||
s++;
|
||||
d++;
|
||||
@ -367,73 +307,24 @@ void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
|
||||
count--;
|
||||
}
|
||||
__m128i *d = reinterpret_cast<__m128i*>(dst);
|
||||
__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
|
||||
__m128i c_256 = _mm_set1_epi16(256);
|
||||
__m128i c_1 = _mm_set1_epi16(1);
|
||||
__m128i src_pixel = _mm_set1_epi32(color);
|
||||
while (count >= 4) {
|
||||
// Load 4 pixels each of src and dest.
|
||||
// Load 4 dst pixels
|
||||
__m128i dst_pixel = _mm_load_si128(d);
|
||||
|
||||
//set the aphla value
|
||||
__m128i src_scale_wide = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(mask));
|
||||
src_scale_wide = _mm_unpacklo_epi8(src_scale_wide,
|
||||
_mm_setzero_si128());
|
||||
src_scale_wide = _mm_unpacklo_epi16(src_scale_wide, src_scale_wide);
|
||||
// Set the alpha value
|
||||
__m128i alpha_wide = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(mask));
|
||||
alpha_wide = _mm_unpacklo_epi8(alpha_wide, _mm_setzero_si128());
|
||||
alpha_wide = _mm_unpacklo_epi16(alpha_wide, _mm_setzero_si128());
|
||||
|
||||
//call SkAlpha255To256()
|
||||
src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
|
||||
|
||||
// Get red and blue pixels into lower byte of each word.
|
||||
__m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
|
||||
__m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
|
||||
|
||||
// Get alpha and green into lower byte of each word.
|
||||
__m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
|
||||
__m128i src_ag = _mm_srli_epi16(src_pixel, 8);
|
||||
|
||||
// Put per-pixel alpha in low byte of each word.
|
||||
__m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
|
||||
dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
|
||||
|
||||
// dst_alpha = dst_alpha * src_scale
|
||||
dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
|
||||
|
||||
// Divide by 256.
|
||||
dst_alpha = _mm_srli_epi16(dst_alpha, 8);
|
||||
|
||||
// Subtract alphas from 256, to get 1..256
|
||||
dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
|
||||
// Multiply red and blue by dst pixel alpha.
|
||||
dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
|
||||
// Multiply alpha and green by dst pixel alpha.
|
||||
dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
|
||||
|
||||
// Multiply red and blue by global alpha.
|
||||
src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
|
||||
// Multiply alpha and green by global alpha.
|
||||
src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
|
||||
// Divide by 256.
|
||||
dst_rb = _mm_srli_epi16(dst_rb, 8);
|
||||
src_rb = _mm_srli_epi16(src_rb, 8);
|
||||
|
||||
// Mask out low bits (goodies already in the right place; no need to divide)
|
||||
dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
|
||||
src_ag = _mm_andnot_si128(rb_mask, src_ag);
|
||||
|
||||
// Combine back into RGBA.
|
||||
dst_pixel = _mm_or_si128(dst_rb, dst_ag);
|
||||
__m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
|
||||
|
||||
// Add two pixels into result.
|
||||
__m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
|
||||
__m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha_wide);
|
||||
_mm_store_si128(d, result);
|
||||
// load the next 4 pixel
|
||||
// Load the next 4 dst pixels and alphas
|
||||
mask = mask + 4;
|
||||
d++;
|
||||
count -= 4;
|
||||
}
|
||||
dst = reinterpret_cast<SkPMColor *>(d);
|
||||
dst = reinterpret_cast<SkPMColor*>(d);
|
||||
}
|
||||
while (count > 0) {
|
||||
*dst= SkBlendARGB32(color, *dst, *mask);
|
||||
|
@ -203,5 +203,34 @@ static inline __m128i SkPixel32ToPixel16_ToU16_SSE2(const __m128i& src_pixel1,
|
||||
return d_pixel;
|
||||
}
|
||||
|
||||
// Portable version SkBlendARGB32 is in SkColorPriv.h.
|
||||
static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst,
|
||||
const __m128i& aa) {
|
||||
__m128i src_scale = SkAlpha255To256_SSE2(aa);
|
||||
// SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale))
|
||||
__m128i dst_scale = SkGetPackedA32_SSE2(src);
|
||||
dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
|
||||
dst_scale = _mm_srli_epi16(dst_scale, 8);
|
||||
dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale);
|
||||
|
||||
__m128i result = SkAlphaMulQ_SSE2(src, src_scale);
|
||||
return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale));
|
||||
}
|
||||
|
||||
// Fast path for SkBlendARGB32_SSE2 with a constant alpha factor.
|
||||
static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst,
|
||||
const unsigned aa) {
|
||||
unsigned alpha = SkAlpha255To256(aa);
|
||||
__m128i src_scale = _mm_set1_epi32(alpha);
|
||||
// SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale))
|
||||
__m128i dst_scale = SkGetPackedA32_SSE2(src);
|
||||
dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
|
||||
dst_scale = _mm_srli_epi16(dst_scale, 8);
|
||||
dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale);
|
||||
|
||||
__m128i result = SkAlphaMulQ_SSE2(src, alpha);
|
||||
return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale));
|
||||
}
|
||||
|
||||
#undef ASSERT_EQ
|
||||
#endif // SkColor_opts_SSE2_DEFINED
|
||||
|
Loading…
Reference in New Issue
Block a user