ARM Skia NEON patches - 13 - S32A_Opaque

Blitrow32: S32A_Opaque code cleanup and speed improvement

- the old way of calculating alpha doesn't seem to be used anymore,
  so remove the remaining code
- adding prefetching greatly improves performance in some cases, at
  the cost of a slight slowdown for small values of count:

+-------+-----------+------------+
| count | Cortex-A9 | Cortex-A15 |
+-------+-----------+------------+
| 1,2   | 0         | 0          |
+-------+-----------+------------+
| 4     | 0         | -3%        |
+-------+-----------+------------+
| 8     | 0         | -4%        |
+-------+-----------+------------+
| 16    | 0         | -5%        |
+-------+-----------+------------+
| 64    | +14%      | 0          |
+-------+-----------+------------+
| 256   | +14%      | +12%       |
+-------+-----------+------------+
| 1024  | +115%     | +15%       |
+-------+-----------+------------+

BUG=
R=djsollen@google.com

Author: kevin.petit.arm@gmail.com

Review URL: https://chromiumcodereview.appspot.com/18459008

git-svn-id: http://skia.googlecode.com/svn/trunk@10026 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
commit-bot@chromium.org 2013-07-11 20:28:24 +00:00
parent 66ba9f978f
commit 0a5699ee48

View File

@ -426,6 +426,13 @@ void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
uint8x8_t src_raw, dst_raw, dst_final;
uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
/* The two prefetches below may make the code slightly
* slower for small values of count but are worth having
* in the general case.
*/
__builtin_prefetch(src+32);
__builtin_prefetch(dst+32);
/* get the source */
src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if UNROLL > 2
@ -447,14 +454,7 @@ void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
/* get the alphas spread out properly */
alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
#if 1
/* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
/* we collapsed (255-a)+1 ... */
alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif
/* spread the dest */
dst_wide = vmovl_u8(dst_raw);
@ -476,14 +476,7 @@ void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
uint16x8_t alpha_wide;
alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
#if 1
/* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
/* we collapsed (255-a)+1 ... */
alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif
/* spread the dest */
dst_wide = vmovl_u8(dst_raw_2);