ARM Skia NEON patches - 13 - S32A_Opaque

Blitrow32: S32A_Opaque code cleaning and speed improvement

- The old way of calculating alpha doesn't seem to be used anymore, so remove the remaining code.
- Adding prefetching improves performance greatly in some cases, at the expense of a small trade-off for small counts:

+-------+-----------+------------+
| count | Cortex-A9 | Cortex-A15 |
+-------+-----------+------------+
| 1,2   |     0     |     0      |
+-------+-----------+------------+
| 4     |     0     |    -3%     |
+-------+-----------+------------+
| 8     |     0     |    -4%     |
+-------+-----------+------------+
| 16    |     0     |    -5%     |
+-------+-----------+------------+
| 64    |   +14%    |     0      |
+-------+-----------+------------+
| 256   |   +14%    |   +12%     |
+-------+-----------+------------+
| 1024  |  +115%    |   +15%     |
+-------+-----------+------------+

BUG=
R=djsollen@google.com
Author: kevin.petit.arm@gmail.com

Review URL: https://chromiumcodereview.appspot.com/18459008

git-svn-id: http://skia.googlecode.com/svn/trunk@10026 2bbb7eff-a529-9590-31e7-b0007b416f81
2013-07-11 20:28:24 +00:00 · 2013-07-11 20:28:24 +00:00 · 0a5699ee48
commit 0a5699ee48
parent 66ba9f978f
1 changed files with 7 additions and 14 deletions
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
@@ -426,6 +426,13 @@ void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
        uint8x8_t src_raw, dst_raw, dst_final;
        uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

+        /* The two prefetches below may make the code slightly
+         * slower for small values of count but are worth having
+         * in the general case.
+         */
+        __builtin_prefetch(src+32);
+        __builtin_prefetch(dst+32);
+
        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
 #if    UNROLL > 2
@@ -447,14 +454,7 @@ void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
-#if 1
-        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
-        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-#else
-        alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
-        alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
-#endif

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);
@@ -476,14 +476,7 @@ void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
        uint16x8_t alpha_wide;

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
-#if 1
-        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
-        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
-#else
-        alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
-        alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
-#endif

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);