Implementing Color32 functions for Neon platforms.

Besides the raw processing improvement provided by Neon, the code uses memory preteches (pld) which seem to improve performance greatly when dealing with very large counts. This was tested using bench where color32 accounts for the majority of the workload: bench -match rects_1 -config 8888 -repeat 500 -forceBlend 1 (the forceBlend is there so that the Color32 code does not go through the special cases where alpha == 0xFF as it would transform color32 into a sk_memset32. Numbers averaged over 3 runs: bench name | Before | Neon, no pld | Neon with pld | full boost rrects_1 | 153.9 | 128.3 | 92 | 1.66x rects_1_stroke_4| 32.8 | 31.4 | 28.45 | 1.15x rects_1 | 125.35 | 97.2 | 63.59 | 1.97x Credits: various googletv team members. Committed on behalf of evannier. Review URL: http://codereview.appspot.com/5569077/ git-svn-id: http://skia.googlecode.com/svn/trunk@4779 2bbb7eff-a529-9590-31e7-b0007b416f81
2012-07-26 14:20:13 +00:00 · 2012-07-26 14:20:13 +00:00 · 84d6715153
commit 84d6715153
parent eb9568a13d
2 changed files with 145 additions and 6 deletions
--- a/src/opts/SkBlitRow_opts_arm.cpp
+++ b/src/opts/SkBlitRow_opts_arm.cpp
@ -1,15 +1,18 @@
 /*
- * Copyright 2009 The Android Open Source Project
+ * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */


-#include "SkBlitRow.h"
 #include "SkBlitMask.h"
+#include "SkBlitRow.h"
 #include "SkColorPriv.h"
 #include "SkDither.h"
+#include "SkUtils.h"
+
+#include "SkCachePreload_arm.h"

 #if defined(__ARM_HAVE_NEON)
 #include <arm_neon.h>
@ -1256,6 +1259,105 @@ static void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
 #define	S32_D565_Opaque_Dither_PROC NULL
 #endif

+#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
+static void Color32_neon(SkPMColor* dst, const SkPMColor* src, int count,
+                         SkPMColor color) {
+    if (count <= 0) {
+        return;
+    }
+
+    if (0 == color) {
+        if (src != dst) {
+            memcpy(dst, src, count * sizeof(SkPMColor));
+        }
+        return;
+    }
+
+    unsigned colorA = SkGetPackedA32(color);
+    if (255 == colorA) {
+        sk_memset32(dst, color, count);
+    } else {
+        unsigned scale = 256 - SkAlpha255To256(colorA);
+
+        if (count >= 8) {
+            // at the end of this assembly, count will have been decremented
+            // to a negative value. That is, if count mod 8 = x, it will be
+            // -8 +x coming out.
+            asm volatile (
+                PLD128(src, 0)
+
+                "vdup.32    q0, %[color]                \n\t"
+
+                PLD128(src, 128)
+
+                // scale numerical interval [0-255], so load as 8 bits
+                "vdup.8     d2, %[scale]                \n\t"
+
+                PLD128(src, 256)
+
+                "subs       %[count], %[count], #8      \n\t"
+
+                PLD128(src, 384)
+
+                "Loop_Color32:                          \n\t"
+
+                // load src color, 8 pixels, 4 64 bit registers
+                // (and increment src).
+                "vld1.32    {d4-d7}, [%[src]]!          \n\t"
+
+                PLD128(src, 384)
+
+                // multiply long by scale, 64 bits at a time,
+                // destination into a 128 bit register.
+                "vmull.u8   q4, d4, d2                  \n\t"
+                "vmull.u8   q5, d5, d2                  \n\t"
+                "vmull.u8   q6, d6, d2                  \n\t"
+                "vmull.u8   q7, d7, d2                  \n\t"
+
+                // shift the 128 bit registers, containing the 16
+                // bit scaled values back to 8 bits, narrowing the
+                // results to 64 bit registers.
+                "vshrn.i16  d8, q4, #8                  \n\t"
+                "vshrn.i16  d9, q5, #8                  \n\t"
+                "vshrn.i16  d10, q6, #8                 \n\t"
+                "vshrn.i16  d11, q7, #8                 \n\t"
+
+                // adding back the color, using 128 bit registers.
+                "vadd.i8    q6, q4, q0                  \n\t"
+                "vadd.i8    q7, q5, q0                  \n\t"
+
+                // store back the 8 calculated pixels (2 128 bit
+                // registers), and increment dst.
+                "vst1.32    {d12-d15}, [%[dst]]!        \n\t"
+
+                "subs       %[count], %[count], #8      \n\t"
+                "bge        Loop_Color32                \n\t"
+                : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
+                : [color] "r" (color), [scale] "r" (scale)
+                : "cc", "memory",
+                  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+                  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
+                          );
+            // At this point, if we went through the inline assembly, count is
+            // a negative value:
+            // if the value is -8, there is no pixel left to process.
+            // if the value is -7, there is one pixel left to process
+            // ...
+            // And'ing it with 7 will give us the number of pixels
+            // left to process.
+            count = count & 0x7;
+        }
+
+        while (count > 0) {
+            *dst = color + SkAlphaMulQ(*src, scale);
+            src += 1;
+            dst += 1;
+            count--;
+        }
+    }
+}
+#endif
+
 ///////////////////////////////////////////////////////////////////////////////

 static const SkBlitRow::Proc platform_565_procs[] = {
@ -1305,11 +1407,14 @@ SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
    return platform_32_procs[flags];
 }

-SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
-    return NULL;
-}
-
 ///////////////////////////////////////////////////////////////////////////////
+SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
+#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
+    return Color32_neon;
+#else
+    return NULL;
+#endif
+}

 SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig,
                                                     SkMask::Format maskFormat,
--- a/src/opts/SkCachePreload_arm.h
+++ b/src/opts/SkCachePreload_arm.h
@ -0,0 +1,34 @@
+/*
+ * Copyright 2012 The Android Open Source Project
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+#ifndef SkCachePreload_arm_DEFINED
+#define SkCachePreload_arm_DEFINED
+
+// This file defines macros for preload instructions for ARM. These macros
+// are designed to be embedded inside GNU inline assembly.
+// For the use of these macros, __ARM_USE_PLD needs to be enabled. The cache
+// line size also needs to be known (and needs to be contained inside
+// __ARM_CACHE_LINE_SIZE).
+#if defined(__ARM_USE_PLD)
+
+#define PLD(x, n)           "pld        [%["#x"], #("#n")]\n\t"
+
+#if __ARM_CACHE_LINE_SIZE == 32
+    #define PLD64(x, n)      PLD(x, n) PLD(x, (n) + 32)
+#elif __ARM_CACHE_LINE_SIZE == 64
+    #define PLD64(x, n)      PLD(x, n)
+#else
+    #error "unknown __ARM_CACHE_LINE_SIZE."
+#endif
+#else
+    // PLD is disabled, all macros become empty.
+    #define PLD(x, n)
+    #define PLD64(x, n)
+#endif
+
+#define PLD128(x, n)         PLD64(x, n) PLD64(x, (n) + 64)
+
+#endif  // SkCachePreload_arm_DEFINED