Unroll loops in SkBlurMask for speedup on Windows (benchmarks should see
15% on interpolated blurs, 5-10% on simple blurs). git-svn-id: http://skia.googlecode.com/svn/trunk@2755 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
parent
5e12770cb0
commit
01224d5d0a
@ -80,6 +80,19 @@ static inline void SkEndianSwap32s(uint32_t array[], int count) {
|
||||
#define SkEndian_SwapLE32(n) SkEndianSwap32(n)
|
||||
#endif
|
||||
|
||||
// When a bytestream is embedded in a 32-bit word, how far we need to
|
||||
// shift the word to extract each byte from the low 8 bits by anding with 0xff.
|
||||
#ifdef SK_CPU_LENDIAN
|
||||
#define SkEndian_Byte0Shift 0
|
||||
#define SkEndian_Byte1Shift 8
|
||||
#define SkEndian_Byte2Shift 16
|
||||
#define SkEndian_Byte3Shift 24
|
||||
#else // SK_CPU_BENDIAN
|
||||
#define SkEndian_Byte0Shift 24
|
||||
#define SkEndian_Byte1Shift 16
|
||||
#define SkEndian_Byte2Shift 8
|
||||
#define SkEndian_Byte3Shift 0
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -216,6 +216,8 @@ static inline bool SkIsU16(long x) {
|
||||
*/
|
||||
#define SkAlign4(x) (((x) + 3) >> 2 << 2)
|
||||
|
||||
#define SkIsAlign4(x) (((x) & 3) == 0)
|
||||
|
||||
typedef uint32_t SkFourByteTag;
|
||||
#define SkSetFourByteTag(a, b, c, d) (((a) << 24) | ((b) << 16) | ((c) << 8) | (d))
|
||||
|
||||
|
@ -55,7 +55,7 @@
|
||||
* n is already a multiple of 4
|
||||
*/
|
||||
#define GrALIGN4(n) SkAlign4(n)
|
||||
#define GrIsALIGN4(n) (((n) & 3) == 0)
|
||||
#define GrIsALIGN4(n) SkIsAlign4(n)
|
||||
|
||||
template <typename T> const T& GrMin(const T& a, const T& b) {
|
||||
return (a < b) ? a : b;
|
||||
|
@ -10,6 +10,15 @@
|
||||
#include "SkBlurMask.h"
|
||||
#include "SkMath.h"
|
||||
#include "SkTemplates.h"
|
||||
#include "SkEndian.h"
|
||||
|
||||
// Unrolling the integer blur kernel seems to give us a ~15% speedup on Windows,
|
||||
// breakeven on Mac, and ~15% slowdown on Linux.
|
||||
// Reading a word at a time when building the sum buffer seems to give
|
||||
// us no appreciable speedup on Windows or Mac, and 2% slowdown on Linux.
|
||||
#if defined(BUILD_FOR_WIN_32)
|
||||
#define UNROLL_KERNEL_LOOP 1
|
||||
#endif
|
||||
|
||||
/** The sum buffer is an array of u32 to hold the accumulated sum of all of the
|
||||
src values at their position, plus all values above and to the left.
|
||||
@ -49,7 +58,39 @@ static void build_sum_buffer(uint32_t sum[], int srcW, int srcH,
|
||||
uint32_t L = 0;
|
||||
uint32_t C = 0;
|
||||
*sum++ = 0; // initialize the first column to 0
|
||||
for (x = srcW - 1; x >= 0; --x) {
|
||||
|
||||
for (x = srcW - 1; !SkIsAlign4((intptr_t) src) && x >= 0; x--) {
|
||||
uint32_t T = sum[-sumW];
|
||||
X = *src++ + L + T - C;
|
||||
*sum++ = X;
|
||||
L = X;
|
||||
C = T;
|
||||
}
|
||||
|
||||
for (; x >= 4; x-=4) {
|
||||
uint32_t T = sum[-sumW];
|
||||
X = *src++ + L + T - C;
|
||||
*sum++ = X;
|
||||
L = X;
|
||||
C = T;
|
||||
T = sum[-sumW];
|
||||
X = *src++ + L + T - C;
|
||||
*sum++ = X;
|
||||
L = X;
|
||||
C = T;
|
||||
T = sum[-sumW];
|
||||
X = *src++ + L + T - C;
|
||||
*sum++ = X;
|
||||
L = X;
|
||||
C = T;
|
||||
T = sum[-sumW];
|
||||
X = *src++ + L + T - C;
|
||||
*sum++ = X;
|
||||
L = X;
|
||||
C = T;
|
||||
}
|
||||
|
||||
for (; x >= 0; --x) {
|
||||
uint32_t T = sum[-sumW];
|
||||
X = *src++ + L + T - C;
|
||||
*sum++ = X;
|
||||
@ -86,8 +127,6 @@ static void kernel_clamped(uint8_t dst[], int rx, int ry, const uint32_t sum[],
|
||||
int next_x = 1;
|
||||
|
||||
for (int x = 0; x < dw; x++) {
|
||||
//int px = SkClampPos(prev_x);
|
||||
//int nx = SkFastMin32(next_x, sw);
|
||||
int px = SkClampPos(prev_x);
|
||||
int nx = SkFastMin32(next_x, sw);
|
||||
|
||||
@ -120,6 +159,12 @@ static void kernel_clamped(uint8_t dst[], int rx, int ry, const uint32_t sum[],
|
||||
prev_x += 1;
|
||||
next_x += 1;
|
||||
}
|
||||
* The sections are:
|
||||
* left-hand section, where prev_x is clamped to 0
|
||||
* center section, where neither prev_x nor next_x is clamped
|
||||
* right-hand section, where next_x is clamped to sw
|
||||
* On some operating systems, the center section is unrolled for additional
|
||||
* speedup.
|
||||
*/
|
||||
static void apply_kernel(uint8_t dst[], int rx, int ry, const uint32_t sum[],
|
||||
int sw, int sh) {
|
||||
@ -162,14 +207,35 @@ static void apply_kernel(uint8_t dst[], int rx, int ry, const uint32_t sum[],
|
||||
next_x += 1;
|
||||
}
|
||||
|
||||
int i0 = prev_x + py;
|
||||
int i1 = next_x + ny;
|
||||
int i2 = next_x + py;
|
||||
int i3 = prev_x + ny;
|
||||
|
||||
#if UNROLL_KERNEL_LOOP
|
||||
for (; x < dw - 2*rx - 4; x += 4) {
|
||||
SkASSERT(prev_x >= 0);
|
||||
SkASSERT(next_x <= sw);
|
||||
|
||||
uint32_t tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
||||
*dst++ = SkToU8(tmp * scale >> 24);
|
||||
tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
||||
*dst++ = SkToU8(tmp * scale >> 24);
|
||||
tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
||||
*dst++ = SkToU8(tmp * scale >> 24);
|
||||
tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
||||
*dst++ = SkToU8(tmp * scale >> 24);
|
||||
|
||||
prev_x += 4;
|
||||
next_x += 4;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; x < dw - 2*rx; x++) {
|
||||
SkASSERT(prev_x >= 0);
|
||||
SkASSERT(next_x <= sw);
|
||||
|
||||
int px = prev_x;
|
||||
int nx = next_x;
|
||||
|
||||
uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny];
|
||||
uint32_t tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
||||
*dst++ = SkToU8(tmp * scale >> 24);
|
||||
|
||||
prev_x += 1;
|
||||
@ -277,6 +343,12 @@ static void kernel_interp_clamped(uint8_t dst[], int rx, int ry,
|
||||
prev_x += 1;
|
||||
next_x += 1;
|
||||
}
|
||||
* The sections are:
|
||||
* left-hand section, where prev_x is clamped to 0
|
||||
* center section, where neither prev_x nor next_x is clamped
|
||||
* right-hand section, where next_x is clamped to sw
|
||||
* On some operating systems, the center section is unrolled for additional
|
||||
* speedup.
|
||||
*/
|
||||
static void apply_kernel_interp(uint8_t dst[], int rx, int ry,
|
||||
const uint32_t sum[], int sw, int sh, U8CPU outer_weight) {
|
||||
@ -339,20 +411,48 @@ static void apply_kernel_interp(uint8_t dst[], int rx, int ry,
|
||||
next_x += 1;
|
||||
}
|
||||
|
||||
int i0 = prev_x + py;
|
||||
int i1 = next_x + ny;
|
||||
int i2 = next_x + py;
|
||||
int i3 = prev_x + ny;
|
||||
int i4 = prev_x + 1 + ipy;
|
||||
int i5 = next_x - 1 + iny;
|
||||
int i6 = next_x - 1 + ipy;
|
||||
int i7 = prev_x + 1 + iny;
|
||||
|
||||
#if UNROLL_KERNEL_LOOP
|
||||
for (; x < dw - 2*rx - 4; x += 4) {
|
||||
SkASSERT(prev_x >= 0);
|
||||
SkASSERT(next_x <= sw);
|
||||
|
||||
uint32_t outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
||||
uint32_t inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
|
||||
*dst++ = SkToU8((outer_sum * outer_scale
|
||||
+ inner_sum * inner_scale) >> 24);
|
||||
outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
||||
inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
|
||||
*dst++ = SkToU8((outer_sum * outer_scale
|
||||
+ inner_sum * inner_scale) >> 24);
|
||||
outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
||||
inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
|
||||
*dst++ = SkToU8((outer_sum * outer_scale
|
||||
+ inner_sum * inner_scale) >> 24);
|
||||
outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
||||
inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
|
||||
*dst++ = SkToU8((outer_sum * outer_scale
|
||||
+ inner_sum * inner_scale) >> 24);
|
||||
|
||||
prev_x += 4;
|
||||
next_x += 4;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; x < dw - 2*rx; x++) {
|
||||
SkASSERT(prev_x >= 0);
|
||||
SkASSERT(next_x <= sw);
|
||||
|
||||
int px = prev_x;
|
||||
int nx = next_x;
|
||||
|
||||
int ipx = prev_x + 1;
|
||||
int inx = next_x - 1;
|
||||
|
||||
uint32_t outer_sum = sum[px+py] + sum[nx+ny]
|
||||
- sum[nx+py] - sum[px+ny];
|
||||
uint32_t inner_sum = sum[ipx+ipy] + sum[inx+iny]
|
||||
- sum[inx+ipy] - sum[ipx+iny];
|
||||
uint32_t outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
||||
uint32_t inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
|
||||
*dst++ = SkToU8((outer_sum * outer_scale
|
||||
+ inner_sum * inner_scale) >> 24);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user