Unroll loops in SkBlurMask for speedup on Windows (benchmarks should see
15% on interpolated blurs, 5-10% on simple blurs).

git-svn-id: http://skia.googlecode.com/svn/trunk@2755 2bbb7eff-a529-9590-31e7-b0007b416f81
tomhudson@google.com 2011-11-28 18:22:01 +00:00
parent 5e12770cb0
commit 01224d5d0a
4 changed files with 133 additions and 18 deletions
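For context on the technique itself: unrolling repeats the loop body four times per iteration, so the loop test and branch execute once per four elements instead of once per element, with a short remainder loop for the leftovers. A minimal standalone sketch of the pattern (illustrative only; byte_total is a hypothetical helper, not code from this patch):

#include <stdint.h>

// Sum n bytes with a 4x-unrolled main loop plus a remainder loop.
static uint32_t byte_total(const uint8_t src[], int n) {
    uint32_t total = 0;
    int i = 0;
    for (; i <= n - 4; i += 4) {    // one loop test per four elements
        total += src[i] + src[i + 1] + src[i + 2] + src[i + 3];
    }
    for (; i < n; ++i) {            // last 0-3 elements
        total += src[i];
    }
    return total;
}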

View File: SkEndian.h

@@ -80,6 +80,19 @@ static inline void SkEndianSwap32s(uint32_t array[], int count) {
#define SkEndian_SwapLE32(n) SkEndianSwap32(n)
#endif
// When a bytestream is embedded in a 32-bit word, how far we need to
// shift the word so that each byte can be extracted from the low 8 bits
// by ANDing with 0xff.
#ifdef SK_CPU_LENDIAN
#define SkEndian_Byte0Shift 0
#define SkEndian_Byte1Shift 8
#define SkEndian_Byte2Shift 16
#define SkEndian_Byte3Shift 24
#else // SK_CPU_BENDIAN
#define SkEndian_Byte0Shift 24
#define SkEndian_Byte1Shift 16
#define SkEndian_Byte2Shift 8
#define SkEndian_Byte3Shift 0
#endif
#endif
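A small usage sketch, assuming only the macros above (unpack_bytes is a hypothetical caller, not part of this patch): shifting by the ByteNShift constants and masking with 0xff pulls the bytes of a packed word out in bytestream order on either endianness.

#include <stdint.h>

// Unpack a 32-bit word into its four bytes, in bytestream order.
static void unpack_bytes(uint32_t word, uint8_t out[4]) {
    out[0] = (uint8_t)((word >> SkEndian_Byte0Shift) & 0xff);
    out[1] = (uint8_t)((word >> SkEndian_Byte1Shift) & 0xff);
    out[2] = (uint8_t)((word >> SkEndian_Byte2Shift) & 0xff);
    out[3] = (uint8_t)((word >> SkEndian_Byte3Shift) & 0xff);
}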

View File: SkTypes.h

@@ -216,6 +216,8 @@ static inline bool SkIsU16(long x) {
*/
#define SkAlign4(x) (((x) + 3) >> 2 << 2)
#define SkIsAlign4(x) (((x) & 3) == 0)
typedef uint32_t SkFourByteTag;
#define SkSetFourByteTag(a, b, c, d) (((a) << 24) | ((b) << 16) | ((c) << 8) | (d))
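Usage sketch (the tag chosen here is just an example, not from this patch): the macro packs four characters into a single word, high byte first, and folds to a constant at compile time.

// e.g. the SFNT 'true' table tag:
static const SkFourByteTag kTrueTag = SkSetFourByteTag('t', 'r', 'u', 'e');
// kTrueTag == 0x74727565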

View File: GrTypes.h

@@ -55,7 +55,7 @@
* n is already a multiple of 4
*/
#define GrALIGN4(n) SkAlign4(n)
-#define GrIsALIGN4(n) (((n) & 3) == 0)
+#define GrIsALIGN4(n) SkIsAlign4(n)
template <typename T> const T& GrMin(const T& a, const T& b) {
return (a < b) ? a : b;

View File: SkBlurMask.cpp

@@ -10,6 +10,15 @@
#include "SkBlurMask.h"
#include "SkMath.h"
#include "SkTemplates.h"
#include "SkEndian.h"
// Unrolling the integer blur kernel seems to give us a ~15% speedup on Windows,
// breakeven on Mac, and ~15% slowdown on Linux.
// Reading a word at a time when building the sum buffer seems to give
// us no appreciable speedup on Windows or Mac, and a 2% slowdown on Linux.
#if defined(SK_BUILD_FOR_WIN32)
#define UNROLL_KERNEL_LOOP 1
#endif
/** The sum buffer is an array of u32 to hold the accumulated sum of all of the
src values at their position, plus all values above and to the left.
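The payoff of this layout is that the sum over any axis-aligned box costs four lookups, by inclusion-exclusion; that is what the kernel loops below rely on. A sketch under the same assumptions (box_sum is a hypothetical helper; sum is (srcW+1) entries wide, with its first row and column zero):

#include <stdint.h>

// Total of src over the box [left, right) x [top, bottom),
// given a summed-area table with row pitch sumW = srcW + 1.
static uint32_t box_sum(const uint32_t sum[], int sumW,
                        int left, int top, int right, int bottom) {
    return sum[bottom * sumW + right] - sum[top * sumW + right]
         - sum[bottom * sumW + left]  + sum[top * sumW + left];
}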
@@ -49,7 +58,39 @@ static void build_sum_buffer(uint32_t sum[], int srcW, int srcH,
uint32_t L = 0;
uint32_t C = 0;
*sum++ = 0; // initialize the first column to 0
-for (x = srcW - 1; x >= 0; --x) {
+for (x = srcW - 1; !SkIsAlign4((intptr_t) src) && x >= 0; x--) {
uint32_t T = sum[-sumW];
X = *src++ + L + T - C;
*sum++ = X;
L = X;
C = T;
}
for (; x >= 4; x-=4) {
uint32_t T = sum[-sumW];
X = *src++ + L + T - C;
*sum++ = X;
L = X;
C = T;
T = sum[-sumW];
X = *src++ + L + T - C;
*sum++ = X;
L = X;
C = T;
T = sum[-sumW];
X = *src++ + L + T - C;
*sum++ = X;
L = X;
C = T;
T = sum[-sumW];
X = *src++ + L + T - C;
*sum++ = X;
L = X;
C = T;
}
for (; x >= 0; --x) {
uint32_t T = sum[-sumW];
X = *src++ + L + T - C;
*sum++ = X;
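Every variant of this loop evaluates the same recurrence, sum(x, y) = src(x, y) + sum(x-1, y) + sum(x, y-1) - sum(x-1, y-1): L carries the sum from the left, sum[-sumW] reads T from the row above, and C holds the previous iteration's T, which is the diagonal term. The whole pass, condensed into a standalone sketch (build_sat is hypothetical, simplified from the code above):

#include <stdint.h>

// Build a (w+1) x (h+1) summed-area table over an 8-bit image.
static void build_sat(const uint8_t src[], int w, int h, uint32_t sum[]) {
    const int sumW = w + 1;
    for (int x = 0; x <= w; ++x) {
        sum[x] = 0;                                       // zero the top row
    }
    for (int y = 1; y <= h; ++y) {
        sum[y * sumW] = 0;                                // zero the left column
        for (int x = 1; x <= w; ++x) {
            sum[y * sumW + x] = src[(y - 1) * w + (x - 1)]    // src value
                              + sum[y * sumW + x - 1]         // L: left
                              + sum[(y - 1) * sumW + x]       // T: above
                              - sum[(y - 1) * sumW + x - 1];  // C: diagonal
        }
    }
}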
@@ -86,8 +127,6 @@ static void kernel_clamped(uint8_t dst[], int rx, int ry, const uint32_t sum[],
int next_x = 1;
for (int x = 0; x < dw; x++) {
-//int px = SkClampPos(prev_x);
-//int nx = SkFastMin32(next_x, sw);
int px = SkClampPos(prev_x);
int nx = SkFastMin32(next_x, sw);
@@ -120,6 +159,12 @@ static void kernel_clamped(uint8_t dst[], int rx, int ry, const uint32_t sum[],
prev_x += 1;
next_x += 1;
}
* The sections are:
* left-hand section, where prev_x is clamped to 0
* center section, where neither prev_x nor next_x is clamped
* right-hand section, where next_x is clamped to sw
* On some operating systems, the center section is unrolled for additional
* speedup.
*/
static void apply_kernel(uint8_t dst[], int rx, int ry, const uint32_t sum[],
int sw, int sh) {
@@ -162,14 +207,35 @@ static void apply_kernel(uint8_t dst[], int rx, int ry, const uint32_t sum[],
next_x += 1;
}
int i0 = prev_x + py;
int i1 = next_x + ny;
int i2 = next_x + py;
int i3 = prev_x + ny;
#if UNROLL_KERNEL_LOOP
for (; x < dw - 2*rx - 4; x += 4) {
SkASSERT(prev_x >= 0);
SkASSERT(next_x <= sw);
uint32_t tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
*dst++ = SkToU8(tmp * scale >> 24);
tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
*dst++ = SkToU8(tmp * scale >> 24);
tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
*dst++ = SkToU8(tmp * scale >> 24);
tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
*dst++ = SkToU8(tmp * scale >> 24);
prev_x += 4;
next_x += 4;
}
#endif
for (; x < dw - 2*rx; x++) {
SkASSERT(prev_x >= 0);
SkASSERT(next_x <= sw);
-int px = prev_x;
-int nx = next_x;
-uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny];
+uint32_t tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
*dst++ = SkToU8(tmp * scale >> 24);
prev_x += 1;
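Note the change that makes the unroll worthwhile: the four corner offsets of the box are hoisted out of the loop into running indices i0..i3 that are only incremented, so the hot loop no longer recomputes px+py, nx+ny, nx+py, and px+ny per pixel; each output byte is four loads, three adds/subtracts, a multiply, and a shift.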
@@ -277,6 +343,12 @@ static void kernel_interp_clamped(uint8_t dst[], int rx, int ry,
prev_x += 1;
next_x += 1;
}
* The sections are:
* left-hand section, where prev_x is clamped to 0
* center section, where neither prev_x nor next_x is clamped
* right-hand section, where next_x is clamped to sw
* On some operating systems, the center section is unrolled for additional
* speedup.
*/
static void apply_kernel_interp(uint8_t dst[], int rx, int ry,
const uint32_t sum[], int sw, int sh, U8CPU outer_weight) {
@@ -339,20 +411,48 @@ static void apply_kernel_interp(uint8_t dst[], int rx, int ry,
next_x += 1;
}
int i0 = prev_x + py;
int i1 = next_x + ny;
int i2 = next_x + py;
int i3 = prev_x + ny;
int i4 = prev_x + 1 + ipy;
int i5 = next_x - 1 + iny;
int i6 = next_x - 1 + ipy;
int i7 = prev_x + 1 + iny;
#if UNROLL_KERNEL_LOOP
for (; x < dw - 2*rx - 4; x += 4) {
SkASSERT(prev_x >= 0);
SkASSERT(next_x <= sw);
uint32_t outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
uint32_t inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
*dst++ = SkToU8((outer_sum * outer_scale
+ inner_sum * inner_scale) >> 24);
outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
*dst++ = SkToU8((outer_sum * outer_scale
+ inner_sum * inner_scale) >> 24);
outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
*dst++ = SkToU8((outer_sum * outer_scale
+ inner_sum * inner_scale) >> 24);
outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
*dst++ = SkToU8((outer_sum * outer_scale
+ inner_sum * inner_scale) >> 24);
prev_x += 4;
next_x += 4;
}
#endif
for (; x < dw - 2*rx; x++) {
SkASSERT(prev_x >= 0);
SkASSERT(next_x <= sw);
-int px = prev_x;
-int nx = next_x;
-int ipx = prev_x + 1;
-int inx = next_x - 1;
-uint32_t outer_sum = sum[px+py] + sum[nx+ny]
-                   - sum[nx+py] - sum[px+ny];
-uint32_t inner_sum = sum[ipx+ipy] + sum[inx+iny]
-                   - sum[inx+ipy] - sum[ipx+iny];
+uint32_t outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
+uint32_t inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
*dst++ = SkToU8((outer_sum * outer_scale
+ inner_sum * inner_scale) >> 24);
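The interpolated variant walks two boxes at once: the outer box at the integer radius and an inner box one pixel smaller on each side (the +1/-1 baked into i4..i7), blending the two averages so that a fractional blur radius can land between the two integer radii. A minimal sketch that isolates the blend step, assuming outer_scale and inner_scale are premultiplied so that a full box sum times its scale, shifted down by 24, yields an 8-bit value (blend_box_sums is hypothetical, not part of this patch):

#include <stdint.h>

// Mix two box sums with complementary fixed-point weights.
static uint8_t blend_box_sums(uint32_t outer_sum, uint32_t outer_scale,
                              uint32_t inner_sum, uint32_t inner_scale) {
    return (uint8_t)((outer_sum * outer_scale + inner_sum * inner_scale) >> 24);
}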