Unroll loops in SkBlurMask for speedup on Windows (benchmarks should see
15% on interpolated blurs, 5-10% on simple blurs). git-svn-id: http://skia.googlecode.com/svn/trunk@2755 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
parent
5e12770cb0
commit
01224d5d0a
@ -80,6 +80,19 @@ static inline void SkEndianSwap32s(uint32_t array[], int count) {
|
||||
#define SkEndian_SwapLE32(n) SkEndianSwap32(n)
|
||||
#endif
|
||||
|
||||
// When a bytestream is embedded in a 32-bit word, how far we need to
|
||||
// shift the word to extract each byte from the low 8 bits by anding with 0xff.
|
||||
#ifdef SK_CPU_LENDIAN
|
||||
#define SkEndian_Byte0Shift 0
|
||||
#define SkEndian_Byte1Shift 8
|
||||
#define SkEndian_Byte2Shift 16
|
||||
#define SkEndian_Byte3Shift 24
|
||||
#else // SK_CPU_BENDIAN
|
||||
#define SkEndian_Byte0Shift 24
|
||||
#define SkEndian_Byte1Shift 16
|
||||
#define SkEndian_Byte2Shift 8
|
||||
#define SkEndian_Byte3Shift 0
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -216,6 +216,8 @@ static inline bool SkIsU16(long x) {
|
||||
*/
|
||||
#define SkAlign4(x) (((x) + 3) >> 2 << 2)
|
||||
|
||||
#define SkIsAlign4(x) (((x) & 3) == 0)
|
||||
|
||||
typedef uint32_t SkFourByteTag;
|
||||
#define SkSetFourByteTag(a, b, c, d) (((a) << 24) | ((b) << 16) | ((c) << 8) | (d))
|
||||
|
||||
|
@ -55,7 +55,7 @@
|
||||
* n is already a multiple of 4
|
||||
*/
|
||||
#define GrALIGN4(n) SkAlign4(n)
|
||||
#define GrIsALIGN4(n) (((n) & 3) == 0)
|
||||
#define GrIsALIGN4(n) SkIsAlign4(n)
|
||||
|
||||
template <typename T> const T& GrMin(const T& a, const T& b) {
|
||||
return (a < b) ? a : b;
|
||||
|
@ -10,6 +10,15 @@
|
||||
#include "SkBlurMask.h"
|
||||
#include "SkMath.h"
|
||||
#include "SkTemplates.h"
|
||||
#include "SkEndian.h"
|
||||
|
||||
// Unrolling the integer blur kernel seems to give us a ~15% speedup on Windows,
|
||||
// breakeven on Mac, and ~15% slowdown on Linux.
|
||||
// Reading a word at a time when building the sum buffer seems to give
|
||||
// us no appreciable speedup on Windows or Mac, and 2% slowdown on Linux.
|
||||
#if defined(BUILD_FOR_WIN_32)
|
||||
#define UNROLL_KERNEL_LOOP 1
|
||||
#endif
|
||||
|
||||
/** The sum buffer is an array of u32 to hold the accumulated sum of all of the
|
||||
src values at their position, plus all values above and to the left.
|
||||
@ -49,7 +58,39 @@ static void build_sum_buffer(uint32_t sum[], int srcW, int srcH,
|
||||
uint32_t L = 0;
|
||||
uint32_t C = 0;
|
||||
*sum++ = 0; // initialize the first column to 0
|
||||
for (x = srcW - 1; x >= 0; --x) {
|
||||
|
||||
for (x = srcW - 1; !SkIsAlign4((intptr_t) src) && x >= 0; x--) {
|
||||
uint32_t T = sum[-sumW];
|
||||
X = *src++ + L + T - C;
|
||||
*sum++ = X;
|
||||
L = X;
|
||||
C = T;
|
||||
}
|
||||
|
||||
for (; x >= 4; x-=4) {
|
||||
uint32_t T = sum[-sumW];
|
||||
X = *src++ + L + T - C;
|
||||
*sum++ = X;
|
||||
L = X;
|
||||
C = T;
|
||||
T = sum[-sumW];
|
||||
X = *src++ + L + T - C;
|
||||
*sum++ = X;
|
||||
L = X;
|
||||
C = T;
|
||||
T = sum[-sumW];
|
||||
X = *src++ + L + T - C;
|
||||
*sum++ = X;
|
||||
L = X;
|
||||
C = T;
|
||||
T = sum[-sumW];
|
||||
X = *src++ + L + T - C;
|
||||
*sum++ = X;
|
||||
L = X;
|
||||
C = T;
|
||||
}
|
||||
|
||||
for (; x >= 0; --x) {
|
||||
uint32_t T = sum[-sumW];
|
||||
X = *src++ + L + T - C;
|
||||
*sum++ = X;
|
||||
@ -86,8 +127,6 @@ static void kernel_clamped(uint8_t dst[], int rx, int ry, const uint32_t sum[],
|
||||
int next_x = 1;
|
||||
|
||||
for (int x = 0; x < dw; x++) {
|
||||
//int px = SkClampPos(prev_x);
|
||||
//int nx = SkFastMin32(next_x, sw);
|
||||
int px = SkClampPos(prev_x);
|
||||
int nx = SkFastMin32(next_x, sw);
|
||||
|
||||
@ -120,6 +159,12 @@ static void kernel_clamped(uint8_t dst[], int rx, int ry, const uint32_t sum[],
|
||||
prev_x += 1;
|
||||
next_x += 1;
|
||||
}
|
||||
* The sections are:
|
||||
* left-hand section, where prev_x is clamped to 0
|
||||
* center section, where neither prev_x nor next_x is clamped
|
||||
* right-hand section, where next_x is clamped to sw
|
||||
* On some operating systems, the center section is unrolled for additional
|
||||
* speedup.
|
||||
*/
|
||||
static void apply_kernel(uint8_t dst[], int rx, int ry, const uint32_t sum[],
|
||||
int sw, int sh) {
|
||||
@ -162,14 +207,35 @@ static void apply_kernel(uint8_t dst[], int rx, int ry, const uint32_t sum[],
|
||||
next_x += 1;
|
||||
}
|
||||
|
||||
int i0 = prev_x + py;
|
||||
int i1 = next_x + ny;
|
||||
int i2 = next_x + py;
|
||||
int i3 = prev_x + ny;
|
||||
|
||||
#if UNROLL_KERNEL_LOOP
|
||||
for (; x < dw - 2*rx - 4; x += 4) {
|
||||
SkASSERT(prev_x >= 0);
|
||||
SkASSERT(next_x <= sw);
|
||||
|
||||
uint32_t tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
||||
*dst++ = SkToU8(tmp * scale >> 24);
|
||||
tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
||||
*dst++ = SkToU8(tmp * scale >> 24);
|
||||
tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
||||
*dst++ = SkToU8(tmp * scale >> 24);
|
||||
tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
||||
*dst++ = SkToU8(tmp * scale >> 24);
|
||||
|
||||
prev_x += 4;
|
||||
next_x += 4;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; x < dw - 2*rx; x++) {
|
||||
SkASSERT(prev_x >= 0);
|
||||
SkASSERT(next_x <= sw);
|
||||
|
||||
int px = prev_x;
|
||||
int nx = next_x;
|
||||
|
||||
uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny];
|
||||
uint32_t tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
||||
*dst++ = SkToU8(tmp * scale >> 24);
|
||||
|
||||
prev_x += 1;
|
||||
@ -277,6 +343,12 @@ static void kernel_interp_clamped(uint8_t dst[], int rx, int ry,
|
||||
prev_x += 1;
|
||||
next_x += 1;
|
||||
}
|
||||
* The sections are:
|
||||
* left-hand section, where prev_x is clamped to 0
|
||||
* center section, where neither prev_x nor next_x is clamped
|
||||
* right-hand section, where next_x is clamped to sw
|
||||
* On some operating systems, the center section is unrolled for additional
|
||||
* speedup.
|
||||
*/
|
||||
static void apply_kernel_interp(uint8_t dst[], int rx, int ry,
|
||||
const uint32_t sum[], int sw, int sh, U8CPU outer_weight) {
|
||||
@ -339,20 +411,48 @@ static void apply_kernel_interp(uint8_t dst[], int rx, int ry,
|
||||
next_x += 1;
|
||||
}
|
||||
|
||||
int i0 = prev_x + py;
|
||||
int i1 = next_x + ny;
|
||||
int i2 = next_x + py;
|
||||
int i3 = prev_x + ny;
|
||||
int i4 = prev_x + 1 + ipy;
|
||||
int i5 = next_x - 1 + iny;
|
||||
int i6 = next_x - 1 + ipy;
|
||||
int i7 = prev_x + 1 + iny;
|
||||
|
||||
#if UNROLL_KERNEL_LOOP
|
||||
for (; x < dw - 2*rx - 4; x += 4) {
|
||||
SkASSERT(prev_x >= 0);
|
||||
SkASSERT(next_x <= sw);
|
||||
|
||||
uint32_t outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
||||
uint32_t inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
|
||||
*dst++ = SkToU8((outer_sum * outer_scale
|
||||
+ inner_sum * inner_scale) >> 24);
|
||||
outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
||||
inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
|
||||
*dst++ = SkToU8((outer_sum * outer_scale
|
||||
+ inner_sum * inner_scale) >> 24);
|
||||
outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
||||
inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
|
||||
*dst++ = SkToU8((outer_sum * outer_scale
|
||||
+ inner_sum * inner_scale) >> 24);
|
||||
outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
||||
inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
|
||||
*dst++ = SkToU8((outer_sum * outer_scale
|
||||
+ inner_sum * inner_scale) >> 24);
|
||||
|
||||
prev_x += 4;
|
||||
next_x += 4;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; x < dw - 2*rx; x++) {
|
||||
SkASSERT(prev_x >= 0);
|
||||
SkASSERT(next_x <= sw);
|
||||
|
||||
int px = prev_x;
|
||||
int nx = next_x;
|
||||
|
||||
int ipx = prev_x + 1;
|
||||
int inx = next_x - 1;
|
||||
|
||||
uint32_t outer_sum = sum[px+py] + sum[nx+ny]
|
||||
- sum[nx+py] - sum[px+ny];
|
||||
uint32_t inner_sum = sum[ipx+ipy] + sum[inx+iny]
|
||||
- sum[inx+ipy] - sum[ipx+iny];
|
||||
uint32_t outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
|
||||
uint32_t inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
|
||||
*dst++ = SkToU8((outer_sum * outer_scale
|
||||
+ inner_sum * inner_scale) >> 24);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user