ARM Skia NEON patches - 16/17 - Blitmask
Blitmask: NEON optimised version of the D32_A8 functions Here are the microbenchmark results I got for the D32_A8 functions: Cortex-A9: ========== +-------+--------+--------+--------+ | count | Black | Opaque | Color | +-------+--------+--------+--------+ | 1 | -14% | -39,5% | -37,5% | +-------+--------+--------+--------+ | 2 | -3% | -29,9% | -25% | +-------+--------+--------+--------+ | 4 | -11,3% | -22% | -14,5% | +-------+--------+--------+--------+ | 8 | +128% | +66,6% | +105% | +-------+--------+--------+--------+ | 16 | +159% | +102% | +149% | +-------+--------+--------+--------+ | 64 | +189% | +136% | +189% | +-------+--------+--------+--------+ | 256 | +126% | +102% | +149% | +-------+--------+--------+--------+ | 1024 | +67,5% | +81,4% | +123% | +-------+--------+--------+--------+ Cortex-A15: =========== +-------+--------+--------+--------+ | count | Black | Opaque | Color | +-------+--------+--------+--------+ | 1 | -24% | -46,5% | -37,5% | +-------+--------+--------+--------+ | 2 | -18,5% | -35,5% | -28% | +-------+--------+--------+--------+ | 4 | -5,2% | -17,5% | -15,5% | +-------+--------+--------+--------+ | 8 | +72% | +65,8% | +84,7% | +-------+--------+--------+--------+ | 16 | +168% | +117% | +149% | +-------+--------+--------+--------+ | 64 | +165% | +110% | +145% | +-------+--------+--------+--------+ | 256 | +106% | +99,6% | +141% | +-------+--------+--------+--------+ | 1024 | +93,7% | +94,7% | +130% | +-------+--------+--------+--------+ Blitmask: add NEON optimised PlatformBlitRowProcs16 Here are the microbenchmark results (speedup vs. C code): +-------+-----------------+-----------------+ | | Cortex-A9 | Cortex-A15 | | count +--------+--------+--------+--------+ | | Blend | Opaque | Blend | Opaque | +-------+--------+--------+--------+--------+ | 1 | -19,2% | -36,7% | -33,6% | -44,7% | +-------+--------+--------+--------+--------+ | 2 | -12,6% | -27,8% | -39% | -48% | +-------+--------+--------+--------+--------+ | 4 | -11,5% | -21,6% | -37,7% | -44,3% | +-------+--------+--------+--------+--------+ | 8 | +141% | +59,7% | +123% | +48,7% | +-------+--------+--------+--------+--------+ | 16 | +213% | +119% | +214% | +121% | +-------+--------+--------+--------+--------+ | 64 | +212% | +105% | +242% | +167% | +-------+--------+--------+--------+--------+ | 256 | +289% | +167% | +249% | +207% | +-------+--------+--------+--------+--------+ | 1024 | +273% | +169% | +146% | +220% | +-------+--------+--------+--------+--------+ Signed-off-by: Kévin PETIT <kevin.petit@arm.com> BUG= R=djsollen@google.com, mtklein@google.com, reed@google.com Author: kevin.petit.arm@gmail.com Review URL: https://codereview.chromium.org/23719002 git-svn-id: http://skia.googlecode.com/svn/trunk@12420 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
parent
a9a4b04a98
commit
dbe7f52412
@ -177,6 +177,7 @@
|
||||
'../src/opts/SkBitmapProcState_matrixProcs_neon.cpp',
|
||||
'../src/opts/SkBitmapProcState_matrix_clamp_neon.h',
|
||||
'../src/opts/SkBitmapProcState_matrix_repeat_neon.h',
|
||||
'../src/opts/SkBlitMask_opts_arm_neon.cpp',
|
||||
'../src/opts/SkBlitRow_opts_arm_neon.cpp',
|
||||
'../src/opts/SkMorphology_opts_neon.cpp',
|
||||
'../src/opts/SkXfermode_opts_arm_neon.cpp',
|
||||
|
@ -1,14 +1,39 @@
|
||||
|
||||
#include "SkColor.h"
|
||||
#include "SkColorPriv.h"
|
||||
#include "SkBlitMask.h"
|
||||
#include "SkUtilsArm.h"
|
||||
#include "SkBlitMask_opts_arm_neon.h"
|
||||
|
||||
SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig,
|
||||
SkMask::Format maskFormat,
|
||||
SkColor color) {
|
||||
#if SK_ARM_NEON_IS_NONE
|
||||
return NULL;
|
||||
#else
|
||||
#if SK_ARM_NEON_IS_DYNAMIC
|
||||
if (!sk_cpu_arm_has_neon()) {
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
if ((SkBitmap::kARGB_8888_Config == dstConfig) &&
|
||||
(SkMask::kA8_Format == maskFormat)) {
|
||||
return D32_A8_Factory_neon(color);
|
||||
}
|
||||
#endif
|
||||
|
||||
// We don't need to handle the SkMask::kLCD16_Format case as the default
|
||||
// LCD16 will call us through SkBlitMask::PlatformBlitRowProcs16()
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
|
||||
return NULL;
|
||||
if (isOpaque) {
|
||||
return SK_ARM_NEON_WRAP(SkBlitLCD16OpaqueRow);
|
||||
} else {
|
||||
return SK_ARM_NEON_WRAP(SkBlitLCD16Row);
|
||||
}
|
||||
}
|
||||
|
||||
SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig,
|
||||
|
255
src/opts/SkBlitMask_opts_arm_neon.cpp
Normal file
255
src/opts/SkBlitMask_opts_arm_neon.cpp
Normal file
@ -0,0 +1,255 @@
|
||||
|
||||
#include "SkBlitMask.h"
|
||||
#include "SkColor_opts_neon.h"
|
||||
|
||||
static void D32_A8_Black_neon(void* SK_RESTRICT dst, size_t dstRB,
|
||||
const void* SK_RESTRICT maskPtr, size_t maskRB,
|
||||
SkColor, int width, int height) {
|
||||
SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;
|
||||
const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;
|
||||
|
||||
maskRB -= width;
|
||||
dstRB -= (width << 2);
|
||||
do {
|
||||
int w = width;
|
||||
while (w >= 8) {
|
||||
uint8x8_t vmask = vld1_u8(mask);
|
||||
uint16x8_t vscale = vsubw_u8(vdupq_n_u16(256), vmask);
|
||||
uint8x8x4_t vdevice = vld4_u8((uint8_t*)device);
|
||||
|
||||
vdevice = SkAlphaMulQ_neon8(vdevice, vscale);
|
||||
vdevice.val[NEON_A] += vmask;
|
||||
|
||||
vst4_u8((uint8_t*)device, vdevice);
|
||||
|
||||
mask += 8;
|
||||
device += 8;
|
||||
w -= 8;
|
||||
}
|
||||
while (w-- > 0) {
|
||||
unsigned aa = *mask++;
|
||||
*device = (aa << SK_A32_SHIFT)
|
||||
+ SkAlphaMulQ(*device, SkAlpha255To256(255 - aa));
|
||||
device += 1;
|
||||
};
|
||||
device = (uint32_t*)((char*)device + dstRB);
|
||||
mask += maskRB;
|
||||
} while (--height != 0);
|
||||
}
|
||||
|
||||
template <bool isColor>
|
||||
static void D32_A8_Opaque_Color_neon(void* SK_RESTRICT dst, size_t dstRB,
|
||||
const void* SK_RESTRICT maskPtr, size_t maskRB,
|
||||
SkColor color, int width, int height) {
|
||||
SkPMColor pmc = SkPreMultiplyColor(color);
|
||||
SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;
|
||||
const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;
|
||||
uint8x8x4_t vpmc;
|
||||
|
||||
maskRB -= width;
|
||||
dstRB -= (width << 2);
|
||||
|
||||
if (width >= 8) {
|
||||
vpmc.val[NEON_A] = vdup_n_u8(SkGetPackedA32(pmc));
|
||||
vpmc.val[NEON_R] = vdup_n_u8(SkGetPackedR32(pmc));
|
||||
vpmc.val[NEON_G] = vdup_n_u8(SkGetPackedG32(pmc));
|
||||
vpmc.val[NEON_B] = vdup_n_u8(SkGetPackedB32(pmc));
|
||||
}
|
||||
do {
|
||||
int w = width;
|
||||
while (w >= 8) {
|
||||
uint8x8_t vmask = vld1_u8(mask);
|
||||
uint16x8_t vscale, vmask256 = SkAlpha255To256_neon8(vmask);
|
||||
if (isColor) {
|
||||
vscale = vsubw_u8(vdupq_n_u16(256),
|
||||
SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256));
|
||||
} else {
|
||||
vscale = vsubw_u8(vdupq_n_u16(256), vmask);
|
||||
}
|
||||
uint8x8x4_t vdev = vld4_u8((uint8_t*)device);
|
||||
|
||||
vdev.val[NEON_A] = SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256)
|
||||
+ SkAlphaMul_neon8(vdev.val[NEON_A], vscale);
|
||||
vdev.val[NEON_R] = SkAlphaMul_neon8(vpmc.val[NEON_R], vmask256)
|
||||
+ SkAlphaMul_neon8(vdev.val[NEON_R], vscale);
|
||||
vdev.val[NEON_G] = SkAlphaMul_neon8(vpmc.val[NEON_G], vmask256)
|
||||
+ SkAlphaMul_neon8(vdev.val[NEON_G], vscale);
|
||||
vdev.val[NEON_B] = SkAlphaMul_neon8(vpmc.val[NEON_B], vmask256)
|
||||
+ SkAlphaMul_neon8(vdev.val[NEON_B], vscale);
|
||||
|
||||
vst4_u8((uint8_t*)device, vdev);
|
||||
|
||||
mask += 8;
|
||||
device += 8;
|
||||
w -= 8;
|
||||
}
|
||||
|
||||
while (w--) {
|
||||
unsigned aa = *mask++;
|
||||
if (isColor) {
|
||||
*device = SkBlendARGB32(pmc, *device, aa);
|
||||
} else {
|
||||
*device = SkAlphaMulQ(pmc, SkAlpha255To256(aa))
|
||||
+ SkAlphaMulQ(*device, SkAlpha255To256(255 - aa));
|
||||
}
|
||||
device += 1;
|
||||
};
|
||||
|
||||
device = (uint32_t*)((char*)device + dstRB);
|
||||
mask += maskRB;
|
||||
|
||||
} while (--height != 0);
|
||||
}
|
||||
|
||||
static void D32_A8_Opaque_neon(void* SK_RESTRICT dst, size_t dstRB,
|
||||
const void* SK_RESTRICT maskPtr, size_t maskRB,
|
||||
SkColor color, int width, int height) {
|
||||
D32_A8_Opaque_Color_neon<false>(dst, dstRB, maskPtr, maskRB, color, width, height);
|
||||
}
|
||||
|
||||
static void D32_A8_Color_neon(void* SK_RESTRICT dst, size_t dstRB,
|
||||
const void* SK_RESTRICT maskPtr, size_t maskRB,
|
||||
SkColor color, int width, int height) {
|
||||
D32_A8_Opaque_Color_neon<true>(dst, dstRB, maskPtr, maskRB, color, width, height);
|
||||
}
|
||||
|
||||
SkBlitMask::ColorProc D32_A8_Factory_neon(SkColor color) {
|
||||
if (SK_ColorBLACK == color) {
|
||||
return D32_A8_Black_neon;
|
||||
} else if (0xFF == SkColorGetA(color)) {
|
||||
return D32_A8_Opaque_neon;
|
||||
} else {
|
||||
return D32_A8_Color_neon;
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
|
||||
SkColor color, int width,
|
||||
SkPMColor opaqueDst) {
|
||||
int colR = SkColorGetR(color);
|
||||
int colG = SkColorGetG(color);
|
||||
int colB = SkColorGetB(color);
|
||||
|
||||
uint8x8_t vcolR, vcolG, vcolB;
|
||||
uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB;
|
||||
|
||||
if (width >= 8) {
|
||||
vcolR = vdup_n_u8(colR);
|
||||
vcolG = vdup_n_u8(colG);
|
||||
vcolB = vdup_n_u8(colB);
|
||||
vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst));
|
||||
vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst));
|
||||
vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst));
|
||||
vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst));
|
||||
}
|
||||
|
||||
while (width >= 8) {
|
||||
uint8x8x4_t vdst;
|
||||
uint16x8_t vmask;
|
||||
uint16x8_t vmaskR, vmaskG, vmaskB;
|
||||
uint8x8_t vsel_trans, vsel_opq;
|
||||
|
||||
vdst = vld4_u8((uint8_t*)dst);
|
||||
vmask = vld1q_u16(src);
|
||||
|
||||
// Prepare compare masks
|
||||
vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0)));
|
||||
vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF)));
|
||||
|
||||
// Get all the color masks on 5 bits
|
||||
vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
|
||||
vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
|
||||
SK_B16_BITS + SK_R16_BITS + 1);
|
||||
vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
|
||||
|
||||
// Upscale to 0..32
|
||||
vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
|
||||
vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
|
||||
vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
|
||||
|
||||
vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF));
|
||||
vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]);
|
||||
|
||||
vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
|
||||
vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
|
||||
vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);
|
||||
|
||||
vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]);
|
||||
vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]);
|
||||
vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]);
|
||||
|
||||
vst4_u8((uint8_t*)dst, vdst);
|
||||
|
||||
dst += 8;
|
||||
src += 8;
|
||||
width -= 8;
|
||||
}
|
||||
|
||||
// Leftovers
|
||||
for (int i = 0; i < width; i++) {
|
||||
dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i],
|
||||
opaqueDst);
|
||||
}
|
||||
}
|
||||
|
||||
void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[],
|
||||
SkColor color, int width, SkPMColor) {
|
||||
int colA = SkColorGetA(color);
|
||||
int colR = SkColorGetR(color);
|
||||
int colG = SkColorGetG(color);
|
||||
int colB = SkColorGetB(color);
|
||||
|
||||
colA = SkAlpha255To256(colA);
|
||||
|
||||
uint8x8_t vcolR, vcolG, vcolB;
|
||||
uint16x8_t vcolA;
|
||||
|
||||
if (width >= 8) {
|
||||
vcolA = vdupq_n_u16(colA);
|
||||
vcolR = vdup_n_u8(colR);
|
||||
vcolG = vdup_n_u8(colG);
|
||||
vcolB = vdup_n_u8(colB);
|
||||
}
|
||||
|
||||
while (width >= 8) {
|
||||
uint8x8x4_t vdst;
|
||||
uint16x8_t vmask;
|
||||
uint16x8_t vmaskR, vmaskG, vmaskB;
|
||||
|
||||
vdst = vld4_u8((uint8_t*)dst);
|
||||
vmask = vld1q_u16(src);
|
||||
|
||||
// Get all the color masks on 5 bits
|
||||
vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
|
||||
vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
|
||||
SK_B16_BITS + SK_R16_BITS + 1);
|
||||
vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
|
||||
|
||||
// Upscale to 0..32
|
||||
vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
|
||||
vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
|
||||
vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
|
||||
|
||||
vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
|
||||
vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
|
||||
vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);
|
||||
|
||||
vdst.val[NEON_A] = vdup_n_u8(0xFF);
|
||||
vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
|
||||
vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
|
||||
vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);
|
||||
|
||||
vst4_u8((uint8_t*)dst, vdst);
|
||||
|
||||
dst += 8;
|
||||
src += 8;
|
||||
width -= 8;
|
||||
}
|
||||
|
||||
for (int i = 0; i < width; i++) {
|
||||
dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]);
|
||||
}
|
||||
}
|
||||
|
16
src/opts/SkBlitMask_opts_arm_neon.h
Normal file
16
src/opts/SkBlitMask_opts_arm_neon.h
Normal file
@ -0,0 +1,16 @@
|
||||
#ifndef SkBlitMask_opts_arm_neon_DEFINED
|
||||
#define SkBlitMask_opts_arm_neon_DEFINED
|
||||
|
||||
#include "SkColor.h"
|
||||
#include "SkBlitMask.h"
|
||||
|
||||
extern SkBlitMask::ColorProc D32_A8_Factory_neon(SkColor color);
|
||||
|
||||
extern void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
|
||||
SkColor color, int width,
|
||||
SkPMColor opaqueDst);
|
||||
|
||||
extern void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[],
|
||||
SkColor color, int width, SkPMColor);
|
||||
|
||||
#endif // #ifndef SkBlitMask_opts_arm_neon_DEFINED
|
@ -2,6 +2,7 @@
|
||||
#define SkColor_opts_neon_DEFINED
|
||||
|
||||
#include "SkTypes.h"
|
||||
#include "SkColorPriv.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
@ -65,4 +66,20 @@ static inline uint16x8_t SkPixel32ToPixel16_neon8(uint8x8x4_t vsrc) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* This function blends 8 pixels of the same channel in the exact same way as
|
||||
* SkBlend32.
|
||||
*/
|
||||
static inline uint8x8_t SkBlend32_neon8(uint8x8_t src, uint8x8_t dst, uint16x8_t scale) {
|
||||
int16x8_t src_wide, dst_wide;
|
||||
|
||||
src_wide = vreinterpretq_s16_u16(vmovl_u8(src));
|
||||
dst_wide = vreinterpretq_s16_u16(vmovl_u8(dst));
|
||||
|
||||
src_wide = (src_wide - dst_wide) * vreinterpretq_s16_u16(scale);
|
||||
|
||||
dst_wide += vshrq_n_s16(src_wide, 5);
|
||||
|
||||
return vmovn_u16(vreinterpretq_u16_s16(dst_wide));
|
||||
}
|
||||
|
||||
#endif /* #ifndef SkColor_opts_neon_DEFINED */
|
||||
|
Loading…
Reference in New Issue
Block a user