skia: blend32_16_row for neon version

This includes blend32_16_row neon implementation
for aarch32 and aarch64.

For performance,
blend32_16_row is called in following tests in nanobench.
 - Xfermode_SrcOver
 - tablebench
 - rotated_rects_bw_alternating_transparent_and_opaque_srcover
 - rotated_rects_bw_changing_transparent_srcover
 - rotated_rects_bw_same_transparent_srcover
 - luma_colorfilter_large
 - luma_colorfilter_small
 - chart_bw

I can see perf increase in following two tests, especially. For others, looks
similar.
For each, I tried to run two times.

1) Xfermode_SrcOver
<org>
 - D/skia    ( 2000):    3M        57      17.3µs  17.4µs  17.4µs  17.7µs  1%
  █▃▂▃▂▂▂▁▃▂      565     Xfermode_SrcOver
 - D/skia    ( 1915):    3M        70      13.5µs  16.9µs  16.7µs  18.8µs  9%
  ▆█▄▅█▁▅▅▆▄      565     Xfermode_SrcOver

<new>
 - D/skia    ( 2000):    3M        8       11.6µs  11.8µs  12.1µs  14.4µs  7%
  ▃█▁▁▂▁▁▁▂▂      565     Xfermode_SrcOver
 - D/skia    ( 2004):    3M        62      10.3µs  12.9µs  13µs    15.2µs  11%
  █▅▅▆▁▅▅▅▇▃      565     Xfermode_SrcOver

2)
luma_colorfilter_large
<org>
 - D/skia    ( 2000):  159M        8       136µs   136µs   136µs   139µs   1%
  █▃▁▂▁▁▁▁▁▁      565     luma_colorfilter_large
 - D/skia    ( 1915):  158M        2       135µs   177µs   182µs   269µs   22%
  ▆▃█▁▁▃▃▃▃▃      565     luma_colorfilter_large

<new>
 - D/skia    ( 2000):  157M        5       84.2µs  85.3µs  87.5µs  110µs   9%
  █▁▂▁▁▁▁▁▁▁      565     luma_colorfilter_large
 - D/skia    ( 2004):  159M        6       84.7µs  110µs   112µs   144µs   18%
  █▄▇▁▁▄▃▄▄▆      565     luma_colorfilter_large

Review URL: https://codereview.chromium.org/847363002
This commit is contained in:
mlee 2015-01-29 06:22:41 -08:00 committed by Commit bot
parent 9cc2f2613a
commit 402448d681
4 changed files with 157 additions and 23 deletions

View File

@ -77,6 +77,8 @@ protected:
uint16_t fRawDither16; // unscaled uint16_t fRawDither16; // unscaled
SkBool8 fDoDither; SkBool8 fDoDither;
SkBlitRow::ColorProc16 fColorProc16;
// illegal // illegal
SkRGB16_Blitter& operator=(const SkRGB16_Blitter&); SkRGB16_Blitter& operator=(const SkRGB16_Blitter&);
@ -544,6 +546,19 @@ SkRGB16_Blitter::SkRGB16_Blitter(const SkBitmap& device, const SkPaint& paint)
fColor16 = SkPackRGB16( SkAlphaMul(r, fScale) >> (8 - SK_R16_BITS), fColor16 = SkPackRGB16( SkAlphaMul(r, fScale) >> (8 - SK_R16_BITS),
SkAlphaMul(g, fScale) >> (8 - SK_G16_BITS), SkAlphaMul(g, fScale) >> (8 - SK_G16_BITS),
SkAlphaMul(b, fScale) >> (8 - SK_B16_BITS)); SkAlphaMul(b, fScale) >> (8 - SK_B16_BITS));
// compute SkBlitRow::Procs
unsigned flags = 0;
if (SkGetPackedA32(fSrcColor32) < 0xFF) {
flags |= SkBlitRow::kSrcPixelAlpha_Flag;
}
if (fDoDither) {
flags |= SkBlitRow::kDither_Flag;
}
fColorProc16 = SkBlitRow::ColorFactory16(flags);
} }
const SkBitmap* SkRGB16_Blitter::justAnOpaqueColor(uint32_t* value) { const SkBitmap* SkRGB16_Blitter::justAnOpaqueColor(uint32_t* value) {
@ -554,31 +569,12 @@ const SkBitmap* SkRGB16_Blitter::justAnOpaqueColor(uint32_t* value) {
return NULL; return NULL;
} }
static uint32_t pmcolor_to_expand16(SkPMColor c) {
unsigned r = SkGetPackedR32(c);
unsigned g = SkGetPackedG32(c);
unsigned b = SkGetPackedB32(c);
return (g << 24) | (r << 13) | (b << 2);
}
static inline void blend32_16_row(SkPMColor src, uint16_t dst[], int count) {
SkASSERT(count > 0);
uint32_t src_expand = pmcolor_to_expand16(src);
unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
do {
uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
*dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
dst += 1;
} while (--count != 0);
}
void SkRGB16_Blitter::blitH(int x, int y, int width) { void SkRGB16_Blitter::blitH(int x, int y, int width) {
SkASSERT(width > 0); SkASSERT(width > 0);
SkASSERT(x + width <= fDevice.width()); SkASSERT(x + width <= fDevice.width());
uint16_t* SK_RESTRICT device = fDevice.getAddr16(x, y); uint16_t* SK_RESTRICT device = fDevice.getAddr16(x, y);
// TODO: respect fDoDither fColorProc16(device, fSrcColor32, width, x, y);
blend32_16_row(fSrcColor32, device, width);
} }
void SkRGB16_Blitter::blitAntiH(int x, int y, void SkRGB16_Blitter::blitAntiH(int x, int y,
@ -681,10 +677,9 @@ void SkRGB16_Blitter::blitRect(int x, int y, int width, int height) {
SkASSERT(x + width <= fDevice.width() && y + height <= fDevice.height()); SkASSERT(x + width <= fDevice.width() && y + height <= fDevice.height());
uint16_t* SK_RESTRICT device = fDevice.getAddr16(x, y); uint16_t* SK_RESTRICT device = fDevice.getAddr16(x, y);
size_t deviceRB = fDevice.rowBytes(); size_t deviceRB = fDevice.rowBytes();
SkPMColor src32 = fSrcColor32;
while (--height >= 0) { while (--height >= 0) {
blend32_16_row(src32, device, width); fColorProc16(device, fSrcColor32, width, x, y);
device = (uint16_t*)((char*)device + deviceRB); device = (uint16_t*)((char*)device + deviceRB);
} }
} }

View File

@ -364,6 +364,13 @@ static const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm[] = {
NULL, // S32A_D565_Blend_Dither NULL, // S32A_D565_Blend_Dither
}; };
static const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm[] = {
NULL, // Color32_D565,
NULL, // Color32A_D565,
NULL, // Color32_D565_Dither,
NULL, // Color32A_D565_Dither
};
static const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm[] = { static const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm[] = {
NULL, // S32_Opaque, NULL, // S32_Opaque,
NULL, // S32_Blend, NULL, // S32_Blend,
@ -378,7 +385,7 @@ SkBlitRow::Proc16 SkBlitRow::PlatformFactory565(unsigned flags) {
} }
SkBlitRow::ColorProc16 SkBlitRow::PlatformColorFactory565(unsigned flags) { SkBlitRow::ColorProc16 SkBlitRow::PlatformColorFactory565(unsigned flags) {
return NULL; return SK_ARM_NEON_WRAP(sk_blitrow_platform_565_colorprocs_arm)[flags];
} }
SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) { SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {

View File

@ -465,6 +465,130 @@ void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
} }
#endif // #ifdef SK_CPU_ARM32 #endif // #ifdef SK_CPU_ARM32
static uint32_t pmcolor_to_expand16(SkPMColor c) {
unsigned r = SkGetPackedR32(c);
unsigned g = SkGetPackedG32(c);
unsigned b = SkGetPackedB32(c);
return (g << 24) | (r << 13) | (b << 2);
}
void Color32A_D565_neon(uint16_t dst[], SkPMColor src, int count, int x, int y) {
uint32_t src_expand;
unsigned scale;
uint16x8_t vmask_blue;
if (count <= 0) return;
SkASSERT(((size_t)dst & 0x01) == 0);
/*
* This preamble code is in order to make dst aligned to 8 bytes
* in the next mutiple bytes read & write access.
*/
src_expand = pmcolor_to_expand16(src);
scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
#define DST_ALIGN 8
/*
* preamble_size is in byte, meantime, this blend32_16_row_neon updates 2 bytes at a time.
*/
int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);
for (int i = 0; i < preamble_size; i+=2, dst++) {
uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
*dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
if (--count == 0)
break;
}
int count16 = 0;
count16 = count >> 4;
vmask_blue = vmovq_n_u16(SK_B16_MASK);
if (count16) {
uint16x8_t wide_sr;
uint16x8_t wide_sg;
uint16x8_t wide_sb;
uint16x8_t wide_256_sa;
unsigned sr = SkGetPackedR32(src);
unsigned sg = SkGetPackedG32(src);
unsigned sb = SkGetPackedB32(src);
unsigned sa = SkGetPackedA32(src);
// Operation: dst_rgb = src_rgb + ((256 - src_a) >> 3) x dst_rgb
// sr: 8-bit based, dr: 5-bit based, with dr x ((256-sa)>>3), 5-bit left shifted,
//thus, for sr, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5)
wide_sr = vshlq_n_u16(vmovl_u8(vdup_n_u8(sr)), 2); // widen and src_red shift
// sg: 8-bit based, dg: 6-bit based, with dg x ((256-sa)>>3), 5-bit left shifted,
//thus, for sg, do 3-bit left shift to match MSB : (8 + 3 = 6 + 5)
wide_sg = vshlq_n_u16(vmovl_u8(vdup_n_u8(sg)), 3); // widen and src_grn shift
// sb: 8-bit based, db: 5-bit based, with db x ((256-sa)>>3), 5-bit left shifted,
//thus, for sb, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5)
wide_sb = vshlq_n_u16(vmovl_u8(vdup_n_u8(sb)), 2); // widen and src blu shift
wide_256_sa =
vshrq_n_u16(vsubw_u8(vdupq_n_u16(256), vdup_n_u8(sa)), 3); // (256 - sa) >> 3
while (count16-- > 0) {
uint16x8_t vdst1, vdst1_r, vdst1_g, vdst1_b;
uint16x8_t vdst2, vdst2_r, vdst2_g, vdst2_b;
vdst1 = vld1q_u16(dst);
dst += 8;
vdst2 = vld1q_u16(dst);
dst -= 8; //to store dst again.
vdst1_g = vshlq_n_u16(vdst1, SK_R16_BITS); // shift green to top of lanes
vdst1_b = vdst1 & vmask_blue; // extract blue
vdst1_r = vshrq_n_u16(vdst1, SK_R16_SHIFT); // extract red
vdst1_g = vshrq_n_u16(vdst1_g, SK_R16_BITS + SK_B16_BITS); // extract green
vdst2_g = vshlq_n_u16(vdst2, SK_R16_BITS); // shift green to top of lanes
vdst2_b = vdst2 & vmask_blue; // extract blue
vdst2_r = vshrq_n_u16(vdst2, SK_R16_SHIFT); // extract red
vdst2_g = vshrq_n_u16(vdst2_g, SK_R16_BITS + SK_B16_BITS); // extract green
vdst1_r = vmlaq_u16(wide_sr, wide_256_sa, vdst1_r); // sr + (256-sa) x dr1
vdst1_g = vmlaq_u16(wide_sg, wide_256_sa, vdst1_g); // sg + (256-sa) x dg1
vdst1_b = vmlaq_u16(wide_sb, wide_256_sa, vdst1_b); // sb + (256-sa) x db1
vdst2_r = vmlaq_u16(wide_sr, wide_256_sa, vdst2_r); // sr + (256-sa) x dr2
vdst2_g = vmlaq_u16(wide_sg, wide_256_sa, vdst2_g); // sg + (256-sa) x dg2
vdst2_b = vmlaq_u16(wide_sb, wide_256_sa, vdst2_b); // sb + (256-sa) x db2
vdst1_r = vshrq_n_u16(vdst1_r, 5); // 5-bit right shift for 5-bit red
vdst1_g = vshrq_n_u16(vdst1_g, 5); // 5-bit right shift for 6-bit green
vdst1_b = vshrq_n_u16(vdst1_b, 5); // 5-bit right shift for 5-bit blue
vdst1 = vsliq_n_u16(vdst1_b, vdst1_g, SK_G16_SHIFT); // insert green into blue
vdst1 = vsliq_n_u16(vdst1, vdst1_r, SK_R16_SHIFT); // insert red into green/blue
vdst2_r = vshrq_n_u16(vdst2_r, 5); // 5-bit right shift for 5-bit red
vdst2_g = vshrq_n_u16(vdst2_g, 5); // 5-bit right shift for 6-bit green
vdst2_b = vshrq_n_u16(vdst2_b, 5); // 5-bit right shift for 5-bit blue
vdst2 = vsliq_n_u16(vdst2_b, vdst2_g, SK_G16_SHIFT); // insert green into blue
vdst2 = vsliq_n_u16(vdst2, vdst2_r, SK_R16_SHIFT); // insert red into green/blue
vst1q_u16(dst, vdst1);
dst += 8;
vst1q_u16(dst, vdst2);
dst += 8;
}
}
count &= 0xF;
if (count > 0) {
do {
uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
*dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
dst += 1;
} while (--count != 0);
}
}
static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
prod += vdupq_n_u16(128); prod += vdupq_n_u16(128);
prod += vshrq_n_u16(prod, 8); prod += vshrq_n_u16(prod, 8);
@ -1665,6 +1789,13 @@ const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[] = {
NULL, // S32A_D565_Blend_Dither NULL, // S32A_D565_Blend_Dither
}; };
const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
Color32A_D565_neon, // Color32_D565,
Color32A_D565_neon, // Color32A_D565,
Color32A_D565_neon, // Color32_D565_Dither,
Color32A_D565_neon, // Color32A_D565_Dither
};
const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
NULL, // S32_Opaque, NULL, // S32_Opaque,
S32_Blend_BlitRow32_neon, // S32_Blend, S32_Blend_BlitRow32_neon, // S32_Blend,

View File

@ -10,6 +10,7 @@
#include "SkBlitRow.h" #include "SkBlitRow.h"
extern const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[]; extern const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[];
extern const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[];
extern const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[]; extern const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[];
extern void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, extern void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,