ARM Skia NEON patches - 39 - arm64 565 blitters
This enables all 565 blitters except S32A_D565_Opaque. Here are some performance results: S32_D565_Opaque: ================ +-------+------------+------------+ | count | Cortex-A53 | Cortex-A57 | +-------+------------+------------+ | 1 | -18.37% | -13.04% | +-------+------------+------------+ | 2 | -9.90% | -13.78% | +-------+------------+------------+ | 4 | -8.28% | -6.77% | +-------+------------+------------+ | 8 | 157.63% | 78.15% | +-------+------------+------------+ | 16 | 72.67% | 44.81% | +-------+------------+------------+ | 64 | 76.78% | 40.89% | +-------+------------+------------+ | 256 | 73.85% | 36.05% | +-------+------------+------------+ | 1024 | 75.73% | 36.70% | +-------+------------+------------+ S32_D565_Blend: =============== +-------+------------+------------+ | count | Cortex-A53 | Cortex-A57 | +-------+------------+------------+ | 1 | -9.99% | -13.79% | +-------+------------+------------+ | 2 | -9.17% | -6.74% | +-------+------------+------------+ | 4 | -6.73% | -4.42% | +-------+------------+------------+ | 8 | 163.31% | 112.82% | +-------+------------+------------+ | 16 | 55.21% | 44.68% | +-------+------------+------------+ | 64 | 54.09% | 41.99% | +-------+------------+------------+ | 256 | 52.63% | 40.64% | +-------+------------+------------+ | 1024 | 52.46% | 40.45% | +-------+------------+------------+ S32A_D565_Blend: ================ +-------+------------+------------+ | count | Cortex-A53 | Cortex-A57 | +-------+------------+------------+ | 1 | -5.88% | -6.06% | +-------+------------+------------+ | 2 | -4.74% | -0.01% | +-------+------------+------------+ | 4 | -5.42% | -3.03% | +-------+------------+------------+ | 8 | 78.78% | 77.96% | +-------+------------+------------+ | 16 | 98.19% | 79.61% | +-------+------------+------------+ | 64 | 111.56% | 72.60% | +-------+------------+------------+ | 256 | 113.80% | 69.96% | +-------+------------+------------+ | 1024 | 114.42% | 70.85% | +-------+------------+------------+ S32_D565_Opaque_Dither: ======================= +-------+------------+------------+ | count | Cortex-A53 | Cortex-A57 | +-------+------------+------------+ | 1 | -4.18% | -0.93% | +-------+------------+------------+ | 2 | -2.43% | -2.04% | +-------+------------+------------+ | 4 | -1.09% | -1.23% | +-------+------------+------------+ | 8 | 184.89% | 136.53% | +-------+------------+------------+ | 16 | 128.64% | 89.11% | +-------+------------+------------+ | 64 | 132.68% | 100.98% | +-------+------------+------------+ | 256 | 157.02% | 100.86% | +-------+------------+------------+ | 1024 | 163.85% | 103.62% | +-------+------------+------------+ S32_D565_Blend_Dither: ====================== +-------+------------+------------+ | count | Cortex-A53 | Cortex-A57 | +-------+------------+------------+ | 1 | -4.87% | 0.01% | +-------+------------+------------+ | 2 | -2.71% | 2.97% | +-------+------------+------------+ | 4 | -2.20% | 0.28% | +-------+------------+------------+ | 8 | 149.76% | 146.80% | +-------+------------+------------+ | 16 | 85.69% | 95.77% | +-------+------------+------------+ | 64 | 88.81% | 101.39% | +-------+------------+------------+ | 256 | 97.32% | 107.22% | +-------+------------+------------+ | 1024 | 98.08% | 115.71% | +-------+------------+------------+ S32A_D565_Opaque_Dither: ======================== +-------+------------+------------+ | count | Cortex-A53 | Cortex-A57 | +-------+------------+------------+ | 1 | -1.86% | 0.02% | +-------+------------+------------+ | 2 | -0.58% | -1.52% | +-------+------------+------------+ | 4 | -0.75% | 1.16% | +-------+------------+------------+ | 8 | 240.74% | 155.16% | +-------+------------+------------+ | 16 | 181.97% | 132.15% | +-------+------------+------------+ | 64 | 203.11% | 136.48% | +-------+------------+------------+ | 256 | 223.45% | 133.05% | +-------+------------+------------+ | 1024 | 225.96% | 134.05% | +-------+------------+------------+ Signed-off-by: Kévin PETIT <kevin.petit@arm.com> BUG=skia: R=djsollen@google.com, mtklein@google.com Author: kevin.petit@arm.com Review URL: https://codereview.chromium.org/317193003
This commit is contained in:
parent
2e44b51b9a
commit
ea6b46b6c0
@ -47,3 +47,8 @@ tilemode_bitmap
|
||||
tilemodes_npot
|
||||
tilemodes
|
||||
|
||||
# Added by kevin.petit@arm.com for https://codereview.chromium.org/317193003
|
||||
# Only S32A_D565_Blend is causing mismatches
|
||||
shadertext3
|
||||
drawbitmaprect
|
||||
coloremoji
|
||||
|
@ -17,7 +17,53 @@
|
||||
#include "SkColor_opts_neon.h"
|
||||
#include <arm_neon.h>
|
||||
|
||||
#ifdef SK_CPU_ARM32
|
||||
#ifdef SK_CPU_ARM64
|
||||
static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
|
||||
uint8x8x4_t vsrc;
|
||||
uint8x8_t vsrc_0, vsrc_1, vsrc_2;
|
||||
|
||||
asm (
|
||||
"ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
|
||||
"mov %[vsrc0].8b, v0.8b \t\n"
|
||||
"mov %[vsrc1].8b, v1.8b \t\n"
|
||||
"mov %[vsrc2].8b, v2.8b \t\n"
|
||||
: [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
|
||||
[vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
|
||||
: : "v0", "v1", "v2", "v3"
|
||||
);
|
||||
|
||||
vsrc.val[0] = vsrc_0;
|
||||
vsrc.val[1] = vsrc_1;
|
||||
vsrc.val[2] = vsrc_2;
|
||||
|
||||
return vsrc;
|
||||
}
|
||||
|
||||
static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
|
||||
uint8x8x4_t vsrc;
|
||||
uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;
|
||||
|
||||
asm (
|
||||
"ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
|
||||
"mov %[vsrc0].8b, v0.8b \t\n"
|
||||
"mov %[vsrc1].8b, v1.8b \t\n"
|
||||
"mov %[vsrc2].8b, v2.8b \t\n"
|
||||
"mov %[vsrc3].8b, v3.8b \t\n"
|
||||
: [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
|
||||
[vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
|
||||
[src] "+&r" (src)
|
||||
: : "v0", "v1", "v2", "v3"
|
||||
);
|
||||
|
||||
vsrc.val[0] = vsrc_0;
|
||||
vsrc.val[1] = vsrc_1;
|
||||
vsrc.val[2] = vsrc_2;
|
||||
vsrc.val[3] = vsrc_3;
|
||||
|
||||
return vsrc;
|
||||
}
|
||||
#endif
|
||||
|
||||
void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
|
||||
const SkPMColor* SK_RESTRICT src, int count,
|
||||
U8CPU alpha, int /*x*/, int /*y*/) {
|
||||
@ -28,7 +74,12 @@ void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
|
||||
uint16x8_t vdst;
|
||||
|
||||
// Load
|
||||
#ifdef SK_CPU_ARM64
|
||||
vsrc = sk_vld4_u8_arm64_3(src);
|
||||
#else
|
||||
vsrc = vld4_u8((uint8_t*)src);
|
||||
src += 8;
|
||||
#endif
|
||||
|
||||
// Convert src to 565
|
||||
vdst = SkPixel32ToPixel16_neon8(vsrc);
|
||||
@ -38,7 +89,6 @@ void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
|
||||
|
||||
// Prepare next iteration
|
||||
dst += 8;
|
||||
src += 8;
|
||||
count -= 8;
|
||||
};
|
||||
|
||||
@ -64,11 +114,14 @@ void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
|
||||
vmask_blue = vmovq_n_u16(0x1F);
|
||||
|
||||
while (count >= 8) {
|
||||
uint8x8x4_t vsrc;
|
||||
uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
|
||||
uint16x8_t vres_r, vres_g, vres_b;
|
||||
uint8x8_t vsrc_r, vsrc_g, vsrc_b;
|
||||
|
||||
// Load src
|
||||
#ifdef SK_CPU_ARM64
|
||||
vsrc = sk_vld4_u8_arm64_3(src);
|
||||
#else
|
||||
{
|
||||
register uint8x8_t d0 asm("d0");
|
||||
register uint8x8_t d1 asm("d1");
|
||||
@ -80,13 +133,11 @@ void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
|
||||
: "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
|
||||
:
|
||||
);
|
||||
vsrc_g = d1;
|
||||
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
|
||||
vsrc_r = d2; vsrc_b = d0;
|
||||
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
|
||||
vsrc_r = d0; vsrc_b = d2;
|
||||
#endif
|
||||
vsrc.val[0] = d0;
|
||||
vsrc.val[1] = d1;
|
||||
vsrc.val[2] = d2;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Load and unpack dst
|
||||
vdst = vld1q_u16(dst);
|
||||
@ -95,15 +146,15 @@ void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
|
||||
vdst_r = vshrq_n_u16(vdst, 6+5); // extract red
|
||||
vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green
|
||||
|
||||
// Shift src to 565
|
||||
vsrc_r = vshr_n_u8(vsrc_r, 3); // shift red to 565 range
|
||||
vsrc_g = vshr_n_u8(vsrc_g, 2); // shift green to 565 range
|
||||
vsrc_b = vshr_n_u8(vsrc_b, 3); // shift blue to 565 range
|
||||
// Shift src to 565 range
|
||||
vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
|
||||
vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
|
||||
vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);
|
||||
|
||||
// Scale src - dst
|
||||
vres_r = vmovl_u8(vsrc_r) - vdst_r;
|
||||
vres_g = vmovl_u8(vsrc_g) - vdst_g;
|
||||
vres_b = vmovl_u8(vsrc_b) - vdst_b;
|
||||
vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
|
||||
vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
|
||||
vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;
|
||||
|
||||
vres_r = vshrq_n_u16(vres_r * vscale, 8);
|
||||
vres_g = vshrq_n_u16(vres_g * vscale, 8);
|
||||
@ -136,6 +187,7 @@ void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef SK_CPU_ARM32
|
||||
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
|
||||
const SkPMColor* SK_RESTRICT src, int count,
|
||||
U8CPU alpha, int /*x*/, int /*y*/) {
|
||||
@ -313,6 +365,7 @@ void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
|
||||
);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
|
||||
prod += vdupq_n_u16(128);
|
||||
@ -346,6 +399,9 @@ void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
|
||||
|
||||
// load pixels
|
||||
vdst = vld1q_u16(dst);
|
||||
#ifdef SK_CPU_ARM64
|
||||
vsrc = sk_vld4_u8_arm64_4(src);
|
||||
#else
|
||||
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
|
||||
asm (
|
||||
"vld4.u8 %h[vsrc], [%[src]]!"
|
||||
@ -369,6 +425,7 @@ void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
|
||||
vsrc.val[2] = d2;
|
||||
vsrc.val[3] = d3;
|
||||
#endif
|
||||
#endif // #ifdef SK_CPU_ARM64
|
||||
|
||||
|
||||
// deinterleave dst
|
||||
@ -468,6 +525,7 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
|
||||
|
||||
do {
|
||||
|
||||
uint8x8x4_t vsrc;
|
||||
uint8x8_t vsrc_r, vsrc_g, vsrc_b;
|
||||
uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
|
||||
uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
|
||||
@ -478,6 +536,9 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
|
||||
int8x8_t vres8_r, vres8_g, vres8_b;
|
||||
|
||||
// Load source and add dither
|
||||
#ifdef SK_CPU_ARM64
|
||||
vsrc = sk_vld4_u8_arm64_3(src);
|
||||
#else
|
||||
{
|
||||
register uint8x8_t d0 asm("d0");
|
||||
register uint8x8_t d1 asm("d1");
|
||||
@ -485,17 +546,18 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
|
||||
register uint8x8_t d3 asm("d3");
|
||||
|
||||
asm (
|
||||
"vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
|
||||
"vld4.8 {d0-d3},[%[src]]! "
|
||||
: "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
|
||||
:
|
||||
);
|
||||
vsrc_g = d1;
|
||||
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
|
||||
vsrc_r = d2; vsrc_b = d0;
|
||||
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
|
||||
vsrc_r = d0; vsrc_b = d2;
|
||||
#endif
|
||||
vsrc.val[0] = d0;
|
||||
vsrc.val[1] = d1;
|
||||
vsrc.val[2] = d2;
|
||||
}
|
||||
#endif
|
||||
vsrc_r = vsrc.val[NEON_R];
|
||||
vsrc_g = vsrc.val[NEON_G];
|
||||
vsrc_b = vsrc.val[NEON_B];
|
||||
|
||||
vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
|
||||
vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
|
||||
@ -576,7 +638,6 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
|
||||
} while (--count != 0);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
|
||||
const SkPMColor* SK_RESTRICT src,
|
||||
@ -1047,6 +1108,7 @@ static void showme16(char *str, void *p, int len)
|
||||
SkDebugf("%s\n", buf);
|
||||
}
|
||||
#endif
|
||||
#endif // #ifdef SK_CPU_ARM32
|
||||
|
||||
void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
|
||||
const SkPMColor* SK_RESTRICT src,
|
||||
@ -1073,6 +1135,7 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
|
||||
dbase = vld1_u8(dstart);
|
||||
|
||||
do {
|
||||
uint8x8x4_t vsrc;
|
||||
uint8x8_t sr, sg, sb, sa, d;
|
||||
uint16x8_t dst8, scale8, alpha8;
|
||||
uint16x8_t dst_r, dst_g, dst_b;
|
||||
@ -1123,23 +1186,29 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef SK_CPU_ARM64
|
||||
vsrc = sk_vld4_u8_arm64_4(src);
|
||||
#else
|
||||
{
|
||||
register uint8x8_t d0 asm("d0");
|
||||
register uint8x8_t d1 asm("d1");
|
||||
register uint8x8_t d2 asm("d2");
|
||||
register uint8x8_t d3 asm("d3");
|
||||
|
||||
asm ("vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
|
||||
asm ("vld4.8 {d0-d3},[%[src]]! "
|
||||
: "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
|
||||
:
|
||||
);
|
||||
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
|
||||
sr = d2; sg = d1; sb = d0; sa = d3;
|
||||
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
|
||||
sr = d0; sg = d1; sb = d2; sa = d3;
|
||||
#endif
|
||||
vsrc.val[0] = d0;
|
||||
vsrc.val[1] = d1;
|
||||
vsrc.val[2] = d2;
|
||||
vsrc.val[3] = d3;
|
||||
}
|
||||
#endif
|
||||
sa = vsrc.val[NEON_A];
|
||||
sr = vsrc.val[NEON_R];
|
||||
sg = vsrc.val[NEON_G];
|
||||
sb = vsrc.val[NEON_B];
|
||||
|
||||
/* calculate 'd', which will be 0..7
|
||||
* dbase[] is 0..7; alpha is 0..256; 16 bits suffice
|
||||
@ -1283,7 +1352,11 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
|
||||
uint8x8_t sr, sg, sb;
|
||||
uint16x8_t dr, dg, db;
|
||||
uint16x8_t dst8;
|
||||
uint8x8x4_t vsrc;
|
||||
|
||||
#ifdef SK_CPU_ARM64
|
||||
vsrc = sk_vld4_u8_arm64_3(src);
|
||||
#else
|
||||
{
|
||||
register uint8x8_t d0 asm("d0");
|
||||
register uint8x8_t d1 asm("d1");
|
||||
@ -1291,17 +1364,19 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
|
||||
register uint8x8_t d3 asm("d3");
|
||||
|
||||
asm (
|
||||
"vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
|
||||
"vld4.8 {d0-d3},[%[src]]! "
|
||||
: "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
|
||||
:
|
||||
);
|
||||
sg = d1;
|
||||
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
|
||||
sr = d2; sb = d0;
|
||||
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
|
||||
sr = d0; sb = d2;
|
||||
#endif
|
||||
vsrc.val[0] = d0;
|
||||
vsrc.val[1] = d1;
|
||||
vsrc.val[2] = d2;
|
||||
}
|
||||
#endif
|
||||
sr = vsrc.val[NEON_R];
|
||||
sg = vsrc.val[NEON_G];
|
||||
sb = vsrc.val[NEON_B];
|
||||
|
||||
/* XXX: if we want to prefetch, hide it in the above asm()
|
||||
* using the gcc __builtin_prefetch(), the prefetch will
|
||||
* fall to the bottom of the loop -- it won't stick up
|
||||
@ -1369,7 +1444,6 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
|
||||
} while (--count != 0);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
|
||||
SkPMColor color) {
|
||||
@ -1475,11 +1549,14 @@ void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
|
||||
#ifdef SK_CPU_ARM32
|
||||
// no dither
|
||||
S32_D565_Opaque_neon,
|
||||
S32_D565_Blend_neon,
|
||||
#ifdef SK_CPU_ARM32
|
||||
S32A_D565_Opaque_neon,
|
||||
#else
|
||||
NULL,
|
||||
#endif
|
||||
S32A_D565_Blend_neon,
|
||||
|
||||
// dither
|
||||
@ -1487,10 +1564,6 @@ const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
|
||||
S32_D565_Blend_Dither_neon,
|
||||
S32A_D565_Opaque_Dither_neon,
|
||||
NULL, // S32A_D565_Blend_Dither
|
||||
#else
|
||||
NULL, NULL, NULL, NULL,
|
||||
NULL, NULL, NULL, NULL
|
||||
#endif
|
||||
};
|
||||
|
||||
const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
|
||||
|
Loading…
Reference in New Issue
Block a user