ARM Skia NEON patches - 39 - arm64 565 blitters

This enables all 565 blitters except S32A_D565_Opaque.

Here are some performance results:

S32_D565_Opaque:
================

+-------+------------+------------+
| count | Cortex-A53 | Cortex-A57 |
+-------+------------+------------+
|     1 |    -18.37% |    -13.04% |
+-------+------------+------------+
|     2 |     -9.90% |    -13.78% |
+-------+------------+------------+
|     4 |     -8.28% |     -6.77% |
+-------+------------+------------+
|     8 |    157.63% |     78.15% |
+-------+------------+------------+
|    16 |     72.67% |     44.81% |
+-------+------------+------------+
|    64 |     76.78% |     40.89% |
+-------+------------+------------+
|   256 |     73.85% |     36.05% |
+-------+------------+------------+
|  1024 |     75.73% |     36.70% |
+-------+------------+------------+

S32_D565_Blend:
===============

+-------+------------+------------+
| count | Cortex-A53 | Cortex-A57 |
+-------+------------+------------+
|     1 |     -9.99% |    -13.79% |
+-------+------------+------------+
|     2 |     -9.17% |     -6.74% |
+-------+------------+------------+
|     4 |     -6.73% |     -4.42% |
+-------+------------+------------+
|     8 |    163.31% |    112.82% |
+-------+------------+------------+
|    16 |     55.21% |     44.68% |
+-------+------------+------------+
|    64 |     54.09% |     41.99% |
+-------+------------+------------+
|   256 |     52.63% |     40.64% |
+-------+------------+------------+
|  1024 |     52.46% |     40.45% |
+-------+------------+------------+

S32A_D565_Blend:
================

+-------+------------+------------+
| count | Cortex-A53 | Cortex-A57 |
+-------+------------+------------+
|     1 |     -5.88% |     -6.06% |
+-------+------------+------------+
|     2 |     -4.74% |     -0.01% |
+-------+------------+------------+
|     4 |     -5.42% |     -3.03% |
+-------+------------+------------+
|     8 |     78.78% |     77.96% |
+-------+------------+------------+
|    16 |     98.19% |     79.61% |
+-------+------------+------------+
|    64 |    111.56% |     72.60% |
+-------+------------+------------+
|   256 |    113.80% |     69.96% |
+-------+------------+------------+
|  1024 |    114.42% |     70.85% |
+-------+------------+------------+

S32_D565_Opaque_Dither:
=======================

+-------+------------+------------+
| count | Cortex-A53 | Cortex-A57 |
+-------+------------+------------+
|     1 |     -4.18% |     -0.93% |
+-------+------------+------------+
|     2 |     -2.43% |     -2.04% |
+-------+------------+------------+
|     4 |     -1.09% |     -1.23% |
+-------+------------+------------+
|     8 |    184.89% |    136.53% |
+-------+------------+------------+
|    16 |    128.64% |     89.11% |
+-------+------------+------------+
|    64 |    132.68% |    100.98% |
+-------+------------+------------+
|   256 |    157.02% |    100.86% |
+-------+------------+------------+
|  1024 |    163.85% |    103.62% |
+-------+------------+------------+

S32_D565_Blend_Dither:
======================

+-------+------------+------------+
| count | Cortex-A53 | Cortex-A57 |
+-------+------------+------------+
|     1 |     -4.87% |      0.01% |
+-------+------------+------------+
|     2 |     -2.71% |      2.97% |
+-------+------------+------------+
|     4 |     -2.20% |      0.28% |
+-------+------------+------------+
|     8 |    149.76% |    146.80% |
+-------+------------+------------+
|    16 |     85.69% |     95.77% |
+-------+------------+------------+
|    64 |     88.81% |    101.39% |
+-------+------------+------------+
|   256 |     97.32% |    107.22% |
+-------+------------+------------+
|  1024 |     98.08% |    115.71% |
+-------+------------+------------+

S32A_D565_Opaque_Dither:
========================

+-------+------------+------------+
| count | Cortex-A53 | Cortex-A57 |
+-------+------------+------------+
|     1 |     -1.86% |      0.02% |
+-------+------------+------------+
|     2 |     -0.58% |     -1.52% |
+-------+------------+------------+
|     4 |     -0.75% |      1.16% |
+-------+------------+------------+
|     8 |    240.74% |    155.16% |
+-------+------------+------------+
|    16 |    181.97% |    132.15% |
+-------+------------+------------+
|    64 |    203.11% |    136.48% |
+-------+------------+------------+
|   256 |    223.45% |    133.05% |
+-------+------------+------------+
|  1024 |    225.96% |    134.05% |
+-------+------------+------------+

Signed-off-by: Kévin PETIT <kevin.petit@arm.com>

BUG=skia:
R=djsollen@google.com, mtklein@google.com

Author: kevin.petit@arm.com

Review URL: https://codereview.chromium.org/317193003
This commit is contained in:
kevin.petit 2014-06-06 08:45:36 -07:00 committed by Commit bot
parent 2e44b51b9a
commit ea6b46b6c0
2 changed files with 122 additions and 44 deletions

View File

@ -47,3 +47,8 @@ tilemode_bitmap
tilemodes_npot
tilemodes
# Added by kevin.petit@arm.com for https://codereview.chromium.org/317193003
# Only S32A_D565_Blend is causing mismatches
shadertext3
drawbitmaprect
coloremoji

View File

@ -17,7 +17,53 @@
#include "SkColor_opts_neon.h"
#include <arm_neon.h>
#ifdef SK_CPU_ARM32
#ifdef SK_CPU_ARM64
static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
uint8x8x4_t vsrc;
uint8x8_t vsrc_0, vsrc_1, vsrc_2;
asm (
"ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
"mov %[vsrc0].8b, v0.8b \t\n"
"mov %[vsrc1].8b, v1.8b \t\n"
"mov %[vsrc2].8b, v2.8b \t\n"
: [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
[vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
: : "v0", "v1", "v2", "v3"
);
vsrc.val[0] = vsrc_0;
vsrc.val[1] = vsrc_1;
vsrc.val[2] = vsrc_2;
return vsrc;
}
static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
uint8x8x4_t vsrc;
uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;
asm (
"ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
"mov %[vsrc0].8b, v0.8b \t\n"
"mov %[vsrc1].8b, v1.8b \t\n"
"mov %[vsrc2].8b, v2.8b \t\n"
"mov %[vsrc3].8b, v3.8b \t\n"
: [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
[vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
[src] "+&r" (src)
: : "v0", "v1", "v2", "v3"
);
vsrc.val[0] = vsrc_0;
vsrc.val[1] = vsrc_1;
vsrc.val[2] = vsrc_2;
vsrc.val[3] = vsrc_3;
return vsrc;
}
#endif
void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src, int count,
U8CPU alpha, int /*x*/, int /*y*/) {
@ -28,7 +74,12 @@ void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
uint16x8_t vdst;
// Load
#ifdef SK_CPU_ARM64
vsrc = sk_vld4_u8_arm64_3(src);
#else
vsrc = vld4_u8((uint8_t*)src);
src += 8;
#endif
// Convert src to 565
vdst = SkPixel32ToPixel16_neon8(vsrc);
@ -38,7 +89,6 @@ void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
// Prepare next iteration
dst += 8;
src += 8;
count -= 8;
};
@ -64,11 +114,14 @@ void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
vmask_blue = vmovq_n_u16(0x1F);
while (count >= 8) {
uint8x8x4_t vsrc;
uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
uint16x8_t vres_r, vres_g, vres_b;
uint8x8_t vsrc_r, vsrc_g, vsrc_b;
// Load src
#ifdef SK_CPU_ARM64
vsrc = sk_vld4_u8_arm64_3(src);
#else
{
register uint8x8_t d0 asm("d0");
register uint8x8_t d1 asm("d1");
@ -80,13 +133,11 @@ void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
: "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
:
);
vsrc_g = d1;
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
vsrc_r = d2; vsrc_b = d0;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
vsrc_r = d0; vsrc_b = d2;
#endif
vsrc.val[0] = d0;
vsrc.val[1] = d1;
vsrc.val[2] = d2;
}
#endif
// Load and unpack dst
vdst = vld1q_u16(dst);
@ -95,15 +146,15 @@ void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
vdst_r = vshrq_n_u16(vdst, 6+5); // extract red
vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green
// Shift src to 565
vsrc_r = vshr_n_u8(vsrc_r, 3); // shift red to 565 range
vsrc_g = vshr_n_u8(vsrc_g, 2); // shift green to 565 range
vsrc_b = vshr_n_u8(vsrc_b, 3); // shift blue to 565 range
// Shift src to 565 range
vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);
// Scale src - dst
vres_r = vmovl_u8(vsrc_r) - vdst_r;
vres_g = vmovl_u8(vsrc_g) - vdst_g;
vres_b = vmovl_u8(vsrc_b) - vdst_b;
vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;
vres_r = vshrq_n_u16(vres_r * vscale, 8);
vres_g = vshrq_n_u16(vres_g * vscale, 8);
@ -136,6 +187,7 @@ void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
}
}
#ifdef SK_CPU_ARM32
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src, int count,
U8CPU alpha, int /*x*/, int /*y*/) {
@ -313,6 +365,7 @@ void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
);
}
}
#endif
static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
prod += vdupq_n_u16(128);
@ -346,6 +399,9 @@ void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
// load pixels
vdst = vld1q_u16(dst);
#ifdef SK_CPU_ARM64
vsrc = sk_vld4_u8_arm64_4(src);
#else
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
asm (
"vld4.u8 %h[vsrc], [%[src]]!"
@ -369,6 +425,7 @@ void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
vsrc.val[2] = d2;
vsrc.val[3] = d3;
#endif
#endif // #ifdef SK_CPU_ARM64
// deinterleave dst
@ -468,6 +525,7 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
do {
uint8x8x4_t vsrc;
uint8x8_t vsrc_r, vsrc_g, vsrc_b;
uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
@ -478,6 +536,9 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
int8x8_t vres8_r, vres8_g, vres8_b;
// Load source and add dither
#ifdef SK_CPU_ARM64
vsrc = sk_vld4_u8_arm64_3(src);
#else
{
register uint8x8_t d0 asm("d0");
register uint8x8_t d1 asm("d1");
@ -485,17 +546,18 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
register uint8x8_t d3 asm("d3");
asm (
"vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
"vld4.8 {d0-d3},[%[src]]! "
: "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
:
);
vsrc_g = d1;
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
vsrc_r = d2; vsrc_b = d0;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
vsrc_r = d0; vsrc_b = d2;
#endif
vsrc.val[0] = d0;
vsrc.val[1] = d1;
vsrc.val[2] = d2;
}
#endif
vsrc_r = vsrc.val[NEON_R];
vsrc_g = vsrc.val[NEON_G];
vsrc_b = vsrc.val[NEON_B];
vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
@ -576,7 +638,6 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
} while (--count != 0);
}
}
#endif
void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
@ -1047,6 +1108,7 @@ static void showme16(char *str, void *p, int len)
SkDebugf("%s\n", buf);
}
#endif
#endif // #ifdef SK_CPU_ARM32
void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
@ -1073,6 +1135,7 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
dbase = vld1_u8(dstart);
do {
uint8x8x4_t vsrc;
uint8x8_t sr, sg, sb, sa, d;
uint16x8_t dst8, scale8, alpha8;
uint16x8_t dst_r, dst_g, dst_b;
@ -1123,23 +1186,29 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
}
#endif
#ifdef SK_CPU_ARM64
vsrc = sk_vld4_u8_arm64_4(src);
#else
{
register uint8x8_t d0 asm("d0");
register uint8x8_t d1 asm("d1");
register uint8x8_t d2 asm("d2");
register uint8x8_t d3 asm("d3");
asm ("vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
asm ("vld4.8 {d0-d3},[%[src]]! "
: "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
:
);
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
sr = d2; sg = d1; sb = d0; sa = d3;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
sr = d0; sg = d1; sb = d2; sa = d3;
#endif
vsrc.val[0] = d0;
vsrc.val[1] = d1;
vsrc.val[2] = d2;
vsrc.val[3] = d3;
}
#endif
sa = vsrc.val[NEON_A];
sr = vsrc.val[NEON_R];
sg = vsrc.val[NEON_G];
sb = vsrc.val[NEON_B];
/* calculate 'd', which will be 0..7
* dbase[] is 0..7; alpha is 0..256; 16 bits suffice
@ -1283,7 +1352,11 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
uint8x8_t sr, sg, sb;
uint16x8_t dr, dg, db;
uint16x8_t dst8;
uint8x8x4_t vsrc;
#ifdef SK_CPU_ARM64
vsrc = sk_vld4_u8_arm64_3(src);
#else
{
register uint8x8_t d0 asm("d0");
register uint8x8_t d1 asm("d1");
@ -1291,17 +1364,19 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
register uint8x8_t d3 asm("d3");
asm (
"vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
"vld4.8 {d0-d3},[%[src]]! "
: "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
:
);
sg = d1;
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
sr = d2; sb = d0;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
sr = d0; sb = d2;
#endif
vsrc.val[0] = d0;
vsrc.val[1] = d1;
vsrc.val[2] = d2;
}
#endif
sr = vsrc.val[NEON_R];
sg = vsrc.val[NEON_G];
sb = vsrc.val[NEON_B];
/* XXX: if we want to prefetch, hide it in the above asm()
* using the gcc __builtin_prefetch(), the prefetch will
* fall to the bottom of the loop -- it won't stick up
@ -1369,7 +1444,6 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
} while (--count != 0);
}
}
#endif
void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
SkPMColor color) {
@ -1475,11 +1549,14 @@ void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
///////////////////////////////////////////////////////////////////////////////
const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
#ifdef SK_CPU_ARM32
// no dither
S32_D565_Opaque_neon,
S32_D565_Blend_neon,
#ifdef SK_CPU_ARM32
S32A_D565_Opaque_neon,
#else
NULL,
#endif
S32A_D565_Blend_neon,
// dither
@ -1487,10 +1564,6 @@ const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
S32_D565_Blend_Dither_neon,
S32A_D565_Opaque_Dither_neon,
NULL, // S32A_D565_Blend_Dither
#else
NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL
#endif
};
const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {