ARM Skia NEON patches - 40 - arm64: S32A_D565_Opaque

Here are some perf results:

+-------+------------+------------+
| count | Cortex-A53 | Cortex-A57 |
+-------+------------+------------+
|     1 |     -2.54% |     -5.39% |
+-------+------------+------------+
|     2 |     -0.66% |     -2.08% |
+-------+------------+------------+
|     4 |    -11.13% |      0.00% |
+-------+------------+------------+
|     8 |     -5.79% |     -1.30% |
+-------+------------+------------+
|    16 |     71.60% |     93.27% |
+-------+------------+------------+
|    64 |     30.99% |     57.35% |
+-------+------------+------------+
|   256 |     25.41% |     52.59% |
+-------+------------+------------+
|  1024 |     25.56% |     53.76% |
+-------+------------+------------+

Signed-off-by: Kévin PETIT <kevin.petit@arm.com>

BUG=skia:
R=mtklein@google.com, djsollen@google.com

Author: kevin.petit@arm.com

Review URL: https://codereview.chromium.org/346843003
This commit is contained in:
kevin.petit 2014-06-26 02:51:29 -07:00 committed by Commit bot
parent 1ea04a0a4d
commit 0d44830309

View File

@ -365,7 +365,98 @@ void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
);
}
}
#endif
#else // #ifdef SK_CPU_ARM32
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src, int count,
U8CPU alpha, int /*x*/, int /*y*/) {
SkASSERT(255 == alpha);
if (count >= 16) {
asm (
"movi v4.8h, #0x80 \t\n"
"1: \t\n"
"sub %[count], %[count], #16 \t\n"
"ld1 {v16.8h-v17.8h}, [%[dst]] \t\n"
"ld4 {v0.16b-v3.16b}, [%[src]], #64 \t\n"
"prfm pldl1keep, [%[src],#512] \t\n"
"prfm pldl1keep, [%[dst],#256] \t\n"
"ushr v20.8h, v17.8h, #5 \t\n"
"ushr v31.8h, v16.8h, #5 \t\n"
"xtn v6.8b, v31.8h \t\n"
"xtn2 v6.16b, v20.8h \t\n"
"ushr v20.8h, v17.8h, #11 \t\n"
"shl v19.16b, v6.16b, #2 \t\n"
"ushr v31.8h, v16.8h, #11 \t\n"
"xtn v22.8b, v31.8h \t\n"
"xtn2 v22.16b, v20.8h \t\n"
"shl v18.16b, v22.16b, #3 \t\n"
"mvn v3.16b, v3.16b \t\n"
"xtn v16.8b, v16.8h \t\n"
"mov v7.16b, v4.16b \t\n"
"xtn2 v16.16b, v17.8h \t\n"
"umlal v7.8h, v3.8b, v19.8b \t\n"
"shl v16.16b, v16.16b, #3 \t\n"
"mov v22.16b, v4.16b \t\n"
"ushr v24.8h, v7.8h, #6 \t\n"
"umlal v22.8h, v3.8b, v18.8b \t\n"
"ushr v20.8h, v22.8h, #5 \t\n"
"addhn v20.8b, v22.8h, v20.8h \t\n"
"cmp %[count], #16 \t\n"
"mov v6.16b, v4.16b \t\n"
"mov v5.16b, v4.16b \t\n"
"umlal v6.8h, v3.8b, v16.8b \t\n"
"umlal2 v5.8h, v3.16b, v19.16b \t\n"
"mov v17.16b, v4.16b \t\n"
"ushr v19.8h, v6.8h, #5 \t\n"
"umlal2 v17.8h, v3.16b, v18.16b \t\n"
"addhn v7.8b, v7.8h, v24.8h \t\n"
"ushr v18.8h, v5.8h, #6 \t\n"
"ushr v21.8h, v17.8h, #5 \t\n"
"addhn2 v7.16b, v5.8h, v18.8h \t\n"
"addhn2 v20.16b, v17.8h, v21.8h \t\n"
"mov v22.16b, v4.16b \t\n"
"addhn v6.8b, v6.8h, v19.8h \t\n"
"umlal2 v22.8h, v3.16b, v16.16b \t\n"
"ushr v5.8h, v22.8h, #5 \t\n"
"addhn2 v6.16b, v22.8h, v5.8h \t\n"
"uqadd v7.16b, v1.16b, v7.16b \t\n"
"uqadd v20.16b, v2.16b, v20.16b \t\n"
"uqadd v6.16b, v0.16b, v6.16b \t\n"
"shll v22.8h, v20.8b, #8 \t\n"
"shll v5.8h, v7.8b, #8 \t\n"
"sri v22.8h, v5.8h, #5 \t\n"
"shll v17.8h, v6.8b, #8 \t\n"
"shll2 v23.8h, v20.16b, #8 \t\n"
"shll2 v7.8h, v7.16b, #8 \t\n"
"sri v22.8h, v17.8h, #11 \t\n"
"sri v23.8h, v7.8h, #5 \t\n"
"shll2 v6.8h, v6.16b, #8 \t\n"
"st1 {v22.8h}, [%[dst]], #16 \t\n"
"sri v23.8h, v6.8h, #11 \t\n"
"st1 {v23.8h}, [%[dst]], #16 \t\n"
"b.ge 1b \t\n"
: [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count)
:: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
"v31"
);
}
// Leftovers
if (count > 0) {
do {
SkPMColor c = *src++;
SkPMColorAssert(c);
if (c) {
*dst = SkSrcOver32To16(c, *dst);
}
dst += 1;
} while (--count != 0);
}
}
#endif // #ifdef SK_CPU_ARM32
static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
prod += vdupq_n_u16(128);
@ -1552,11 +1643,7 @@ const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
// no dither
S32_D565_Opaque_neon,
S32_D565_Blend_neon,
#ifdef SK_CPU_ARM32
S32A_D565_Opaque_neon,
#else
NULL,
#endif
S32A_D565_Blend_neon,
// dither