ARM Skia NEON patches - 41 - arm64: SkXfermode::xfer32
Currently the NEON code for Xfermodes performs well on arm64 targets except for dstout and dstin, which are significantly slower than the C code. This patch fixes this and gives further improvements on other modes.

Here are some perf results:

+------------+------------+------------+
| mode       | Cortex-A53 | Cortex-A57 |
+------------+------------+------------+
| multiply   | +24.58%    | +23.71%    |
+------------+------------+------------+
| exclusion  | +22.72%    | +22.05%    |
+------------+------------+------------+
| difference | +34.67%    | +36.82%    |
+------------+------------+------------+
| hardlight  | +17.07%    | +14.74%    |
+------------+------------+------------+
| lighten    | +38.21%    | +32.87%    |
+------------+------------+------------+
| darken     | +37.59%    | +32.99%    |
+------------+------------+------------+
| overlay    | +17.36%    | +16.88%    |
+------------+------------+------------+
| screen     | +52.56%    | +54.43%    |
+------------+------------+------------+
| modulate   | +62.85%    | +61.32%    |
+------------+------------+------------+
| plus       | +91.52%    | +117.41%   |
+------------+------------+------------+
| xor        | +42.86%    | +43.38%    |
+------------+------------+------------+
| dstatop    | +48.46%    | +48.99%    |
+------------+------------+------------+
| srcatop    | +50.50%    | +48.51%    |
+------------+------------+------------+
| dstout     | +67.83%    | +78.09%    |
+------------+------------+------------+
| srcout     | +69.02%    | +78.26%    |
+------------+------------+------------+
| dstin      | +70.92%    | +79.24%    |
+------------+------------+------------+
| srcin      | +68.90%    | +78.23%    |
+------------+------------+------------+
| dstover    | +73.80%    | +68.10%    |
+------------+------------+------------+

Signed-off-by: Kévin PETIT <kevin.petit@arm.com>

BUG=skia
R=mtklein@google.com, djsollen@google.com

Author: kevin.petit@arm.com

Review URL: https://codereview.chromium.org/350343002
This commit is contained in:
parent
5f6102d079
commit
4dc94d9dfc
@ -748,8 +748,9 @@ SkNEONProcCoeffXfermode::SkNEONProcCoeffXfermode(SkReadBuffer& buffer)
|
||||
fProcSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[this->getMode()]);
|
||||
}
|
||||
|
||||
void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
|
||||
int count, const SkAlpha aa[]) const {
|
||||
void SkNEONProcCoeffXfermode::xfer32(SkPMColor* SK_RESTRICT dst,
|
||||
const SkPMColor* SK_RESTRICT src, int count,
|
||||
const SkAlpha* SK_RESTRICT aa) const {
|
||||
SkASSERT(dst && src && count >= 0);
|
||||
|
||||
SkXfermodeProc proc = this->getProc();
|
||||
@ -758,13 +759,16 @@ void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
|
||||
|
||||
if (NULL == aa) {
|
||||
// Unrolled NEON code
|
||||
// We'd like to just do this (modulo a few casts):
|
||||
// vst4_u8(dst, procSIMD(vld4_u8(src), vld4_u8(dst)));
|
||||
// src += 8;
|
||||
// dst += 8;
|
||||
// but that tends to generate miserable code. Here are a bunch of faster
|
||||
// workarounds for different architectures and compilers.
|
||||
while (count >= 8) {
|
||||
uint8x8x4_t vsrc, vdst, vres;
|
||||
|
||||
#ifdef SK_CPU_ARM64
|
||||
vsrc = vld4_u8((uint8_t*)src);
|
||||
vdst = vld4_u8((uint8_t*)dst);
|
||||
#else
|
||||
#ifdef SK_CPU_ARM32
|
||||
uint8x8x4_t vsrc, vdst, vres;
|
||||
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
|
||||
asm volatile (
|
||||
"vld4.u8 %h[vsrc], [%[src]]! \t\n"
|
||||
@ -797,17 +801,36 @@ void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
|
||||
vsrc.val[2] = d2; vdst.val[2] = d6;
|
||||
vsrc.val[3] = d3; vdst.val[3] = d7;
|
||||
#endif
|
||||
#endif // #ifdef SK_CPU_ARM64
|
||||
|
||||
vres = procSIMD(vsrc, vdst);
|
||||
|
||||
vst4_u8((uint8_t*)dst, vres);
|
||||
|
||||
count -= 8;
|
||||
dst += 8;
|
||||
#ifdef SK_CPU_ARM64
|
||||
src += 8;
|
||||
#endif
|
||||
|
||||
#else // #ifdef SK_CPU_ARM32
|
||||
|
||||
asm volatile (
|
||||
"ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
|
||||
"ld4 {v4.8b - v7.8b}, [%[dst]] \t\n"
|
||||
"blr %[proc] \t\n"
|
||||
"st4 {v0.8b - v3.8b}, [%[dst]], #32 \t\n"
|
||||
: [src] "+&r" (src), [dst] "+&r" (dst)
|
||||
: [proc] "r" (procSIMD)
|
||||
: "cc", "memory",
|
||||
/* We don't know what proc is going to clobber so we must
|
||||
* add everything that is not callee-saved.
|
||||
*/
|
||||
"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9",
|
||||
"x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18",
|
||||
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
|
||||
"v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
|
||||
"v27", "v28", "v29", "v30", "v31"
|
||||
);
|
||||
|
||||
#endif // #ifdef SK_CPU_ARM32
|
||||
|
||||
count -= 8;
|
||||
}
|
||||
// Leftovers
|
||||
for (int i = 0; i < count; i++) {
|
||||
|
Loading…
Reference in New Issue
Block a user