glibc/sysdeps/aarch64/multiarch/memcpy_a64fx.S
Wilco Dijkstra b31bd11454 AArch64: Improve A64FX memcpy
v2 is a complete rewrite of the A64FX memcpy. Performance is improved
by streamlining the code, aligning all large copies and using a single
unrolled loop for all sizes. The code size for memcpy and memmove goes
down from 1796 bytes to 868 bytes. Performance is better in all cases:
bench-memcpy-random is 2.3% faster overall, bench-memcpy-large is ~33%
faster for large sizes, bench-memcpy-walk is 25% faster for small sizes
and 20% for the largest sizes. The geomean of all tests in bench-memcpy
is 5.1% faster, and total time is reduced by 4%.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
2021-12-02 18:36:03 +00:00

314 lines
7.2 KiB
ArmAsm

/* Optimized memcpy for Fujitsu A64FX processor.
Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
/* Drop any existing definition of BTI_C and define it as empty.
   NOTE(review): presumably ENTRY from <sysdep.h> expands BTI_C at each
   function entry, so this keeps a BTI landing pad out of these
   routines -- confirm against sysdep.h.  */
#undef BTI_C
#define BTI_C
/* Assumptions:
*
* ARMv8.2-a, AArch64, unaligned accesses, sve
*
*/
/* Register aliases.  These are preprocessor macros, so each name is
   replaced wherever it appears as a separate token in the rest of the
   file; do not reuse these names (n, src, dst, tmp, ...) for assembler
   macro parameters or labels.  */
/* Destination argument.  Only ever read (as a store base or to derive
   dst/dstend); no instruction in this file writes it.  */
#define dstin x0
/* Source pointer; advanced by the large-copy paths.  */
#define src x1
/* Number of bytes still to copy.  */
#define n x2
/* Working destination pointer used by the large-copy loops and by
   L(last_bytes).  */
#define dst x3
/* One past the last destination byte (dstin + n or dst + n).  */
#define dstend x4
/* One past the last source byte (src + n).  */
#define srcend x5
/* Scratch: alignment byte counts and the dstin - src difference.  */
#define tmp x6
/* Vector length in bytes, as returned by cntb.  */
#define vlen x7
/* 8 * vlen: bytes moved per iteration of the unrolled loops.  */
#define vlen8 x8
/* Everything below is assembled only when HAVE_AARCH64_SVE_ASM is
   non-zero and IS_IN (libc) holds; both conditionals are closed at the
   end of the file.  */
#if HAVE_AARCH64_SVE_ASM
# if IS_IN (libc)
/* Symbol names under which the two entry points below are defined.  */
# define MEMCPY __memcpy_a64fx
# define MEMMOVE __memmove_a64fx
/* Let the assembler accept the SVE instructions used below (cntb,
   ptrue, whilelo, ld1b/st1b on z registers).  */
.arch armv8.2-a+sve
/* ld1b_unroll8: load the eight consecutive vectors starting at src into
   z0-z7 under predicate p0 (inactive lanes are zeroed by the /z form).
   The "mul vl" offsets are in units of the vector length.  src is read
   but not updated.  Both call sites in this file execute ptrue p0.b
   first, so every lane is active there.  */
.macro ld1b_unroll8
ld1b z0.b, p0/z, [src, 0, mul vl]
ld1b z1.b, p0/z, [src, 1, mul vl]
ld1b z2.b, p0/z, [src, 2, mul vl]
ld1b z3.b, p0/z, [src, 3, mul vl]
ld1b z4.b, p0/z, [src, 4, mul vl]
ld1b z5.b, p0/z, [src, 5, mul vl]
ld1b z6.b, p0/z, [src, 6, mul vl]
ld1b z7.b, p0/z, [src, 7, mul vl]
.endm
/* stld1b_unroll4a: first half of a pipelined loop step.  Stores z0-z3
   to vectors 0-3 at dst and refills the same registers from vectors
   0-3 at src, two registers at a time: each pair is stored before it is
   overwritten by its reload.  dst and src are read but not updated;
   the loops that use this macro keep them pointing at different
   8-vector blocks.  */
.macro stld1b_unroll4a
/* Store, then reload, z0/z1.  */
st1b z0.b, p0, [dst, 0, mul vl]
st1b z1.b, p0, [dst, 1, mul vl]
ld1b z0.b, p0/z, [src, 0, mul vl]
ld1b z1.b, p0/z, [src, 1, mul vl]
/* Store, then reload, z2/z3.  */
st1b z2.b, p0, [dst, 2, mul vl]
st1b z3.b, p0, [dst, 3, mul vl]
ld1b z2.b, p0/z, [src, 2, mul vl]
ld1b z3.b, p0/z, [src, 3, mul vl]
.endm
/* stld1b_unroll4b: second half of a pipelined loop step.  Same pattern
   as stld1b_unroll4a but for z4-z7 and vectors 4-7: each register pair
   is stored to dst and then reloaded from src.  dst and src are read
   but not updated.  */
.macro stld1b_unroll4b
/* Store, then reload, z4/z5.  */
st1b z4.b, p0, [dst, 4, mul vl]
st1b z5.b, p0, [dst, 5, mul vl]
ld1b z4.b, p0/z, [src, 4, mul vl]
ld1b z5.b, p0/z, [src, 5, mul vl]
/* Store, then reload, z6/z7.  */
st1b z6.b, p0, [dst, 6, mul vl]
st1b z7.b, p0, [dst, 7, mul vl]
ld1b z6.b, p0/z, [src, 6, mul vl]
ld1b z7.b, p0/z, [src, 7, mul vl]
.endm
/* stld1b_unroll8: one full step of the software-pipelined copy loops.
   Writes the eight vectors currently held in z0-z7 to dst and refills
   z0-z7 with the eight vectors at src, by expanding the two 4-vector
   halves in order.  */
.macro stld1b_unroll8
stld1b_unroll4a
stld1b_unroll4b
.endm
/* st1b_unroll8: store z0-z7 to the eight consecutive vectors starting
   at dst under predicate p0, without loading anything.  Used after the
   pipelined loops to write out the last block that was loaded.  dst is
   read but not updated.  */
.macro st1b_unroll8
st1b z0.b, p0, [dst, 0, mul vl]
st1b z1.b, p0, [dst, 1, mul vl]
st1b z2.b, p0, [dst, 2, mul vl]
st1b z3.b, p0, [dst, 3, mul vl]
st1b z4.b, p0, [dst, 4, mul vl]
st1b z5.b, p0, [dst, 5, mul vl]
st1b z6.b, p0, [dst, 6, mul vl]
st1b z7.b, p0, [dst, 7, mul vl]
.endm
/* BTI_C is undefined and redefined as empty once more; the identical
   pair of directives already follows the #include of <sysdep.h> near
   the top of the file, so this repetition changes nothing.  */
#undef BTI_C
#define BTI_C
/* MEMCPY: copy n bytes from src to dstin.

   In:  dstin (x0) = destination, src (x1) = source, n (x2) = byte count.
   No instruction in this routine writes dstin.
   Uses dst, dstend, srcend, tmp, vlen, vlen8 (x3-x8), z0-z7, p0, p1
   and the condition flags.

   With VL = vector length in bytes (cntb):
     n <= 2*VL          two predicated vectors, exactly n bytes touched.
     2*VL < n <= 8*VL   L(copy_small): full vectors taken from the start
			and from the end of the buffer; the two groups may
			overlap in the middle.
     n > 8*VL           L(copy_large): copy a head so that dst becomes
			VL-aligned, run the 8-vector pipelined loop, then
			finish in L(last_bytes).

   L(copy_small), L(copy_large) and L(last_bytes) are also branch
   targets of MEMMOVE.  */
ENTRY (MEMCPY)
/* NOTE(review): PTR_ARG and SIZE_ARG are macros from <sysdep.h>;
   presumably they normalise the pointer/size argument registers for
   ILP32 builds -- confirm.  */
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
/* vlen = vector length in bytes.  */
cntb vlen
/* More than two vectors: use one of the larger paths.  */
cmp n, vlen, lsl 1
b.hi L(copy_small)
/* n <= 2 * vlen.  p0 enables lanes i with i < n (first vector) and p1
   enables lanes i with vlen + i < n (second vector).  Inactive lanes
   are neither loaded nor stored, so n == 0 touches no memory.  */
whilelo p1.b, vlen, n
whilelo p0.b, xzr, n
ld1b z0.b, p0/z, [src, 0, mul vl]
ld1b z1.b, p1/z, [src, 1, mul vl]
st1b z0.b, p0, [dstin, 0, mul vl]
st1b z1.b, p1, [dstin, 1, mul vl]
ret
.p2align 4
/* Medium sizes.  Reached from MEMCPY with n > 2 * vlen, and from
   MEMMOVE only when 2 * vlen < n <= 8 * vlen.  In both sub-cases below
   every load is issued before the first store.  */
L(copy_small):
cmp n, vlen, lsl 3
b.hi L(copy_large)
add dstend, dstin, n
add srcend, src, n
cmp n, vlen, lsl 2
b.hi 1f
/* Copy 2-4 vectors: the first two vectors from the start and the last
   two from the end; they overlap when n < 4 * vlen.  */
ptrue p0.b
ld1b z0.b, p0/z, [src, 0, mul vl]
ld1b z1.b, p0/z, [src, 1, mul vl]
ld1b z2.b, p0/z, [srcend, -2, mul vl]
ld1b z3.b, p0/z, [srcend, -1, mul vl]
st1b z0.b, p0, [dstin, 0, mul vl]
st1b z1.b, p0, [dstin, 1, mul vl]
st1b z2.b, p0, [dstend, -2, mul vl]
st1b z3.b, p0, [dstend, -1, mul vl]
ret
.p2align 4
/* Copy 4-8 vectors: the first four vectors from the start and the last
   four from the end; they overlap when n < 8 * vlen.  */
1: ptrue p0.b
ld1b z0.b, p0/z, [src, 0, mul vl]
ld1b z1.b, p0/z, [src, 1, mul vl]
ld1b z2.b, p0/z, [src, 2, mul vl]
ld1b z3.b, p0/z, [src, 3, mul vl]
ld1b z4.b, p0/z, [srcend, -4, mul vl]
ld1b z5.b, p0/z, [srcend, -3, mul vl]
ld1b z6.b, p0/z, [srcend, -2, mul vl]
ld1b z7.b, p0/z, [srcend, -1, mul vl]
st1b z0.b, p0, [dstin, 0, mul vl]
st1b z1.b, p0, [dstin, 1, mul vl]
st1b z2.b, p0, [dstin, 2, mul vl]
st1b z3.b, p0, [dstin, 3, mul vl]
st1b z4.b, p0, [dstend, -4, mul vl]
st1b z5.b, p0, [dstend, -3, mul vl]
st1b z6.b, p0, [dstend, -2, mul vl]
st1b z7.b, p0, [dstend, -1, mul vl]
ret
.p2align 4
/* More than 8 vectors - always align to vector length for
   higher and consistent write performance.  */
L(copy_large):
/* tmp = vlen - (dstin & (vlen - 1)).  Provided vlen is a power of two
   this is the distance from dstin to the next vlen boundary, in the
   range [1, vlen] (a whole vector when dstin is already aligned).  */
sub tmp, vlen, 1
and tmp, dstin, tmp
sub tmp, vlen, tmp
/* Copy the first tmp bytes with a predicated vector, then step past
   them.  */
whilelo p1.b, xzr, tmp
ld1b z1.b, p1/z, [src]
st1b z1.b, p1, [dstin]
add dst, dstin, tmp
add src, src, tmp
sub n, n, tmp
/* p0 = all lanes active; L(last_bytes) relies on this as well.  */
ptrue p0.b
lsl vlen8, vlen, 3
/* n is kept biased by -vlen8 while blocks are in flight.  If at most
   8 vectors remain, skip the loop; 3: undoes the subtraction.  */
subs n, n, vlen8
b.ls 3f
/* Preload the first block.  From here on src runs one block ahead of
   dst.  */
ld1b_unroll8
add src, src, vlen8
/* If at most 8 vectors remain beyond the preloaded block, just store
   that block at 2:.  */
subs n, n, vlen8
b.ls 2f
.p2align 4
/* 8x unrolled and software pipelined loop: each pass stores the block
   held in z0-z7 to dst and loads the following block from src.  */
1: stld1b_unroll8
add dst, dst, vlen8
add src, src, vlen8
subs n, n, vlen8
b.hi 1b
/* Write out the last block that was loaded.  */
2: st1b_unroll8
add dst, dst, vlen8
/* Undo the last subtraction: n = bytes still to copy (at most vlen8).  */
3: add n, n, vlen8
/* Move last 0-8 vectors.  On entry src and dst point at the n bytes
   still to copy and p0 has all lanes active (both the forward and the
   backward large-copy paths execute ptrue p0.b before getting here).
   On every path below all loads precede the first store.  */
L(last_bytes):
cmp n, vlen, lsl 1
b.hi 1f
/* At most two vectors: predicated copy of exactly n bytes.  */
whilelo p0.b, xzr, n
whilelo p1.b, vlen, n
ld1b z0.b, p0/z, [src, 0, mul vl]
ld1b z1.b, p1/z, [src, 1, mul vl]
st1b z0.b, p0, [dst, 0, mul vl]
st1b z1.b, p1, [dst, 1, mul vl]
ret
.p2align 4
/* More than two vectors: load the first two and the last two vectors,
   then decide whether the middle needs four more.  */
1: add srcend, src, n
add dstend, dst, n
ld1b z0.b, p0/z, [src, 0, mul vl]
ld1b z1.b, p0/z, [src, 1, mul vl]
ld1b z2.b, p0/z, [srcend, -2, mul vl]
ld1b z3.b, p0/z, [srcend, -1, mul vl]
cmp n, vlen, lsl 2
b.hi 1f
/* 2-4 vectors: the four vectors already loaded cover everything.  */
st1b z0.b, p0, [dst, 0, mul vl]
st1b z1.b, p0, [dst, 1, mul vl]
st1b z2.b, p0, [dstend, -2, mul vl]
st1b z3.b, p0, [dstend, -1, mul vl]
ret
/* 4-8 vectors: additionally load vectors 2 and 3 from the start and
   vectors -4 and -3 from the end before storing all eight.  */
1: ld1b z4.b, p0/z, [src, 2, mul vl]
ld1b z5.b, p0/z, [src, 3, mul vl]
ld1b z6.b, p0/z, [srcend, -4, mul vl]
ld1b z7.b, p0/z, [srcend, -3, mul vl]
st1b z0.b, p0, [dst, 0, mul vl]
st1b z1.b, p0, [dst, 1, mul vl]
st1b z4.b, p0, [dst, 2, mul vl]
st1b z5.b, p0, [dst, 3, mul vl]
st1b z6.b, p0, [dstend, -4, mul vl]
st1b z7.b, p0, [dstend, -3, mul vl]
st1b z2.b, p0, [dstend, -2, mul vl]
st1b z3.b, p0, [dstend, -1, mul vl]
ret
END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)
/* MEMMOVE: copy n bytes from src to dstin; the two buffers may overlap.

   In:  dstin (x0) = destination, src (x1) = source, n (x2) = byte count.
   No instruction in this routine writes dstin.
   Uses dst, tmp, vlen, vlen8, z0-z7, p0, p1 and the condition flags
   directly, plus whatever the MEMCPY code it branches to uses.

   Sizes up to two vectors are handled inline.  Larger sizes either
   branch into MEMCPY's L(copy_small) / L(copy_large), or use the
   backward loop below and then finish in MEMCPY's L(last_bytes).
   NOTE(review): ENTRY_ALIGN comes from <sysdep.h>; the second argument
   is presumably the log2 entry alignment -- confirm.  */
ENTRY_ALIGN (MEMMOVE, 4)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
/* Fast case for up to 2 vectors.  Both predicated loads complete
   before either store, so overlap between the buffers is harmless.  */
cntb vlen
cmp n, vlen, lsl 1
b.hi 1f
whilelo p0.b, xzr, n
whilelo p1.b, vlen, n
ld1b z0.b, p0/z, [src, 0, mul vl]
ld1b z1.b, p1/z, [src, 1, mul vl]
st1b z0.b, p0, [dstin, 0, mul vl]
st1b z1.b, p1, [dstin, 1, mul vl]
/* Shared return, also taken when source and destination coincide.  */
L(full_overlap):
ret
.p2align 4
/* Check for overlapping moves. Return if there is a full overlap.
   Small moves up to 8 vectors use the overlap-safe copy_small code.
   Non-overlapping or overlapping moves with dst < src use memcpy.
   Overlapping moves with dst > src use a backward copy loop.

   tmp = (dstin - src) with the top 8 bits cleared.  The comparison
   with n is unsigned, so a destination below the source yields a
   large tmp and selects the forward copy.  */
1: sub tmp, dstin, src
ands tmp, tmp, 0xffffffffffffff /* Clear special tag bits. */
b.eq L(full_overlap)
cmp n, vlen, lsl 3
b.ls L(copy_small)
cmp tmp, n
b.hs L(copy_large)
/* Backward copy.  Align to vector length: tmp = (dstin + n) & (vlen - 1),
   or vlen when that is zero, i.e. the number of bytes between the last
   vlen boundary and the end of the destination (provided vlen is a
   power of two), in the range [1, vlen].  */
add dst, dstin, n
sub tmp, vlen, 1
ands tmp, dst, tmp
csel tmp, tmp, vlen, ne
whilelo p1.b, xzr, tmp
/* Copy the last tmp bytes (offsets [n - tmp, n) of the original n)
   with a predicated vector; n now counts the bytes below them.  */
sub n, n, tmp
ld1b z1.b, p1/z, [src, n]
st1b z1.b, p1, [dstin, n]
/* src and dst now point one past the end of the remaining n bytes.  */
add src, src, n
add dst, dstin, n
/* p0 = all lanes active; L(last_bytes) relies on this as well.  */
ptrue p0.b
lsl vlen8, vlen, 3
/* n is kept biased by -vlen8 while blocks are in flight.  If at most
   8 vectors remain, skip the loop; 3: undoes the subtraction.  */
subs n, n, vlen8
b.ls 3f
/* Preload the highest block.  From here on src runs one block below
   dst.  */
sub src, src, vlen8
ld1b_unroll8
/* If at most 8 vectors remain below the preloaded block, just store
   that block at 2:.  */
subs n, n, vlen8
b.ls 2f
.p2align 4
/* 8x unrolled and software pipelined backward copy loop: each pass
   stores the block held in z0-z7 at dst and loads the next lower
   block from src.  */
1: sub src, src, vlen8
sub dst, dst, vlen8
stld1b_unroll8
subs n, n, vlen8
b.hi 1b
/* Write out the last block that was loaded.  */
2: sub dst, dst, vlen8
st1b_unroll8
/* Undo the last subtraction: n = bytes still to copy (at most vlen8).  */
3: add n, n, vlen8
/* Adjust src/dst for last 0-8 vectors: the bytes not yet copied are
   the lowest n bytes, so step src back by n and restart dst at dstin.  */
sub src, src, n
mov dst, dstin
b L(last_bytes)
END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)
# endif /* IS_IN (libc) */
#endif /* HAVE_AARCH64_SVE_ASM */