mirror of
https://sourceware.org/git/glibc.git
synced 2025-01-06 01:21:08 +00:00
381b29616a
This patch unconditionally disables BTI instruction insertion in the A64FX
memcpy/memmove, as was done in the A64FX memset patch [1], for performance.
[1] commit 07b427296b
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
410 lines
11 KiB
ArmAsm
410 lines
11 KiB
ArmAsm
/* Optimized memcpy for Fujitsu A64FX processor.
|
|
Copyright (C) 2021 Free Software Foundation, Inc.
|
|
|
|
This file is part of the GNU C Library.
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with the GNU C Library. If not, see
|
|
<https://www.gnu.org/licenses/>. */
|
|
|
|
#include <sysdep.h>
|
|
|
|
/* Redefine BTI_C as empty so that no BTI instruction is inserted into
   the routines in this file.  This is done unconditionally, for
   performance.
   NOTE(review): BTI_C is presumably consumed by ENTRY from <sysdep.h>;
   confirm there.  */
#undef BTI_C
|
|
#define BTI_C
|
|
|
|
/* Assumptions:
|
|
*
|
|
* ARMv8.2-a, AArch64, unaligned accesses, sve
|
|
*
|
|
*/
|
|
|
|
/* Tuning constants.  L2_SIZE is fully parenthesised so that it expands
   safely inside any larger expression.  */
#define L2_SIZE		((8*1024*1024)/2)	// L2 8MB/2
#define CACHE_LINE_SIZE	256
#define ZF_DIST		(CACHE_LINE_SIZE * 21)	// Zerofill distance

/* Incoming arguments.  dest doubles as the return value and src is
   re-read by the tail code, so neither register is ever written.  */
#define dest		x0
#define src		x1
#define n		x2	// size

/* Scratch registers.  */
#define tmp1		x3
#define tmp2		x4
#define tmp3		x5

/* Running state of the copy loops.  */
#define rest		x6	// bytes not yet loaded
#define dest_ptr	x7
#define src_ptr		x8
#define vector_length	x9	// SVE vector length in bytes (cntb)
#define cl_remainder	x10	// CACHE_LINE_SIZE remainder
|
|
|
|
#if HAVE_AARCH64_SVE_ASM
|
|
# if IS_IN (libc)
|
|
/* Symbol names under which the two entry points below are emitted.  */
# define MEMCPY __memcpy_a64fx
|
|
# define MEMMOVE __memmove_a64fx
|
|
|
|
/* Enable SVE so that the assembler accepts the ld1b, st1b, whilelo,
   cntb and incb instructions used below.  */
.arch armv8.2-a+sve
|
|
|
|
/* dc_zva times
   Emit TIMES back-to-back "dc zva" operations, one per cache line.
   In/out: tmp1 = address of the first line; it is advanced by
	   CACHE_LINE_SIZE after every dc zva.
   TIMES must be an absolute assemble-time expression.  The sequence is
   generated with .rept instead of macro self-recursion, so the count is
   not bounded by the assembler macro nesting limit and a count of zero
   simply emits nothing instead of recursing without end.  */
	.macro	dc_zva times
	.rept	\times
	dc	zva, tmp1
	add	tmp1, tmp1, CACHE_LINE_SIZE
	.endr
	.endm
|
|
|
|
/* ld1b_unroll8
   Load eight consecutive vectors from src_ptr into z0-z7.
   In:  src_ptr = base address, p0 = governing predicate.
   Out: z0-z7.  src_ptr is not modified.  */
	.macro	ld1b_unroll8
	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
	ld1b	z1.b, p0/z, [src_ptr, #1, mul vl]
	ld1b	z2.b, p0/z, [src_ptr, #2, mul vl]
	ld1b	z3.b, p0/z, [src_ptr, #3, mul vl]
	ld1b	z4.b, p0/z, [src_ptr, #4, mul vl]
	ld1b	z5.b, p0/z, [src_ptr, #5, mul vl]
	ld1b	z6.b, p0/z, [src_ptr, #6, mul vl]
	ld1b	z7.b, p0/z, [src_ptr, #7, mul vl]
	.endm
|
|
|
|
/* stld1b_unroll4a
   First half of one software-pipelined step: store z0-z3 to vector
   slots 0-3 at dest_ptr and refill the same registers from slots 0-3
   at src_ptr.  Stores and loads are interleaved in pairs.
   In:  dest_ptr, src_ptr, p0 = governing predicate, z0-z3 = data.
   Out: z0-z3 = newly loaded data.  Pointers are not modified.  */
	.macro	stld1b_unroll4a
	// slots 0-1
	st1b	z0.b, p0,   [dest_ptr, #0, mul vl]
	st1b	z1.b, p0,   [dest_ptr, #1, mul vl]
	ld1b	z0.b, p0/z, [src_ptr,  #0, mul vl]
	ld1b	z1.b, p0/z, [src_ptr,  #1, mul vl]
	// slots 2-3
	st1b	z2.b, p0,   [dest_ptr, #2, mul vl]
	st1b	z3.b, p0,   [dest_ptr, #3, mul vl]
	ld1b	z2.b, p0/z, [src_ptr,  #2, mul vl]
	ld1b	z3.b, p0/z, [src_ptr,  #3, mul vl]
	.endm
|
|
|
|
/* stld1b_unroll4b
   Second half of one software-pipelined step: store z4-z7 to vector
   slots 4-7 at dest_ptr and refill the same registers from slots 4-7
   at src_ptr.  Stores and loads are interleaved in pairs.
   In:  dest_ptr, src_ptr, p0 = governing predicate, z4-z7 = data.
   Out: z4-z7 = newly loaded data.  Pointers are not modified.  */
	.macro	stld1b_unroll4b
	// slots 4-5
	st1b	z4.b, p0,   [dest_ptr, #4, mul vl]
	st1b	z5.b, p0,   [dest_ptr, #5, mul vl]
	ld1b	z4.b, p0/z, [src_ptr,  #4, mul vl]
	ld1b	z5.b, p0/z, [src_ptr,  #5, mul vl]
	// slots 6-7
	st1b	z6.b, p0,   [dest_ptr, #6, mul vl]
	st1b	z7.b, p0,   [dest_ptr, #7, mul vl]
	ld1b	z6.b, p0/z, [src_ptr,  #6, mul vl]
	ld1b	z7.b, p0/z, [src_ptr,  #7, mul vl]
	.endm
|
|
|
|
/* stld1b_unroll8
   One full software-pipelined step over eight vectors: the 4a half
   (z0-z3) followed by the 4b half (z4-z7).  */
	.macro	stld1b_unroll8
	stld1b_unroll4a
	stld1b_unroll4b
	.endm
|
|
|
|
/* st1b_unroll8
   Store z0-z7 to eight consecutive vector slots starting at dest_ptr.
   In:  dest_ptr = base address, p0 = governing predicate, z0-z7 = data.
   dest_ptr is not modified.  */
	.macro	st1b_unroll8
	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
	st1b	z1.b, p0, [dest_ptr, #1, mul vl]
	st1b	z2.b, p0, [dest_ptr, #2, mul vl]
	st1b	z3.b, p0, [dest_ptr, #3, mul vl]
	st1b	z4.b, p0, [dest_ptr, #4, mul vl]
	st1b	z5.b, p0, [dest_ptr, #5, mul vl]
	st1b	z6.b, p0, [dest_ptr, #6, mul vl]
	st1b	z7.b, p0, [dest_ptr, #7, mul vl]
	.endm
|
|
|
|
/* shortcut_for_small_size exit
   Fast path for copies of at most 8 * vector_length bytes.  The copy is
   done with 2, 4, 6 or 8 predicated vectors; every load is issued
   before the first store, and the macro then returns to the caller of
   the enclosing function.  Larger sizes branch to EXIT instead.
   In:   dest, src, n, vector_length (already set by cntb).
   Uses: tmp1, z0-z7, p0-p7 and the flags.  tmp2 and tmp3 are left
	 untouched.
   When control reaches EXIT, p0.b holds whilelo (0, n).  */
	.macro	shortcut_for_small_size exit
	// n < vector_length * 2: two predicated vectors are enough
	whilelo	p0.b, xzr, n
	whilelo	p1.b, vector_length, n
	b.last	1f			// last lane of p1 active: need more
	ld1b	z0.b, p0/z, [src, #0, mul vl]
	ld1b	z1.b, p1/z, [src, #1, mul vl]
	st1b	z0.b, p0, [dest, #0, mul vl]
	st1b	z1.b, p1, [dest, #1, mul vl]
	ret

1:	// n > vector_length * 8: too large for the shortcut
	cmp	n, vector_length, lsl 3	// vector_length * 8
	b.hi	\exit
	// n < vector_length * 4: four predicated vectors
	lsl	tmp1, vector_length, 1	// vector_length * 2
	whilelo	p2.b, tmp1, n
	incb	tmp1			// vector_length * 3
	whilelo	p3.b, tmp1, n
	b.last	2f
	ld1b	z0.b, p0/z, [src, #0, mul vl]
	ld1b	z1.b, p1/z, [src, #1, mul vl]
	ld1b	z2.b, p2/z, [src, #2, mul vl]
	ld1b	z3.b, p3/z, [src, #3, mul vl]
	st1b	z0.b, p0, [dest, #0, mul vl]
	st1b	z1.b, p1, [dest, #1, mul vl]
	st1b	z2.b, p2, [dest, #2, mul vl]
	st1b	z3.b, p3, [dest, #3, mul vl]
	ret

2:	// n < vector_length * 6: six predicated vectors
	lsl	tmp1, vector_length, 2	// vector_length * 4
	whilelo	p4.b, tmp1, n
	incb	tmp1			// vector_length * 5
	whilelo	p5.b, tmp1, n
	b.last	3f
	ld1b	z0.b, p0/z, [src, #0, mul vl]
	ld1b	z1.b, p1/z, [src, #1, mul vl]
	ld1b	z2.b, p2/z, [src, #2, mul vl]
	ld1b	z3.b, p3/z, [src, #3, mul vl]
	ld1b	z4.b, p4/z, [src, #4, mul vl]
	ld1b	z5.b, p5/z, [src, #5, mul vl]
	st1b	z0.b, p0, [dest, #0, mul vl]
	st1b	z1.b, p1, [dest, #1, mul vl]
	st1b	z2.b, p2, [dest, #2, mul vl]
	st1b	z3.b, p3, [dest, #3, mul vl]
	st1b	z4.b, p4, [dest, #4, mul vl]
	st1b	z5.b, p5, [dest, #5, mul vl]
	ret

3:	// n <= vector_length * 8: all eight predicated vectors
	lsl	tmp1, vector_length, 2	// vector_length * 4
	incb	tmp1			// vector_length * 5
	incb	tmp1			// vector_length * 6
	whilelo	p6.b, tmp1, n
	incb	tmp1			// vector_length * 7
	whilelo	p7.b, tmp1, n
	ld1b	z0.b, p0/z, [src, #0, mul vl]
	ld1b	z1.b, p1/z, [src, #1, mul vl]
	ld1b	z2.b, p2/z, [src, #2, mul vl]
	ld1b	z3.b, p3/z, [src, #3, mul vl]
	ld1b	z4.b, p4/z, [src, #4, mul vl]
	ld1b	z5.b, p5/z, [src, #5, mul vl]
	ld1b	z6.b, p6/z, [src, #6, mul vl]
	ld1b	z7.b, p7/z, [src, #7, mul vl]
	st1b	z0.b, p0, [dest, #0, mul vl]
	st1b	z1.b, p1, [dest, #1, mul vl]
	st1b	z2.b, p2, [dest, #2, mul vl]
	st1b	z3.b, p3, [dest, #3, mul vl]
	st1b	z4.b, p4, [dest, #4, mul vl]
	st1b	z5.b, p5, [dest, #5, mul vl]
	st1b	z6.b, p6, [dest, #6, mul vl]
	st1b	z7.b, p7, [dest, #7, mul vl]
	ret
	.endm
|
|
|
|
/* MEMCPY (dest, src, n)
   In:   x0 = dest, x1 = src, x2 = n (byte count).
   Out:  x0 = dest; x0 is never written by this routine.
   Uses: x3-x10, z0-z7, p0-p7 and the flags.

   Strategy:
     n <= 8 * VL		straight-line predicated copy (shortcut macro)
     otherwise			8-vector unrolled, software-pipelined loop,
				followed by a predicated tail at L(last)
     n >= L2_SIZE, VL == 64	align dest to CACHE_LINE_SIZE, then run the
				same pipeline while issuing dc zva ZF_DIST
				bytes ahead of the stores

   L(vl_agnostic) and L(last) are also branch targets of MEMMOVE.
   All byte-count comparisons use unsigned conditions.  */
ENTRY (MEMCPY)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

L(memcpy):
	cntb	vector_length
	// shortcut for less than vector_length * 8
	// gives a free ptrue to p0.b for n >= vector_length
	shortcut_for_small_size L(vl_agnostic)
	// end of shortcut

L(vl_agnostic): // VL Agnostic
	mov	rest, n
	mov	dest_ptr, dest
	mov	src_ptr, src
	// if rest >= L2_SIZE && vector_length == 64 then L(L2)
	mov	tmp1, 64
	cmp	rest, L2_SIZE
	// rest < L2_SIZE forces the flags to 0, so b.eq is not taken
	ccmp	vector_length, tmp1, 0, cs
	b.eq	L(L2)

L(unroll8): // unrolling and software pipeline
	lsl	tmp1, vector_length, 3	// vector_length * 8
	.p2align 3
	cmp	rest, tmp1
	b.cc	L(last)			// less than one full block left
	// prologue: load the first block
	ld1b_unroll8
	add	src_ptr, src_ptr, tmp1
	sub	rest, rest, tmp1
	cmp	rest, tmp1
	b.cc	2f			// no second full block to load
	.p2align 3
	// steady state: store block k while loading block k + 1
1:	stld1b_unroll8
	add	dest_ptr, dest_ptr, tmp1
	add	src_ptr, src_ptr, tmp1
	sub	rest, rest, tmp1
	cmp	rest, tmp1
	b.cs	1b			// unsigned: rest is a byte count
	// epilogue: store the block that is still held in z0-z7
2:	st1b_unroll8
	add	dest_ptr, dest_ptr, tmp1

	.p2align 3
L(last):
	// Tail: rest < vector_length * 8.  Copy it with 2, 4 or 8
	// predicated vectors; all loads are issued before any store.
	whilelo	p0.b, xzr, rest
	whilelo	p1.b, vector_length, rest
	b.last	1f			// rest >= vector_length * 2
	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
	ret
1:	lsl	tmp1, vector_length, 1	// vector_length * 2
	whilelo	p2.b, tmp1, rest
	incb	tmp1
	whilelo	p3.b, tmp1, rest
	b.last	1f			// rest >= vector_length * 4
	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
	ld1b	z2.b, p2/z, [src_ptr, #2, mul vl]
	ld1b	z3.b, p3/z, [src_ptr, #3, mul vl]
	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
	st1b	z2.b, p2, [dest_ptr, #2, mul vl]
	st1b	z3.b, p3, [dest_ptr, #3, mul vl]
	ret
1:	lsl	tmp1, vector_length, 2	// vector_length * 4
	whilelo	p4.b, tmp1, rest
	incb	tmp1
	whilelo	p5.b, tmp1, rest
	incb	tmp1
	whilelo	p6.b, tmp1, rest
	incb	tmp1
	whilelo	p7.b, tmp1, rest
	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
	ld1b	z2.b, p2/z, [src_ptr, #2, mul vl]
	ld1b	z3.b, p3/z, [src_ptr, #3, mul vl]
	ld1b	z4.b, p4/z, [src_ptr, #4, mul vl]
	ld1b	z5.b, p5/z, [src_ptr, #5, mul vl]
	ld1b	z6.b, p6/z, [src_ptr, #6, mul vl]
	ld1b	z7.b, p7/z, [src_ptr, #7, mul vl]
	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
	st1b	z2.b, p2, [dest_ptr, #2, mul vl]
	st1b	z3.b, p3, [dest_ptr, #3, mul vl]
	st1b	z4.b, p4, [dest_ptr, #4, mul vl]
	st1b	z5.b, p5, [dest_ptr, #5, mul vl]
	st1b	z6.b, p6, [dest_ptr, #6, mul vl]
	st1b	z7.b, p7, [dest_ptr, #7, mul vl]
	ret

L(L2):
	// align dest address at CACHE_LINE_SIZE byte boundary
	mov	tmp1, CACHE_LINE_SIZE
	ands	tmp2, dest_ptr, CACHE_LINE_SIZE - 1
	// if cl_remainder == 0
	b.eq	L(L2_dc_zva)
	sub	cl_remainder, tmp1, tmp2
	// process remainder until the first CACHE_LINE_SIZE boundary
	whilelo	p1.b, xzr, cl_remainder	// keep p0.b all true
	whilelo	p2.b, vector_length, cl_remainder
	b.last	1f			// needs more than two vectors
	ld1b	z1.b, p1/z, [src_ptr, #0, mul vl]
	ld1b	z2.b, p2/z, [src_ptr, #1, mul vl]
	st1b	z1.b, p1, [dest_ptr, #0, mul vl]
	st1b	z2.b, p2, [dest_ptr, #1, mul vl]
	b	2f
1:	lsl	tmp1, vector_length, 1	// vector_length * 2
	whilelo	p3.b, tmp1, cl_remainder
	incb	tmp1
	whilelo	p4.b, tmp1, cl_remainder
	ld1b	z1.b, p1/z, [src_ptr, #0, mul vl]
	ld1b	z2.b, p2/z, [src_ptr, #1, mul vl]
	ld1b	z3.b, p3/z, [src_ptr, #2, mul vl]
	ld1b	z4.b, p4/z, [src_ptr, #3, mul vl]
	st1b	z1.b, p1, [dest_ptr, #0, mul vl]
	st1b	z2.b, p2, [dest_ptr, #1, mul vl]
	st1b	z3.b, p3, [dest_ptr, #2, mul vl]
	st1b	z4.b, p4, [dest_ptr, #3, mul vl]
2:	add	dest_ptr, dest_ptr, cl_remainder
	add	src_ptr, src_ptr, cl_remainder
	sub	rest, rest, cl_remainder

L(L2_dc_zva):
	// zero fill
	// tmp1 = distance between the untagged dest and src addresses.
	// When the buffers are closer than the zero-fill window, dc zva
	// could wipe source bytes that have not been read yet, so fall
	// back to the plain loop.
	and	tmp1, dest, 0xffffffffffffff
	and	tmp2, src, 0xffffffffffffff
	subs	tmp1, tmp1, tmp2	// diff
	b.ge	1f			// signed on purpose: address delta
	neg	tmp1, tmp1
1:	mov	tmp3, ZF_DIST + CACHE_LINE_SIZE * 2
	cmp	tmp1, tmp3
	b.lo	L(unroll8)
	mov	tmp1, dest_ptr
	// NOTE(review): this pre-zeroes lines 0..19 from dest_ptr while the
	// loop below starts zeroing at line 21, so line 20 is never zeroed
	// ahead of its store.  Harmless for correctness; confirm whether it
	// is intended.
	dc_zva	(ZF_DIST / CACHE_LINE_SIZE) - 1
	// unroll
	ld1b_unroll8	// this line has to be after "b.lo L(unroll8)"
	add	src_ptr, src_ptr, CACHE_LINE_SIZE * 2
	sub	rest, rest, CACHE_LINE_SIZE * 2
	mov	tmp1, ZF_DIST
	.p2align 3
	// Each pass stores two cache lines, loads the next two and
	// zero-fills two lines ZF_DIST ahead of the store pointer.
1:	stld1b_unroll4a
	add	tmp2, dest_ptr, tmp1	// dest_ptr + ZF_DIST
	dc	zva, tmp2
	stld1b_unroll4b
	add	tmp2, tmp2, CACHE_LINE_SIZE
	dc	zva, tmp2
	add	dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
	add	src_ptr, src_ptr, CACHE_LINE_SIZE * 2
	sub	rest, rest, CACHE_LINE_SIZE * 2
	cmp	rest, tmp3	// ZF_DIST + CACHE_LINE_SIZE * 2
	b.cs	1b			// unsigned: rest is a byte count
	// drain the pipeline, then finish with the plain loop
	st1b_unroll8
	add	dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
	b	L(unroll8)

END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)
|
|
|
|
|
|
/* MEMMOVE (dest, src, n)
   In:   x0 = dest, x1 = src, x2 = n (byte count).
   Out:  x0 = dest; x0 is never written by this routine.
   Uses: x3-x9, z0-z7, p0-p7 and the flags, plus whatever
	 L(vl_agnostic) and L(last) in MEMCPY use.

   Strategy:
     n == 0 or dest == src	return at once
     n <= 8 * VL		straight-line predicated copy (shortcut macro)
     diff <= 0 or diff >= n	forward copy via L(vl_agnostic) in MEMCPY
     otherwise			backward 8-vector pipelined loop; the
				remaining head is finished through L(last)
   where diff = untagged dest - untagged src.
   All byte-count comparisons use unsigned conditions.  */
ENTRY (MEMMOVE)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	// remove tag address
	// dest has to be immutable because it is the return value
	// src has to be immutable because it is used in L(bwd_last)
	and	tmp2, dest, 0xffffffffffffff	// save dest_notag into tmp2
	and	tmp3, src, 0xffffffffffffff	// save src_notag into tmp3
	cmp	n, 0
	// n == 0 forces Z = 1, so the b.ne below falls through to ret
	ccmp	tmp2, tmp3, 4, ne
	b.ne	1f
	ret
1:	cntb	vector_length
	// shortcut for less than vector_length * 8
	// gives a free ptrue to p0.b for n >= vector_length
	// tmp2 and tmp3 should not be used in this macro to keep
	// notag addresses
	shortcut_for_small_size L(dispatch)
	// end of shortcut

L(dispatch):
	// tmp2 = dest_notag, tmp3 = src_notag
	// diff = dest_notag - src_notag
	sub	tmp1, tmp2, tmp3
	// if diff <= 0 || diff >= n then memcpy
	cmp	tmp1, 0
	// diff <= 0 forces C = 1; otherwise C = (diff >= n), unsigned
	ccmp	tmp1, n, 2, gt
	b.cs	L(vl_agnostic)

L(bwd_start):
	mov	rest, n
	add	dest_ptr, dest, n	// dest_end
	add	src_ptr, src, n		// src_end

L(bwd_unroll8): // unrolling and software pipeline
	lsl	tmp1, vector_length, 3	// vector_length * 8
	.p2align 3
	cmp	rest, tmp1
	b.cc	L(bwd_last)		// less than one full block
	// prologue: load the last block
	sub	src_ptr, src_ptr, tmp1
	ld1b_unroll8
	sub	rest, rest, tmp1
	cmp	rest, tmp1
	b.cc	2f			// no further full block to load
	.p2align 3
	// steady state: store block k while loading the block below it
1:	sub	src_ptr, src_ptr, tmp1
	sub	dest_ptr, dest_ptr, tmp1
	stld1b_unroll8
	sub	rest, rest, tmp1
	cmp	rest, tmp1
	b.cs	1b			// unsigned: rest is a byte count
	// epilogue: store the block that is still held in z0-z7
2:	sub	dest_ptr, dest_ptr, tmp1
	st1b_unroll8

L(bwd_last):
	// The remaining rest bytes sit at the very start of both buffers.
	// L(last) loads all of them before it stores any.
	mov	dest_ptr, dest
	mov	src_ptr, src
	b	L(last)

END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)
|
|
# endif /* IS_IN (libc) */
|
|
#endif /* HAVE_AARCH64_SVE_ASM */
|