AArch64: Add memset_zva64

Add a specialized memset for the common ZVA size of 64 to avoid the
overhead of reading the ZVA size.  Since the code is identical to
__memset_falkor, remove the latter.

Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
(cherry picked from commit 3d7090f14b)
This commit is contained in:
Wilco Dijkstra 2023-10-26 17:07:21 +01:00
parent 7e999181c2
commit ff17116c1e
6 changed files with 38 additions and 68 deletions

View File

@ -101,19 +101,19 @@ L(tail64):
ret
L(try_zva):
#ifdef ZVA_MACRO
zva_macro
#else
#ifndef ZVA64_ONLY
.p2align 3
mrs tmp1, dczid_el0
tbnz tmp1w, 4, L(no_zva)
and tmp1w, tmp1w, 15
cmp tmp1w, 4 /* ZVA size is 64 bytes. */
b.ne L(zva_128)
nop
#endif
/* Write the first and last 64 byte aligned block using stp rather
than using DC ZVA. This is faster on some cores.
*/
.p2align 4
L(zva_64):
str q0, [dst, 16]
stp q0, q0, [dst, 32]
@ -123,7 +123,6 @@ L(zva_64):
sub count, dstend, dst /* Count is now 128 too large. */
sub count, count, 128+64+64 /* Adjust count and bias for loop. */
add dst, dst, 128
nop
1: dc zva, dst
add dst, dst, 64
subs count, count, 64
@ -134,6 +133,7 @@ L(zva_64):
stp q0, q0, [dstend, -32]
ret
#ifndef ZVA64_ONLY
.p2align 3
L(zva_128):
cmp tmp1w, 5 /* ZVA size is 128 bytes. */

View File

@ -12,10 +12,10 @@ sysdep_routines += \
memmove_mops \
memset_a64fx \
memset_emag \
memset_falkor \
memset_generic \
memset_kunpeng \
memset_mops \
memset_zva64 \
strlen_asimd \
strlen_generic \
# sysdep_routines

View File

@ -59,9 +59,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove, mops, __memmove_mops)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
IFUNC_IMPL (i, name, memset,
/* Enable this on non-falkor processors too so that other cores
can do a comparative analysis with __memset_generic. */
IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva64)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_emag)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
#if HAVE_AARCH64_SVE_ASM

View File

@ -28,7 +28,7 @@
extern __typeof (__redirect_memset) __libc_memset;
extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
extern __typeof (__redirect_memset) __memset_zva64 attribute_hidden;
extern __typeof (__redirect_memset) __memset_emag attribute_hidden;
extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden;
@ -47,18 +47,17 @@ select_memset_ifunc (void)
{
if (IS_A64FX (midr) && zva_size == 256)
return __memset_a64fx;
return __memset_generic;
}
if (IS_KUNPENG920 (midr))
return __memset_kunpeng;
if ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64)
return __memset_falkor;
if (IS_EMAG (midr))
return __memset_emag;
if (zva_size == 64)
return __memset_zva64;
return __memset_generic;
}

View File

@ -1,54 +0,0 @@
/* Memset for falkor.
Copyright (C) 2017-2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include <memset-reg.h>
/* Reading dczid_el0 is expensive on falkor so move it into the ifunc
resolver and assume ZVA size of 64 bytes. The IFUNC resolver takes care to
use this function only when ZVA is enabled. */
/* Build this variant only when compiling for libc itself.  */
#if IS_IN (libc)
/* zva_macro: the zeroing path for a ZVA block size of 64 bytes, with the
   block size assumed rather than read from dczid_el0 at run time (see the
   file header comment).  It uses the dst, dstend and count register
   aliases from memset-reg.h and ends with its own ret, so control does not
   fall through to whatever follows the expansion site.
   NOTE(review): q0 presumably holds the replicated fill value and bytes
   0..15 at dst are presumably already stored by the code in memset.S that
   runs before this macro -- neither is visible here, confirm there.  */
.macro zva_macro
.p2align 4
/* Write the first and last 64 byte aligned block using stp rather
than using DC ZVA. This is faster on some cores. */
/* Fill bytes 16..63 relative to the original (possibly unaligned) dst.  */
str q0, [dst, 16]
stp q0, q0, [dst, 32]
/* Clear the low 6 bits: dst is rounded down to a 64-byte boundary.  */
bic dst, dst, 63
/* Fill the 64-byte block starting at aligned dst + 64.  */
stp q0, q0, [dst, 64]
stp q0, q0, [dst, 96]
sub count, dstend, dst /* Count is now 128 too large. */
sub count, count, 128+64+64 /* Adjust count and bias for loop. */
/* Step past the 128 bytes covered by the stores above; dst now addresses
   the first block handed to DC ZVA.  */
add dst, dst, 128
/* One DC ZVA per iteration, advancing dst by 64 each time.  The b.hi
   repeats while count was greater than 64 before the subtraction.  */
1: dc zva, dst
add dst, dst, 64
subs count, count, 64
b.hi 1b
/* Finish with plain stores: 64 bytes starting at dst, then the last
   64 bytes ending at dstend.  */
stp q0, q0, [dst, 0]
stp q0, q0, [dst, 32]
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
.endm
/* ZVA_MACRO makes the included memset.S expand zva_macro in place of its
   generic ZVA handling; MEMSET supplies the symbol name this variant is
   referenced by in the ifunc code (__memset_falkor).  */
# define ZVA_MACRO zva_macro
# define MEMSET __memset_falkor
# include <sysdeps/aarch64/memset.S>
#endif

View File

@ -0,0 +1,27 @@
/* Optimized memset for zva size = 64.
Copyright (C) 2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
/* Configure the shared memset.S as a variant that assumes a ZVA block size
   of 64 bytes.  memset.S wraps its dczid_el0 read and its handling of
   other ZVA sizes in #ifndef ZVA64_ONLY, so defining it leaves only the
   64-byte ZVA path.
   NOTE(review): this relies on the caller (the ifunc selector) choosing
   this variant only when the ZVA size really is 64 -- confirm there.  */
#define ZVA64_ONLY 1
/* Symbol name under which this variant is referenced by the ifunc code.  */
#define MEMSET __memset_zva64
/* Redefine libc_hidden_builtin_def to expand to nothing, so any use of it
   in the included file produces no output for this variant.  */
#undef libc_hidden_builtin_def
#define libc_hidden_builtin_def(X)
/* Pull in the generic implementation with the settings above applied.  */
#include "../memset.S"