mirror of
https://sourceware.org/git/glibc.git
synced 2024-12-24 03:31:07 +00:00
aarch64: Optimized memset for Kunpeng processor.
Due to a branch-prediction issue on the Kunpeng processor, we found that memset_generic performs poorly when setting medium-sized regions. We therefore restructured the logic and unrolled the loop in set_long four times to solve the problem; even sets smaller than 1K benefit. Another change is that DC ZVA does not seem to be effective when setting zero, so we dropped it and use set_long to set zero instead. Fewer branches and predictions also give a slight improvement in the zero case. Checked on aarch64-linux-gnu. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
This commit is contained in:
parent
c2150769d0
commit
525de033a9
@ -1,7 +1,7 @@
|
||||
ifeq ($(subdir),string)
|
||||
sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
|
||||
memcpy_falkor memmove_falkor \
|
||||
memset_generic memset_falkor memset_emag \
|
||||
memset_generic memset_falkor memset_emag memset_kunpeng \
|
||||
memchr_generic memchr_nosimd \
|
||||
strlen_generic strlen_asimd
|
||||
endif
|
||||
|
@ -53,6 +53,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
can do a comparative analysis with __memset_generic. */
|
||||
IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
|
||||
IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_emag)
|
||||
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
|
||||
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
|
||||
IFUNC_IMPL (i, name, memchr,
|
||||
IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_nosimd)
|
||||
|
@ -30,10 +30,13 @@ extern __typeof (__redirect_memset) __libc_memset;
|
||||
|
||||
extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
|
||||
extern __typeof (__redirect_memset) __memset_emag attribute_hidden;
|
||||
extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
|
||||
extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
|
||||
|
||||
libc_ifunc (__libc_memset,
|
||||
((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
|
||||
IS_KUNPENG (midr)
|
||||
?__memset_kunpeng
|
||||
: ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
|
||||
? __memset_falkor
|
||||
: (IS_EMAG (midr) && zva_size == 64
|
||||
? __memset_emag
|
||||
|
113
sysdeps/aarch64/multiarch/memset_kunpeng.S
Normal file
113
sysdeps/aarch64/multiarch/memset_kunpeng.S
Normal file
@ -0,0 +1,113 @@
|
||||
/* Optimized memset for Huawei Kunpeng processor.
|
||||
Copyright (C) 2012-2019 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library. If not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <sysdep.h>
|
||||
#include <sysdeps/aarch64/memset-reg.h>
|
||||
|
||||
#if IS_IN (libc)
|
||||
# define MEMSET __memset_kunpeng
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, unaligned accesses
|
||||
*
|
||||
*/
|
||||
|
||||
/* Entry point: fill count bytes starting at dstin with the fill byte
   taken from valw.  The names dstin, valw, count, dstend and dst are
   not defined in this file (presumably register aliases from
   memset-reg.h -- confirm there).  No instruction in this routine
   names dstin as a destination, so it is unchanged at every ret.  */
ENTRY_ALIGN (MEMSET, 6)
|
||||
|
||||
/* NOTE(review): DELOUSE is defined outside this file; presumably it
   normalises argument registers 0 and 2 for ILP32 -- confirm in
   sysdep.h.  */
DELOUSE (0)
|
||||
DELOUSE (2)
|
||||
|
||||
/* Replicate the low byte of valw into all 16 byte lanes of v0.  Every
   store below writes from this register through its q0/d0/s0/h0/b0
   views (16/8/4/2/1 bytes).  */
dup v0.16B, valw
|
||||
/* dstend = dstin + count, i.e. one past the last byte to set.  */
add dstend, dstin, count
|
||||
|
||||
/* count >= 128 (unsigned compare) goes to the unrolled loop.  */
cmp count, 128
|
||||
b.hs L(set_long)
|
||||
|
||||
/* count < 16 is handled by testing individual size bits.  */
cmp count, 16
|
||||
b.lo L(less16)
|
||||
|
||||
/* Set 16..127 bytes. */
|
||||
/* The first 16 bytes are stored unconditionally.  If bit 6 of count is
   set the size is 64..127 and set127 finishes the job.  Otherwise the
   size is 16..63: store the last 16 bytes, and when bit 5 is set
   (32..63) also the second 16 bytes from the start and from the end.
   Stores from the two ends may overlap.  */
str q0, [dstin]
|
||||
tbnz count, 6, L(set127)
|
||||
str q0, [dstend, -16]
|
||||
tbz count, 5, 1f
|
||||
str q0, [dstin, 16]
|
||||
str q0, [dstend, -32]
|
||||
1: ret
|
||||
|
||||
.p2align 4
|
||||
/* Set 64..127 bytes. Write 64 bytes from the start and
|
||||
64 bytes from the end. */
|
||||
L(set127):
|
||||
/* Bytes dstin .. dstin + 15 were already stored before the branch to
   this label, so the head stores start at offset 16 and run up to
   offset 63.  The tail stores cover dstend - 64 .. dstend - 1; head
   and tail overlap whenever count is below 128.  */
stp q0, q0, [dstin, 16]
|
||||
str q0, [dstin, 48]
|
||||
stp q0, q0, [dstend, -64]
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Set 0..15 bytes. */
|
||||
L(less16):
|
||||
/* Bit 3 of count set: size is 8..15.  One 8-byte store from the start
   and one ending at dstend cover the whole range (they may overlap).  */
tbz count, 3, L(less8)
|
||||
str d0, [dstin]
|
||||
str d0, [dstend, -8]
|
||||
ret
|
||||
L(less8):
|
||||
/* Bit 2 of count set: size is 4..7.  Same scheme with 4-byte stores.  */
tbz count, 2, 2f
|
||||
str s0, [dstin]
|
||||
str s0, [dstend, -4]
|
||||
ret
|
||||
/* Size is 0..3.  Nothing is written for 0.  Otherwise store the first
   byte; when bit 1 is set (size 2 or 3) a 2-byte store ending at
   dstend covers the remaining bytes.  */
2: cbz count, 3f
|
||||
str b0, [dstin]
|
||||
tbz count, 1, 3f
|
||||
str h0, [dstend, -2]
|
||||
3: ret
|
||||
|
||||
.p2align 4
|
||||
/* Set 128 bytes or more (the entry code branches here when
   count >= 128).  */
L(set_long):
|
||||
/* dst = dstin rounded down to a 16-byte boundary.  The 16-byte store at
   dstin covers everything below dst + 16, so the loop starts writing
   at dst + 16 using 16-byte-aligned addresses.  */
bic dst, dstin, 15
|
||||
str q0, [dstin]
|
||||
sub count, dstend, dst /* Count is 16 too large. */
|
||||
sub dst, dst, 16 /* Dst is biased by -32. */
|
||||
/* After this adjustment count equals (bytes still to write from the
   next store address) - 65, so the unsigned borrow from each
   "subs count, count, 64" below signals that at most 64 bytes remain
   once the preceding 64-byte step has been stored.  */
sub count, count, 64 + 16 + 1 /* Adjust count and bias for loop. */
|
||||
/* Loop body: the same 64-byte step repeated four times.  Each step
   stores 32 bytes at dst + 32 and 32 bytes at dst + 64, the second stp
   writing dst + 64 back into dst.  Any of the first three steps leaves
   the loop early (b.lo to the tail) when at most 64 bytes remain; the
   fourth loops back (b.hs) otherwise.  */
1: stp q0, q0, [dst, 32]
|
||||
stp q0, q0, [dst, 64]!
|
||||
subs count, count, 64
|
||||
b.lo 1f
|
||||
stp q0, q0, [dst, 32]
|
||||
stp q0, q0, [dst, 64]!
|
||||
subs count, count, 64
|
||||
b.lo 1f
|
||||
stp q0, q0, [dst, 32]
|
||||
stp q0, q0, [dst, 64]!
|
||||
subs count, count, 64
|
||||
b.lo 1f
|
||||
stp q0, q0, [dst, 32]
|
||||
stp q0, q0, [dst, 64]!
|
||||
subs count, count, 64
|
||||
b.hs 1b
|
||||
|
||||
/* Tail: store the final 64 bytes addressed back from dstend.  This may
   overlap bytes the loop has already written.  */
1: stp q0, q0, [dstend, -64]
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
|
||||
END (MEMSET)
|
||||
libc_hidden_builtin_def (MEMSET)
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user