mirror of
https://sourceware.org/git/glibc.git
synced 2024-11-09 14:50:05 +00:00
AArch64: Improve generic strlen
Improve performance by handling another 16 bytes before entering the loop. Use ADDHN in the loop to avoid SHRN+FMOV when it terminates. Change final size computation to avoid increasing latency. On Neoverse V1, performance of the random strlen benchmark improves by 4.6%.

Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
This commit is contained in:
parent
d5ce0e960d
commit
3dc426b642
@@ -1,4 +1,5 @@
|
|||||||
/* Copyright (C) 2012-2024 Free Software Foundation, Inc.
|
/* Generic optimized strlen using SIMD.
|
||||||
|
Copyright (C) 2012-2024 Free Software Foundation, Inc.
|
||||||
|
|
||||||
This file is part of the GNU C Library.
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
@@ -56,38 +57,52 @@ ENTRY (STRLEN)
|
|||||||
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
||||||
fmov synd, dend
|
fmov synd, dend
|
||||||
lsr synd, synd, shift
|
lsr synd, synd, shift
|
||||||
cbz synd, L(loop)
|
cbz synd, L(next16)
|
||||||
|
|
||||||
rbit synd, synd
|
rbit synd, synd
|
||||||
clz result, synd
|
clz result, synd
|
||||||
lsr result, result, 2
|
lsr result, result, 2
|
||||||
ret
|
ret
|
||||||
|
|
||||||
.p2align 5
|
L(next16):
|
||||||
L(loop):
|
|
||||||
ldr data, [src, 16]
|
ldr data, [src, 16]
|
||||||
cmeq vhas_nul.16b, vdata.16b, 0
|
cmeq vhas_nul.16b, vdata.16b, 0
|
||||||
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
|
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
||||||
fmov synd, dend
|
|
||||||
cbnz synd, L(loop_end)
|
|
||||||
ldr data, [src, 32]!
|
|
||||||
cmeq vhas_nul.16b, vdata.16b, 0
|
|
||||||
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
|
|
||||||
fmov synd, dend
|
fmov synd, dend
|
||||||
cbz synd, L(loop)
|
cbz synd, L(loop)
|
||||||
sub src, src, 16
|
add src, src, 16
|
||||||
L(loop_end):
|
|
||||||
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
|
||||||
sub result, src, srcin
|
|
||||||
fmov synd, dend
|
|
||||||
#ifndef __AARCH64EB__
|
#ifndef __AARCH64EB__
|
||||||
rbit synd, synd
|
rbit synd, synd
|
||||||
#endif
|
#endif
|
||||||
add result, result, 16
|
sub result, src, srcin
|
||||||
clz tmp, synd
|
clz tmp, synd
|
||||||
add result, result, tmp, lsr 2
|
add result, result, tmp, lsr 2
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
.p2align 5
|
||||||
|
L(loop):
|
||||||
|
ldr data, [src, 32]!
|
||||||
|
cmeq vhas_nul.16b, vdata.16b, 0
|
||||||
|
addhn vend.8b, vhas_nul.8h, vhas_nul.8h
|
||||||
|
fmov synd, dend
|
||||||
|
cbnz synd, L(loop_end)
|
||||||
|
ldr data, [src, 16]
|
||||||
|
cmeq vhas_nul.16b, vdata.16b, 0
|
||||||
|
addhn vend.8b, vhas_nul.8h, vhas_nul.8h
|
||||||
|
fmov synd, dend
|
||||||
|
cbz synd, L(loop)
|
||||||
|
add src, src, 16
|
||||||
|
L(loop_end):
|
||||||
|
sub result, shift, src, lsl 2 /* (srcin - src) << 2. */
|
||||||
|
#ifndef __AARCH64EB__
|
||||||
|
rbit synd, synd
|
||||||
|
sub result, result, 3
|
||||||
|
#endif
|
||||||
|
clz tmp, synd
|
||||||
|
sub result, tmp, result
|
||||||
|
lsr result, result, 2
|
||||||
|
ret
|
||||||
|
|
||||||
END (STRLEN)
|
END (STRLEN)
|
||||||
weak_alias (STRLEN, strlen)
|
weak_alias (STRLEN, strlen)
|
||||||
libc_hidden_builtin_def (strlen)
|
libc_hidden_builtin_def (strlen)
|
||||||
|
Loading…
Reference in New Issue
Block a user