AArch64: Improve generic strlen

Improve performance by handling another 16 bytes before entering the loop.
Use ADDHN in the loop to avoid SHRN+FMOV when it terminates.  Change final
size computation to avoid increasing latency.  On Neoverse V1 performance
of the random strlen benchmark improves by 4.6%.

Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Author: Wilco Dijkstra
Date:   2024-08-07 14:43:47 +01:00
Parent: d5ce0e960d
Commit: 3dc426b642

@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2024 Free Software Foundation, Inc.
+/* Generic optimized strlen using SIMD.
+   Copyright (C) 2012-2024 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
@@ -56,38 +57,52 @@ ENTRY (STRLEN)
 	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
-	cbz	synd, L(loop)
+	cbz	synd, L(next16)
 	rbit	synd, synd
 	clz	result, synd
 	lsr	result, result, 2
 	ret
-	.p2align 5
-L(loop):
+L(next16):
 	ldr	data, [src, 16]
 	cmeq	vhas_nul.16b, vdata.16b, 0
-	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
-	fmov	synd, dend
-	cbnz	synd, L(loop_end)
-	ldr	data, [src, 32]!
-	cmeq	vhas_nul.16b, vdata.16b, 0
-	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	cbz	synd, L(loop)
-	sub	src, src, 16
-L(loop_end):
-	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
-	sub	result, src, srcin
-	fmov	synd, dend
+	add	src, src, 16
 #ifndef __AARCH64EB__
 	rbit	synd, synd
 #endif
-	add	result, result, 16
+	sub	result, src, srcin
 	clz	tmp, synd
 	add	result, result, tmp, lsr 2
 	ret
+
+	.p2align 5
+L(loop):
+	ldr	data, [src, 32]!
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	addhn	vend.8b, vhas_nul.8h, vhas_nul.8h
+	fmov	synd, dend
+	cbnz	synd, L(loop_end)
+	ldr	data, [src, 16]
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	addhn	vend.8b, vhas_nul.8h, vhas_nul.8h
+	fmov	synd, dend
+	cbz	synd, L(loop)
+	add	src, src, 16
+L(loop_end):
+	sub	result, shift, src, lsl 2	/* (srcin - src) << 2.  */
+#ifndef __AARCH64EB__
+	rbit	synd, synd
+	sub	result, result, 3
+#endif
+	clz	tmp, synd
+	sub	result, tmp, result
+	lsr	result, result, 2
+	ret
 END (STRLEN)
 weak_alias (STRLEN, strlen)
 libc_hidden_builtin_def (strlen)