AArch64: Improve generic strlen

Improve performance by handling another 16 bytes before entering the loop.
Use ADDHN in the loop to avoid SHRN+FMOV when it terminates.  Change final
size computation to avoid increasing latency.  On Neoverse V1 performance
of the random strlen benchmark improves by 4.6%.

Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
This commit is contained in:
Wilco Dijkstra 2024-08-07 14:43:47 +01:00
parent d5ce0e960d
commit 3dc426b642

View File

@@ -1,4 +1,5 @@
/* Copyright (C) 2012-2024 Free Software Foundation, Inc.
/* Generic optimized strlen using SIMD.
Copyright (C) 2012-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -56,38 +57,52 @@ ENTRY (STRLEN)
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
cbz synd, L(loop)
cbz synd, L(next16)
rbit synd, synd
clz result, synd
lsr result, result, 2
ret
.p2align 5
L(loop):
L(next16):
ldr data, [src, 16]
cmeq vhas_nul.16b, vdata.16b, 0
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbnz synd, L(loop_end)
ldr data, [src, 32]!
cmeq vhas_nul.16b, vdata.16b, 0
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
cbz synd, L(loop)
sub src, src, 16
L(loop_end):
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
sub result, src, srcin
fmov synd, dend
add src, src, 16
#ifndef __AARCH64EB__
rbit synd, synd
#endif
add result, result, 16
sub result, src, srcin
clz tmp, synd
add result, result, tmp, lsr 2
ret
.p2align 5
L(loop):
ldr data, [src, 32]!
cmeq vhas_nul.16b, vdata.16b, 0
addhn vend.8b, vhas_nul.8h, vhas_nul.8h
fmov synd, dend
cbnz synd, L(loop_end)
ldr data, [src, 16]
cmeq vhas_nul.16b, vdata.16b, 0
addhn vend.8b, vhas_nul.8h, vhas_nul.8h
fmov synd, dend
cbz synd, L(loop)
add src, src, 16
L(loop_end):
sub result, shift, src, lsl 2 /* (srcin - src) << 2. */
#ifndef __AARCH64EB__
rbit synd, synd
sub result, result, 3
#endif
clz tmp, synd
sub result, tmp, result
lsr result, result, 2
ret
END (STRLEN)
weak_alias (STRLEN, strlen)
libc_hidden_builtin_def (strlen)