Mirror of https://sourceware.org/git/glibc.git (synced 2024-12-11 05:40:06 +00:00)
AArch64: Optimize strcpy
Unroll the main loop. Large strings are around 20% faster on modern CPUs.
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
(cherry picked from commit 349e48c01e)
This commit is contained in:
parent b2cf48dd84
commit 20a2b9dc71
@@ -30,7 +30,6 @@
|
||||
* MTE compatible.
|
||||
*/
|
||||
|
||||
/* Arguments and results. */
|
||||
#define dstin x0
|
||||
#define srcin x1
|
||||
#define result x0
|
||||
@@ -76,14 +75,14 @@ ENTRY (STRCPY)
|
||||
ld1 {vdata.16b}, [src]
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
lsl shift, srcin, 2
|
||||
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
||||
shrn vend.8b, vhas_nul.8h, 4
|
||||
fmov synd, dend
|
||||
lsr synd, synd, shift
|
||||
cbnz synd, L(tail)
|
||||
|
||||
ldr dataq, [src, 16]!
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
||||
shrn vend.8b, vhas_nul.8h, 4
|
||||
fmov synd, dend
|
||||
cbz synd, L(start_loop)
|
||||
|
||||
@@ -102,13 +101,10 @@ ENTRY (STRCPY)
|
||||
IFSTPCPY (add result, dstin, len)
|
||||
ret
|
||||
|
||||
.p2align 4,,8
|
||||
L(tail):
|
||||
rbit synd, synd
|
||||
clz len, synd
|
||||
lsr len, len, 2
|
||||
|
||||
.p2align 4
|
||||
L(less16):
|
||||
tbz len, 3, L(less8)
|
||||
sub tmp, len, 7
|
||||
@@ -141,31 +137,37 @@ L(zerobyte):
|
||||
|
||||
.p2align 4
|
||||
L(start_loop):
|
||||
sub len, src, srcin
|
||||
sub tmp, srcin, dstin
|
||||
ldr dataq2, [srcin]
|
||||
add dst, dstin, len
|
||||
sub dst, src, tmp
|
||||
str dataq2, [dstin]
|
||||
|
||||
.p2align 5
|
||||
L(loop):
|
||||
str dataq, [dst], 16
|
||||
ldr dataq, [src, 16]!
|
||||
str dataq, [dst], 32
|
||||
ldr dataq, [src, 16]
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
|
||||
fmov synd, dend
|
||||
cbnz synd, L(loopend)
|
||||
str dataq, [dst, -16]
|
||||
ldr dataq, [src, 32]!
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
|
||||
fmov synd, dend
|
||||
cbz synd, L(loop)
|
||||
|
||||
add dst, dst, 16
|
||||
L(loopend):
|
||||
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
||||
fmov synd, dend
|
||||
sub dst, dst, 31
|
||||
#ifndef __AARCH64EB__
|
||||
rbit synd, synd
|
||||
#endif
|
||||
clz len, synd
|
||||
lsr len, len, 2
|
||||
sub tmp, len, 15
|
||||
ldr dataq, [src, tmp]
|
||||
str dataq, [dst, tmp]
|
||||
IFSTPCPY (add result, dst, len)
|
||||
add dst, dst, len
|
||||
ldr dataq, [dst, tmp]
|
||||
str dataq, [dst]
|
||||
IFSTPCPY (add result, dst, 15)
|
||||
ret
|
||||
|
||||
END (STRCPY)
|
||||
|
Loading…
Reference in New Issue
Block a user