mirror of
https://sourceware.org/git/glibc.git
synced 2024-12-12 14:20:13 +00:00
AArch64: Optimize strcpy
Unroll the main loop. Large strings are around 20% faster on modern CPUs.
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
(cherry picked from commit 349e48c01e)
This commit is contained in:
parent
b2cf48dd84
commit
20a2b9dc71
@ -30,7 +30,6 @@
|
|||||||
* MTE compatible.
|
* MTE compatible.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/* Arguments and results. */
|
|
||||||
#define dstin x0
|
#define dstin x0
|
||||||
#define srcin x1
|
#define srcin x1
|
||||||
#define result x0
|
#define result x0
|
||||||
@ -76,14 +75,14 @@ ENTRY (STRCPY)
|
|||||||
ld1 {vdata.16b}, [src]
|
ld1 {vdata.16b}, [src]
|
||||||
cmeq vhas_nul.16b, vdata.16b, 0
|
cmeq vhas_nul.16b, vdata.16b, 0
|
||||||
lsl shift, srcin, 2
|
lsl shift, srcin, 2
|
||||||
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
shrn vend.8b, vhas_nul.8h, 4
|
||||||
fmov synd, dend
|
fmov synd, dend
|
||||||
lsr synd, synd, shift
|
lsr synd, synd, shift
|
||||||
cbnz synd, L(tail)
|
cbnz synd, L(tail)
|
||||||
|
|
||||||
ldr dataq, [src, 16]!
|
ldr dataq, [src, 16]!
|
||||||
cmeq vhas_nul.16b, vdata.16b, 0
|
cmeq vhas_nul.16b, vdata.16b, 0
|
||||||
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
shrn vend.8b, vhas_nul.8h, 4
|
||||||
fmov synd, dend
|
fmov synd, dend
|
||||||
cbz synd, L(start_loop)
|
cbz synd, L(start_loop)
|
||||||
|
|
||||||
@ -102,13 +101,10 @@ ENTRY (STRCPY)
|
|||||||
IFSTPCPY (add result, dstin, len)
|
IFSTPCPY (add result, dstin, len)
|
||||||
ret
|
ret
|
||||||
|
|
||||||
.p2align 4,,8
|
|
||||||
L(tail):
|
L(tail):
|
||||||
rbit synd, synd
|
rbit synd, synd
|
||||||
clz len, synd
|
clz len, synd
|
||||||
lsr len, len, 2
|
lsr len, len, 2
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
L(less16):
|
L(less16):
|
||||||
tbz len, 3, L(less8)
|
tbz len, 3, L(less8)
|
||||||
sub tmp, len, 7
|
sub tmp, len, 7
|
||||||
@ -141,31 +137,37 @@ L(zerobyte):
|
|||||||
|
|
||||||
.p2align 4
|
.p2align 4
|
||||||
L(start_loop):
|
L(start_loop):
|
||||||
sub len, src, srcin
|
sub tmp, srcin, dstin
|
||||||
ldr dataq2, [srcin]
|
ldr dataq2, [srcin]
|
||||||
add dst, dstin, len
|
sub dst, src, tmp
|
||||||
str dataq2, [dstin]
|
str dataq2, [dstin]
|
||||||
|
|
||||||
.p2align 5
|
|
||||||
L(loop):
|
L(loop):
|
||||||
str dataq, [dst], 16
|
str dataq, [dst], 32
|
||||||
ldr dataq, [src, 16]!
|
ldr dataq, [src, 16]
|
||||||
|
cmeq vhas_nul.16b, vdata.16b, 0
|
||||||
|
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
|
||||||
|
fmov synd, dend
|
||||||
|
cbnz synd, L(loopend)
|
||||||
|
str dataq, [dst, -16]
|
||||||
|
ldr dataq, [src, 32]!
|
||||||
cmeq vhas_nul.16b, vdata.16b, 0
|
cmeq vhas_nul.16b, vdata.16b, 0
|
||||||
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
|
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
|
||||||
fmov synd, dend
|
fmov synd, dend
|
||||||
cbz synd, L(loop)
|
cbz synd, L(loop)
|
||||||
|
add dst, dst, 16
|
||||||
|
L(loopend):
|
||||||
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
||||||
fmov synd, dend
|
fmov synd, dend
|
||||||
|
sub dst, dst, 31
|
||||||
#ifndef __AARCH64EB__
|
#ifndef __AARCH64EB__
|
||||||
rbit synd, synd
|
rbit synd, synd
|
||||||
#endif
|
#endif
|
||||||
clz len, synd
|
clz len, synd
|
||||||
lsr len, len, 2
|
lsr len, len, 2
|
||||||
sub tmp, len, 15
|
add dst, dst, len
|
||||||
ldr dataq, [src, tmp]
|
ldr dataq, [dst, tmp]
|
||||||
str dataq, [dst, tmp]
|
str dataq, [dst]
|
||||||
IFSTPCPY (add result, dst, len)
|
IFSTPCPY (add result, dst, 15)
|
||||||
ret
|
ret
|
||||||
|
|
||||||
END (STRCPY)
|
END (STRCPY)
|
||||||
|
Loading…
Reference in New Issue
Block a user