mirror of
https://sourceware.org/git/glibc.git
synced 2024-11-23 05:20:06 +00:00
aarch64: Remove non-temporal load/stores from oryon-1's memset
The hardware architects have a new recommendation not to use non-temporal load/stores for memset. This patch removes this path. I found there was no difference in the memset speed with/without non-temporal load/stores either.

Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
This commit is contained in:
parent
eb5eeb4740
commit
e6590f0c86
@ -93,8 +93,6 @@ L(set_long):
|
||||
cmp count, 256
|
||||
ccmp valw, 0, 0, cs
|
||||
b.eq L(try_zva)
|
||||
cmp count, #32768
|
||||
b.hi L(set_long_with_nontemp)
|
||||
/* Small-size or non-zero memset does not use DC ZVA. */
|
||||
sub count, dstend, dst
|
||||
|
||||
@ -117,30 +115,6 @@ L(set_long):
|
||||
stp val, val, [dstend, -16]
|
||||
ret
|
||||
|
||||
L(set_long_with_nontemp):
|
||||
/* Small-size or non-zero memset does not use DC ZVA. */
|
||||
sub count, dstend, dst
|
||||
|
||||
/* Adjust count and bias for loop. By subtracting extra 1 from count,
|
||||
it is easy to use tbz instruction to check whether loop tailing
|
||||
count is less than 33 bytes, so as to bypass 2 unnecessary stps. */
|
||||
sub count, count, 64+16+1
|
||||
|
||||
1: stnp val, val, [dst, 16]
|
||||
stnp val, val, [dst, 32]
|
||||
stnp val, val, [dst, 48]
|
||||
stnp val, val, [dst, 64]
|
||||
add dst, dst, #64
|
||||
subs count, count, 64
|
||||
b.hs 1b
|
||||
|
||||
tbz count, 5, 1f /* Remaining count is less than 33 bytes? */
|
||||
stnp val, val, [dst, 16]
|
||||
stnp val, val, [dst, 32]
|
||||
1: stnp val, val, [dstend, -32]
|
||||
stnp val, val, [dstend, -16]
|
||||
ret
|
||||
|
||||
L(try_zva):
|
||||
/* Write the first and last 64 byte aligned block using stp rather
|
||||
than using DC ZVA as it is faster. */
|
||||
|
Loading…
Reference in New Issue
Block a user