aarch64,falkor: Use vector registers for memcpy

Vector registers perform better than scalar register pairs for copying
data, so prefer them instead.  This results in a time reduction of over
50% (i.e. a 2x speed improvement) for some smaller sizes on the
memcpy-walk benchmark.  Larger sizes show improvements of around 1% to
2%.  memcpy-random shows a very small improvement, in the range of
1-2%.

	* sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):
	Use vector registers.

(cherry picked from commit 0aec4c1d18)
Author:    Siddhesh Poyarekar  2018-06-29 22:45:59 +05:30
Committer: Wilco Dijkstra
Parent:    d3c05bfffa
Commit:    ad64510e5c
2 changed files with 69 additions and 71 deletions
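The core of the change, shown here as a minimal illustrative sketch (not code taken verbatim from the patch): each 16-byte chunk that was previously moved through a pair of 64-bit scalar registers now moves through a single 128-bit vector (q) register.  Register numbers follow the file's defines (src is x1, dst is x3, A_l/A_h are x6/x7 in the old code, A_q is q0 in the new code):

	/* Before: 16 bytes per step through a scalar register pair.  */
	ldp	x6, x7, [x1], 16	/* load two 8-byte halves from src */
	stp	x6, x7, [x3], 16	/* store them to dst */

	/* After: the same 16 bytes through one 128-bit vector register.  */
	ldr	q0, [x1], 16		/* load 16 bytes from src into q0 */
	str	q0, [x3], 16		/* store q0 to dst */

In the patch itself the loop64 body keeps all loads in A_q so that, as the updated comment notes, they train a single hardware prefetcher stream.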

--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+	* sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):
+	Use vector registers.
+
 2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
 
 	* sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):

--- a/sysdeps/aarch64/multiarch/memcpy_falkor.S
+++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S
@@ -29,25 +29,19 @@
 #define dst	x3
 #define srcend	x4
 #define dstend	x5
-#define A_l	x6
-#define A_lw	w6
-#define A_h	x7
-#define A_hw	w7
 #define tmp1	x14
+#define A_x	x6
+#define B_x	x7
+#define A_w	w6
+#define B_w	w7
 
-#define B_l	x8
-#define B_lw	w8
-#define B_h	x9
-#define C_l	x10
-#define C_h	x11
-#define D_l	x12
-#define D_h	x13
-#define E_l	dst
-#define E_h	tmp1
-#define F_l	src
-#define F_h	count
-#define G_l	srcend
-#define G_h	x15
+#define A_q	q0
+#define B_q	q1
+#define C_q	q2
+#define D_q	q3
+#define E_q	q4
+#define F_q	q5
+#define G_q	q6
 
 /* Copies are split into 3 main cases:
 
@@ -67,9 +61,9 @@
    bumping up the small copies up to 32 bytes allows us to do that without
    cost and also allows us to reduce the size of the prep code before loop64.
 
-   All copies are done only via two registers r6 and r7.  This is to ensure
-   that all loads hit a single hardware prefetcher which can get correctly
-   trained to prefetch a single stream.
+   The copy loop uses only one register q0.  This is to ensure that all loads
+   hit a single hardware prefetcher which can get correctly trained to prefetch
+   a single stream.
 
    The non-temporal stores help optimize cache utilization.  */
 
@@ -80,29 +74,29 @@ ENTRY_ALIGN (__memcpy_falkor, 6)
 	add	srcend, src, count
 	add	dstend, dstin, count
 	b.ls	L(copy32)
-	ldp	A_l, A_h, [src]
+	ldr	A_q, [src]
 	cmp	count, 128
-	stp	A_l, A_h, [dstin]
+	str	A_q, [dstin]
 	b.hi	L(copy_long)
 
 	/* Medium copies: 33..128 bytes.  */
 	sub	tmp1, count, 1
-	ldp	A_l, A_h, [src, 16]
-	ldp	B_l, B_h, [srcend, -32]
-	ldp	C_l, C_h, [srcend, -16]
+	ldr	A_q, [src, 16]
+	ldr	B_q, [srcend, -32]
+	ldr	C_q, [srcend, -16]
 	tbz	tmp1, 6, 1f
-	ldp	D_l, D_h, [src, 32]
-	ldp	E_l, E_h, [src, 48]
-	stp	D_l, D_h, [dstin, 32]
-	stp	E_l, E_h, [dstin, 48]
-	ldp	F_l, F_h, [srcend, -64]
-	ldp	G_l, G_h, [srcend, -48]
-	stp	F_l, F_h, [dstend, -64]
-	stp	G_l, G_h, [dstend, -48]
+	ldr	D_q, [src, 32]
+	ldr	E_q, [src, 48]
+	str	D_q, [dstin, 32]
+	str	E_q, [dstin, 48]
+	ldr	F_q, [srcend, -64]
+	ldr	G_q, [srcend, -48]
+	str	F_q, [dstend, -64]
+	str	G_q, [dstend, -48]
 1:
-	stp	A_l, A_h, [dstin, 16]
-	stp	B_l, B_h, [dstend, -32]
-	stp	C_l, C_h, [dstend, -16]
+	str	A_q, [dstin, 16]
+	str	B_q, [dstend, -32]
+	str	C_q, [dstend, -16]
 	ret
 
 	.p2align 4
@@ -111,44 +105,44 @@ L(copy32):
 	/* 16-32 */
 	cmp	count, 16
 	b.lo	1f
-	ldp	A_l, A_h, [src]
-	ldp	B_l, B_h, [srcend, -16]
-	stp	A_l, A_h, [dstin]
-	stp	B_l, B_h, [dstend, -16]
+	ldr	A_q, [src]
+	ldr	B_q, [srcend, -16]
+	str	A_q, [dstin]
+	str	B_q, [dstend, -16]
 	ret
 	.p2align 4
 1:
 	/* 8-15 */
 	tbz	count, 3, 1f
-	ldr	A_l, [src]
-	ldr	B_l, [srcend, -8]
-	str	A_l, [dstin]
-	str	B_l, [dstend, -8]
+	ldr	A_x, [src]
+	ldr	B_x, [srcend, -8]
+	str	A_x, [dstin]
+	str	B_x, [dstend, -8]
 	ret
 	.p2align 4
 1:
 	/* 4-7 */
 	tbz	count, 2, 1f
-	ldr	A_lw, [src]
-	ldr	B_lw, [srcend, -4]
-	str	A_lw, [dstin]
-	str	B_lw, [dstend, -4]
+	ldr	A_w, [src]
+	ldr	B_w, [srcend, -4]
+	str	A_w, [dstin]
+	str	B_w, [dstend, -4]
 	ret
 	.p2align 4
 1:
 	/* 2-3 */
 	tbz	count, 1, 1f
-	ldrh	A_lw, [src]
-	ldrh	B_lw, [srcend, -2]
-	strh	A_lw, [dstin]
-	strh	B_lw, [dstend, -2]
+	ldrh	A_w, [src]
+	ldrh	B_w, [srcend, -2]
+	strh	A_w, [dstin]
+	strh	B_w, [dstend, -2]
 	ret
 	.p2align 4
 1:
 	/* 0-1 */
 	tbz	count, 0, 1f
-	ldrb	A_lw, [src]
-	strb	A_lw, [dstin]
+	ldrb	A_w, [src]
+	strb	A_w, [dstin]
 1:
 	ret
 
@@ -167,30 +161,29 @@ L(copy_long):
 	add	count, count, tmp1
 
 L(loop64):
-	ldp	A_l, A_h, [src, 16]!
-	stnp	A_l, A_h, [dst, 16]
-	ldp	A_l, A_h, [src, 16]!
+	ldr	A_q, [src, 16]!
+	str	A_q, [dst, 16]
+	ldr	A_q, [src, 16]!
 	subs	count, count, 64
-	stnp	A_l, A_h, [dst, 32]
-	ldp	A_l, A_h, [src, 16]!
-	stnp	A_l, A_h, [dst, 48]
-	ldp	A_l, A_h, [src, 16]!
-	stnp	A_l, A_h, [dst, 64]
-	add	dst, dst, 64
+	str	A_q, [dst, 32]
+	ldr	A_q, [src, 16]!
+	str	A_q, [dst, 48]
+	ldr	A_q, [src, 16]!
+	str	A_q, [dst, 64]!
 	b.hi	L(loop64)
 
 	/* Write the last full set of 64 bytes.  The remainder is at most 64
 	   bytes, so it is safe to always copy 64 bytes from the end even if
 	   there is just 1 byte left.  */
L(last64):
-	ldp	A_l, A_h, [srcend, -64]
-	stnp	A_l, A_h, [dstend, -64]
-	ldp	B_l, B_h, [srcend, -48]
-	stnp	B_l, B_h, [dstend, -48]
-	ldp	C_l, C_h, [srcend, -32]
-	stnp	C_l, C_h, [dstend, -32]
-	ldp	D_l, D_h, [srcend, -16]
-	stnp	D_l, D_h, [dstend, -16]
+	ldr	E_q, [srcend, -64]
+	str	E_q, [dstend, -64]
+	ldr	D_q, [srcend, -48]
+	str	D_q, [dstend, -48]
+	ldr	C_q, [srcend, -32]
+	str	C_q, [dstend, -32]
+	ldr	B_q, [srcend, -16]
+	str	B_q, [dstend, -16]
 	ret
 
 END (__memcpy_falkor)