From ad64510e5c74729108a02a6c22f03aa8ee07a8d3 Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Fri, 29 Jun 2018 22:45:59 +0530 Subject: [PATCH] aarch64,falkor: Use vector registers for memcpy Vector registers perform better than scalar register pairs for copying data so prefer them instead. This results in a time reduction of over 50% (i.e. 2x speed improvement) for some smaller sizes for memcpy-walk. Larger sizes show improvements of around 1% to 2%. memcpy-random shows a very small improvement, in the range of 1-2%. * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor): Use vector registers. (cherry picked from commit 0aec4c1d1801e8016ebe89281d16597e0557b8be) --- ChangeLog | 5 + sysdeps/aarch64/multiarch/memcpy_falkor.S | 135 ++++++++++------------ 2 files changed, 69 insertions(+), 71 deletions(-) diff --git a/ChangeLog b/ChangeLog index 65b46ef409..0482b0c435 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor): + Use vector registers. 
+ 2019-09-06 Siddhesh Poyarekar * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor): diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/multiarch/memcpy_falkor.S index 3b8601f87e..9cde8dcbd6 100644 --- a/sysdeps/aarch64/multiarch/memcpy_falkor.S +++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S @@ -29,25 +29,19 @@ #define dst x3 #define srcend x4 #define dstend x5 -#define A_l x6 -#define A_lw w6 -#define A_h x7 -#define A_hw w7 #define tmp1 x14 +#define A_x x6 +#define B_x x7 +#define A_w w6 +#define B_w w7 -#define B_l x8 -#define B_lw w8 -#define B_h x9 -#define C_l x10 -#define C_h x11 -#define D_l x12 -#define D_h x13 -#define E_l dst -#define E_h tmp1 -#define F_l src -#define F_h count -#define G_l srcend -#define G_h x15 +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 /* Copies are split into 3 main cases: @@ -67,9 +61,9 @@ bumping up the small copies up to 32 bytes allows us to do that without cost and also allows us to reduce the size of the prep code before loop64. - All copies are done only via two registers r6 and r7. This is to ensure - that all loads hit a single hardware prefetcher which can get correctly - trained to prefetch a single stream. + The copy loop uses only one register q0. This is to ensure that all loads + hit a single hardware prefetcher which can get correctly trained to prefetch + a single stream. The non-temporal stores help optimize cache utilization. */ @@ -80,29 +74,29 @@ ENTRY_ALIGN (__memcpy_falkor, 6) add srcend, src, count add dstend, dstin, count b.ls L(copy32) - ldp A_l, A_h, [src] + ldr A_q, [src] cmp count, 128 - stp A_l, A_h, [dstin] + str A_q, [dstin] b.hi L(copy_long) /* Medium copies: 33..128 bytes. 
*/ sub tmp1, count, 1 - ldp A_l, A_h, [src, 16] - ldp B_l, B_h, [srcend, -32] - ldp C_l, C_h, [srcend, -16] + ldr A_q, [src, 16] + ldr B_q, [srcend, -32] + ldr C_q, [srcend, -16] tbz tmp1, 6, 1f - ldp D_l, D_h, [src, 32] - ldp E_l, E_h, [src, 48] - stp D_l, D_h, [dstin, 32] - stp E_l, E_h, [dstin, 48] - ldp F_l, F_h, [srcend, -64] - ldp G_l, G_h, [srcend, -48] - stp F_l, F_h, [dstend, -64] - stp G_l, G_h, [dstend, -48] + ldr D_q, [src, 32] + ldr E_q, [src, 48] + str D_q, [dstin, 32] + str E_q, [dstin, 48] + ldr F_q, [srcend, -64] + ldr G_q, [srcend, -48] + str F_q, [dstend, -64] + str G_q, [dstend, -48] 1: - stp A_l, A_h, [dstin, 16] - stp B_l, B_h, [dstend, -32] - stp C_l, C_h, [dstend, -16] + str A_q, [dstin, 16] + str B_q, [dstend, -32] + str C_q, [dstend, -16] ret .p2align 4 @@ -111,44 +105,44 @@ L(copy32): /* 16-32 */ cmp count, 16 b.lo 1f - ldp A_l, A_h, [src] - ldp B_l, B_h, [srcend, -16] - stp A_l, A_h, [dstin] - stp B_l, B_h, [dstend, -16] + ldr A_q, [src] + ldr B_q, [srcend, -16] + str A_q, [dstin] + str B_q, [dstend, -16] ret .p2align 4 1: /* 8-15 */ tbz count, 3, 1f - ldr A_l, [src] - ldr B_l, [srcend, -8] - str A_l, [dstin] - str B_l, [dstend, -8] + ldr A_x, [src] + ldr B_x, [srcend, -8] + str A_x, [dstin] + str B_x, [dstend, -8] ret .p2align 4 1: /* 4-7 */ tbz count, 2, 1f - ldr A_lw, [src] - ldr B_lw, [srcend, -4] - str A_lw, [dstin] - str B_lw, [dstend, -4] + ldr A_w, [src] + ldr B_w, [srcend, -4] + str A_w, [dstin] + str B_w, [dstend, -4] ret .p2align 4 1: /* 2-3 */ tbz count, 1, 1f - ldrh A_lw, [src] - ldrh B_lw, [srcend, -2] - strh A_lw, [dstin] - strh B_lw, [dstend, -2] + ldrh A_w, [src] + ldrh B_w, [srcend, -2] + strh A_w, [dstin] + strh B_w, [dstend, -2] ret .p2align 4 1: /* 0-1 */ tbz count, 0, 1f - ldrb A_lw, [src] - strb A_lw, [dstin] + ldrb A_w, [src] + strb A_w, [dstin] 1: ret @@ -167,30 +161,29 @@ L(copy_long): add count, count, tmp1 L(loop64): - ldp A_l, A_h, [src, 16]! - stnp A_l, A_h, [dst, 16] - ldp A_l, A_h, [src, 16]! 
+ ldr A_q, [src, 16]! + str A_q, [dst, 16] + ldr A_q, [src, 16]! subs count, count, 64 - stnp A_l, A_h, [dst, 32] - ldp A_l, A_h, [src, 16]! - stnp A_l, A_h, [dst, 48] - ldp A_l, A_h, [src, 16]! - stnp A_l, A_h, [dst, 64] - add dst, dst, 64 + str A_q, [dst, 32] + ldr A_q, [src, 16]! + str A_q, [dst, 48] + ldr A_q, [src, 16]! + str A_q, [dst, 64]! b.hi L(loop64) /* Write the last full set of 64 bytes. The remainder is at most 64 bytes, so it is safe to always copy 64 bytes from the end even if there is just 1 byte left. */ L(last64): - ldp A_l, A_h, [srcend, -64] - stnp A_l, A_h, [dstend, -64] - ldp B_l, B_h, [srcend, -48] - stnp B_l, B_h, [dstend, -48] - ldp C_l, C_h, [srcend, -32] - stnp C_l, C_h, [dstend, -32] - ldp D_l, D_h, [srcend, -16] - stnp D_l, D_h, [dstend, -16] + ldr E_q, [srcend, -64] + str E_q, [dstend, -64] + ldr D_q, [srcend, -48] + str D_q, [dstend, -48] + ldr C_q, [srcend, -32] + str C_q, [dstend, -32] + ldr B_q, [srcend, -16] + str B_q, [dstend, -16] ret END (__memcpy_falkor)