mirror of
https://sourceware.org/git/glibc.git
aarch64,falkor: Use vector registers for memmove
Vector registers perform much better for moves compared to pairs of
registers on falkor, so use them instead.  This results in a time
reduction of up to 50% (i.e. 2x improvement) for a lot of the smaller
sizes, i.e. up to 1K in memmove-walk.  Improvements for larger sizes
are smaller, at about 1%-2%.

	* sysdeps/aarch64/multiarch/memmove_falkor.S
	(__memcpy_falkor): Use vector registers.
parent 7e8989d03b
commit ce76a5cb8d
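The core of the change is easiest to see outside of assembly. The C sketch below is purely illustrative (it is not glibc code; the function names are made up and the NEON intrinsics come from arm_neon.h): the old code moved each 32-byte block through pairs of 64-bit general-purpose registers (ldp/stp), while the patched code moves it through 128-bit vector registers (ldr/str of q-registers), halving the number of memory operations per block.

/* Illustrative only -- not glibc source.  Shows the register-pair vs.
   vector-register access pattern the patch below switches between.  */
#include <arm_neon.h>
#include <stdint.h>
#include <string.h>

/* Old pattern: four 8-byte transfers per 32 bytes, which the compiler
   lowers to ldp/stp of x-registers on AArch64.  */
static void
copy32_gpr_pairs (uint8_t *dst, const uint8_t *src)
{
  uint64_t a, b, c, d;
  memcpy (&a, src, 8);
  memcpy (&b, src + 8, 8);
  memcpy (&c, src + 16, 8);
  memcpy (&d, src + 24, 8);
  memcpy (dst, &a, 8);
  memcpy (dst + 8, &b, 8);
  memcpy (dst + 16, &c, 8);
  memcpy (dst + 24, &d, 8);
}

/* New pattern: two 16-byte transfers per 32 bytes, i.e. ldr/str of
   q-registers.  */
static void
copy32_vector (uint8_t *dst, const uint8_t *src)
{
  uint8x16_t lo = vld1q_u8 (src);
  uint8x16_t hi = vld1q_u8 (src + 16);
  vst1q_u8 (dst, lo);
  vst1q_u8 (dst + 16, hi);
}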
ChangeLog
@@ -1,3 +1,8 @@
+2018-06-29  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+	* sysdeps/aarch64/multiarch/memmove_falkor.S
+	(__memcpy_falkor): Use vector registers.
+
 2018-06-29  Martin Sebor  <msebor@redhat.com>
 
 	* manual/stdio.texi (Customizing Printf): Mention interaction
sysdeps/aarch64/multiarch/memmove_falkor.S
@@ -23,44 +23,36 @@
 #define dstin	x0
 #define src	x1
 #define count	x2
-#define dstlen	x3
 #define dst	x3
 #define srcend	x4
 #define dstend	x5
-#define A_l	x6
-#define A_lw	w6
-#define A_h	x7
-#define A_hw	w7
-#define B_l	x8
-#define B_lw	w8
-#define B_h	x9
-#define C_l	x10
-#define C_h	x11
-#define D_l	x12
-#define D_h	x13
-#define E_l	src
-#define E_h	count
-#define F_l	srcend
-#define F_h	dst
+#define A_x	x6
+#define B_x	x7
+#define A_w	w6
+#define B_w	w7
 #define tmp1	x14
 
-/* Alias with A_l and A_h to train the prefetcher.  */
-#define Q_l	x22
-#define Q_h	x23
+#define Q_q	q6
+#define A_q	q22
+#define B_q	q18
+#define C_q	q19
+#define D_q	q20
+#define E_q	q21
+#define F_q	q17
+#define G_q	q23
 
 /* RATIONALE:
 
-   The copy has 4 distinct parts:
-   * Small copies of 16 bytes and under
-   * Medium sized copies of 17-96 bytes
-   * Large copies where the source address is higher than the destination
+   The move has 4 distinct parts:
+   * Small moves of 16 bytes and under
+   * Medium sized moves of 17-96 bytes
+   * Large moves where the source address is higher than the destination
    (forward copies)
-   * Large copies where the destination address is higher than the source
+   * Large moves where the destination address is higher than the source
    (copy backward, or move).
 
-   We use only two registerpairs x6,x7 and x22,x23 for the copies and copy 32
-   bytes at a time to correctly train the hardware prefetcher for better
-   throughput.  */
+   We use only two registers q6 and q22 for the moves and move 32 bytes at a
+   time to correctly train the hardware prefetcher for better throughput.  */
 
 ENTRY_ALIGN (__memmove_falkor, 6)
 
 	sub	tmp1, dstin, src
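The "two registers, 32 bytes at a time" scheme in the rationale above corresponds to a software-pipelined loop: the next iteration's data is loaded while the current iteration's data is stored. A rough C analogue of the L(loop64) structure found further down in this patch (illustrative only, assuming n is a multiple of 32, n >= 32, and non-overlapping buffers; the real code also handles unaligned heads and tails):

/* Illustrative only -- not glibc source.  Two live vector values, loads
   issued one iteration ahead of the stores that consume them, 32 bytes
   retired per iteration.  */
#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

static void
copy_forward_32 (uint8_t *dst, const uint8_t *src, size_t n)
{
  uint8x16_t q_q = vld1q_u8 (src);       /* prime the pipeline */
  uint8x16_t a_q = vld1q_u8 (src + 16);
  src += 32;

  for (; n > 32; n -= 32)
    {
      vst1q_u8 (dst, q_q);               /* store chunk i      */
      q_q = vld1q_u8 (src);              /* load chunk i + 1   */
      vst1q_u8 (dst + 16, a_q);
      a_q = vld1q_u8 (src + 16);
      dst += 32;
      src += 32;
    }

  vst1q_u8 (dst, q_q);                   /* drain the pipeline */
  vst1q_u8 (dst + 16, a_q);
}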
@@ -77,17 +69,17 @@ ENTRY_ALIGN (__memmove_falkor, 6)
 
 	/* Medium copies: 17..96 bytes.  */
 	sub	tmp1, count, 1
-	ldp	A_l, A_h, [src]
+	ldr	A_q, [src]
 	tbnz	tmp1, 6, L(copy96)
-	ldp	D_l, D_h, [srcend, -16]
+	ldr	D_q, [srcend, -16]
 	tbz	tmp1, 5, 1f
-	ldp	B_l, B_h, [src, 16]
-	ldp	C_l, C_h, [srcend, -32]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstend, -32]
+	ldr	B_q, [src, 16]
+	ldr	C_q, [srcend, -32]
+	str	B_q, [dstin, 16]
+	str	C_q, [dstend, -32]
 1:
-	stp	A_l, A_h, [dstin]
-	stp	D_l, D_h, [dstend, -16]
+	str	A_q, [dstin]
+	str	D_q, [dstend, -16]
 	ret
 
 	.p2align 4
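The medium-size path above relies on the "copy both ends and let them overlap" idiom: every load is issued before any store, so the code is safe for overlapping memmove arguments, and the chunks copied from the two ends may overlap each other in the middle without harm. A minimal C sketch of the 17..32-byte case (illustrative only, not glibc source; the real path also covers 33..96 bytes with extra chunks):

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

static void
move_17_to_32 (uint8_t *dst, const uint8_t *src, size_t n)
{
  /* Caller guarantees 17 <= n <= 32.  */
  uint8x16_t head = vld1q_u8 (src);           /* like: ldr A_q, [src]         */
  uint8x16_t tail = vld1q_u8 (src + n - 16);  /* like: ldr D_q, [srcend, -16] */
  vst1q_u8 (dst, head);                       /* like: str A_q, [dstin]       */
  vst1q_u8 (dst + n - 16, tail);              /* like: str D_q, [dstend, -16] */
}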
@@ -95,52 +87,52 @@ ENTRY_ALIGN (__memmove_falkor, 6)
 L(copy16):
 	cmp	count, 8
 	b.lo	1f
-	ldr	A_l, [src]
-	ldr	A_h, [srcend, -8]
-	str	A_l, [dstin]
-	str	A_h, [dstend, -8]
+	ldr	A_x, [src]
+	ldr	B_x, [srcend, -8]
+	str	A_x, [dstin]
+	str	B_x, [dstend, -8]
 	ret
 	.p2align 4
 1:
 	/* 4-7 */
 	tbz	count, 2, 1f
-	ldr	A_lw, [src]
-	ldr	A_hw, [srcend, -4]
-	str	A_lw, [dstin]
-	str	A_hw, [dstend, -4]
+	ldr	A_w, [src]
+	ldr	B_w, [srcend, -4]
+	str	A_w, [dstin]
+	str	B_w, [dstend, -4]
 	ret
 	.p2align 4
 1:
 	/* 2-3 */
 	tbz	count, 1, 1f
-	ldrh	A_lw, [src]
-	ldrh	A_hw, [srcend, -2]
-	strh	A_lw, [dstin]
-	strh	A_hw, [dstend, -2]
+	ldrh	A_w, [src]
+	ldrh	B_w, [srcend, -2]
+	strh	A_w, [dstin]
+	strh	B_w, [dstend, -2]
 	ret
 	.p2align 4
 1:
 	/* 0-1 */
 	tbz	count, 0, 1f
-	ldrb	A_lw, [src]
-	strb	A_lw, [dstin]
+	ldrb	A_w, [src]
+	strb	A_w, [dstin]
 1:	ret
 
 	.p2align 4
 	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
 	   32 bytes from the end.  */
 L(copy96):
-	ldp	B_l, B_h, [src, 16]
-	ldp	C_l, C_h, [src, 32]
-	ldp	D_l, D_h, [src, 48]
-	ldp	E_l, E_h, [srcend, -32]
-	ldp	F_l, F_h, [srcend, -16]
-	stp	A_l, A_h, [dstin]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstin, 32]
-	stp	D_l, D_h, [dstin, 48]
-	stp	E_l, E_h, [dstend, -32]
-	stp	F_l, F_h, [dstend, -16]
+	ldr	B_q, [src, 16]
+	ldr	C_q, [src, 32]
+	ldr	D_q, [src, 48]
+	ldr	E_q, [srcend, -32]
+	ldr	F_q, [srcend, -16]
+	str	A_q, [dstin]
+	str	B_q, [dstin, 16]
+	str	C_q, [dstin, 32]
+	str	D_q, [dstin, 48]
+	str	E_q, [dstend, -32]
+	str	F_q, [dstend, -16]
 	ret
 
 	/* Align SRC to 16 byte alignment so that we don't cross cache line
@@ -150,92 +142,83 @@ L(copy96):
 
 	.p2align 4
 L(copy_long):
-	mov	B_l, Q_l
-	mov	B_h, Q_h
-	ldp	A_l, A_h, [src]
+	ldr	A_q, [src]
 	and	tmp1, src, 15
 	bic	src, src, 15
 	sub	dst, dstin, tmp1
 	add	count, count, tmp1	/* Count is now 16 too large.  */
-	ldp	Q_l, Q_h, [src, 16]!
-	stp	A_l, A_h, [dstin]
-	ldp	A_l, A_h, [src, 16]!
+	ldr	Q_q, [src, 16]!
+	str	A_q, [dstin]
+	ldr	A_q, [src, 16]!
 	subs	count, count, 32 + 64 + 16	/* Test and readjust count.  */
 	b.ls	L(last64)
 
 L(loop64):
 	subs	count, count, 32
-	stp	Q_l, Q_h, [dst, 16]
-	ldp	Q_l, Q_h, [src, 16]!
-	stp	A_l, A_h, [dst, 32]!
-	ldp	A_l, A_h, [src, 16]!
+	str	Q_q, [dst, 16]
+	ldr	Q_q, [src, 16]!
+	str	A_q, [dst, 32]!
+	ldr	A_q, [src, 16]!
 	b.hi	L(loop64)
 
 	/* Write the last full set of 64 bytes.  The remainder is at most 64
 	   bytes and at least 33 bytes, so it is safe to always copy 64 bytes
 	   from the end.  */
 L(last64):
-	ldp	C_l, C_h, [srcend, -64]
-	stp	Q_l, Q_h, [dst, 16]
-	mov	Q_l, B_l
-	mov	Q_h, B_h
-	ldp	B_l, B_h, [srcend, -48]
-	stp	A_l, A_h, [dst, 32]
-	ldp	A_l, A_h, [srcend, -32]
-	ldp	D_l, D_h, [srcend, -16]
-	stp	C_l, C_h, [dstend, -64]
-	stp	B_l, B_h, [dstend, -48]
-	stp	A_l, A_h, [dstend, -32]
-	stp	D_l, D_h, [dstend, -16]
+	ldr	C_q, [srcend, -64]
+	str	Q_q, [dst, 16]
+	ldr	B_q, [srcend, -48]
+	str	A_q, [dst, 32]
+	ldr	A_q, [srcend, -32]
+	ldr	D_q, [srcend, -16]
+	str	C_q, [dstend, -64]
+	str	B_q, [dstend, -48]
+	str	A_q, [dstend, -32]
+	str	D_q, [dstend, -16]
 	ret
 
 	.p2align 4
L(move_long):
 	cbz	tmp1, 3f
 
-	mov	B_l, Q_l
-	mov	B_h, Q_h
-
 	/* Align SRCEND to 16 byte alignment so that we don't cross cache line
 	   boundaries on both loads and stores.  There are at least 96 bytes
 	   to copy, so copy 16 bytes unaligned and then align.  The loop
 	   copies 32 bytes per iteration and prefetches one iteration ahead.  */
 
-	ldp	A_l, A_h, [srcend, -16]
+	ldr	A_q, [srcend, -16]
 	and	tmp1, srcend, 15
 	sub	srcend, srcend, tmp1
-	ldp	Q_l, Q_h, [srcend, -16]!
-	stp	A_l, A_h, [dstend, -16]
+	ldr	Q_q, [srcend, -16]!
+	str	A_q, [dstend, -16]
 	sub	count, count, tmp1
-	ldp	A_l, A_h, [srcend, -16]!
+	ldr	A_q, [srcend, -16]!
 	sub	dstend, dstend, tmp1
 	subs	count, count, 32 + 64
 	b.ls	2f
 
 1:
 	subs	count, count, 32
-	stp	Q_l, Q_h, [dstend, -16]
-	ldp	Q_l, Q_h, [srcend, -16]!
-	stp	A_l, A_h, [dstend, -32]!
-	ldp	A_l, A_h, [srcend, -16]!
+	str	Q_q, [dstend, -16]
+	ldr	Q_q, [srcend, -16]!
+	str	A_q, [dstend, -32]!
+	ldr	A_q, [srcend, -16]!
 	b.hi	1b
 
 	/* Write the last full set of 64 bytes.  The remainder is at most 64
 	   bytes and at least 33 bytes, so it is safe to always copy 64 bytes
 	   from the start.  */
 2:
-	ldp	C_l, C_h, [src, 48]
-	stp	Q_l, Q_h, [dstend, -16]
-	mov	Q_l, B_l
-	mov	Q_h, B_h
-	ldp	B_l, B_h, [src, 32]
-	stp	A_l, A_h, [dstend, -32]
-	ldp	A_l, A_h, [src, 16]
-	ldp	D_l, D_h, [src]
-	stp	C_l, C_h, [dstin, 48]
-	stp	B_l, B_h, [dstin, 32]
-	stp	A_l, A_h, [dstin, 16]
-	stp	D_l, D_h, [dstin]
+	ldr	C_q, [src, 48]
+	str	Q_q, [dstend, -16]
+	ldr	B_q, [src, 32]
+	str	A_q, [dstend, -32]
+	ldr	A_q, [src, 16]
+	ldr	D_q, [src]
+	str	C_q, [dstin, 48]
+	str	B_q, [dstin, 32]
+	str	A_q, [dstin, 16]
+	str	D_q, [dstin]
 3:	ret
 
 END (__memmove_falkor)
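As the rationale above notes, large moves where the destination address is higher than the source take the backward path (L(move_long)): copying forward there would overwrite source bytes before they are read. A minimal C sketch of that decision (illustrative only, not the glibc implementation, which copies in 32-byte vector chunks rather than byte by byte):

#include <stddef.h>
#include <stdint.h>

static void
byte_move (uint8_t *dst, const uint8_t *src, size_t n)
{
  if (dst <= src || dst >= src + n)
    for (size_t i = 0; i < n; i++)      /* no harmful overlap: copy forward  */
      dst[i] = src[i];
  else
    for (size_t i = n; i > 0; i--)      /* dst inside src region: go backward */
      dst[i - 1] = src[i - 1];
}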