mirror of
https://sourceware.org/git/glibc.git
synced 2025-01-05 17:11:06 +00:00
b9f145df85
Increase the upper bound on medium cases from 96 to 128 bytes. Now, up to 128 bytes are copied unrolled. Increase the upper bound on small cases from 16 to 32 bytes so that copies of 17-32 bytes are not impacted by the larger medium case. Benchmarking: The attached figures show relative timing difference with respect to 'memcpy_generic', which is the existing implementation. 'memcpy_med_128' denotes the version of memcpy_generic with only the medium case enlarged. The 'memcpy_med_128_small_32' numbers are for the version of memcpy_generic submitted in this patch, which has both medium and small cases enlarged. The figures were generated using the script from: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00563.html Depending on the platform, the performance improvement in the bench-memcpy-random.c benchmark ranges from 6% to 20% between the original and final version of memcpy.S. Tested against the GLIBC testsuite and randomized tests.
280 lines
6.7 KiB
ArmAsm
280 lines
6.7 KiB
ArmAsm
/* Copyright (C) 2012-2019 Free Software Foundation, Inc.
|
|
|
|
This file is part of the GNU C Library.
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with the GNU C Library. If not, see
|
|
<https://www.gnu.org/licenses/>. */
|
|
|
|
#include <sysdep.h>
|
|
|
|
/* Assumptions:
|
|
*
|
|
* ARMv8-a, AArch64, unaligned accesses.
|
|
*
|
|
*/
|
|
|
|
/* Register aliases used by the memmove/memcpy code in this file.

   dstin, src and count are x0, x1 and x2.  None of the instructions
   written out in this file modifies x0 (the DELOUSE macro is defined
   elsewhere), so dstin still holds the destination pointer at every ret.  */
#define dstin x0
|
|
#define src x1
|
|
#define count x2
|
|
/* dst is the 16-byte-aligned store cursor of the large forward copy.
   srcend and dstend are set to src + count and dstin + count.  */
#define dst x3
|
|
#define srcend x4
|
|
#define dstend x5
|
|
/* Data registers.  Each X_l/X_h pair is moved by a single ldp/stp, i.e.
   16 bytes at a time.  The *_lw/*_hw names are the 32-bit views (w6, w7,
   w8) of A_l, A_h and B_l, used for the 4-byte and 1-byte accesses of the
   small-copy code.  */
#define A_l x6
|
|
#define A_lw w6
|
|
#define A_h x7
|
|
#define A_hw w7
|
|
#define B_l x8
|
|
#define B_lw w8
|
|
#define B_h x9
|
|
#define C_l x10
|
|
#define C_h x11
|
|
#define D_l x12
|
|
#define D_h x13
|
|
/* E_l is x14, the same register as tmp1 below: E must not be loaded
   while the value in tmp1 is still needed.  */
#define E_l x14
|
|
#define E_h x15
|
|
#define F_l x16
|
|
#define F_h x17
|
|
/* G and H have no registers of their own: they reuse count, dst, src and
   srcend.  Loading G or H therefore destroys those values, so this may
   only be done once they are no longer needed.  */
#define G_l count
|
|
#define G_h dst
|
|
#define H_l src
|
|
#define H_h srcend
|
|
/* Scratch register; shares x14 with E_l (see above).  */
#define tmp1 x14
|
|
|
|
/* Copies are split into 3 main cases: small copies of up to 32 bytes,
|
|
medium copies of 33..128 bytes which are fully unrolled. Large copies
|
|
of more than 128 bytes align the destination and use an unrolled loop
|
|
processing 64 bytes per iteration.
|
|
In order to share code with memmove, small and medium copies read all
|
|
data before writing, allowing any kind of overlap. So small, medium
|
|
and large backwards memmoves are handled by falling through into memcpy.
|
|
Overlapping large forward memmoves use a loop that copies backwards.
|
|
*/
|
|
|
|
/* Symbol names of the two entry points.  Each is only defined here if it
   has not been defined before this point, so the same code can be built
   under other names by predefining MEMMOVE and/or MEMCPY.  */
#ifndef MEMMOVE
|
|
# define MEMMOVE memmove
|
|
#endif
|
|
#ifndef MEMCPY
|
|
# define MEMCPY memcpy
|
|
#endif
|
|
|
|
/* MEMMOVE entry point.
   In:  dstin (x0) = destination, src (x1) = source, count (x2) = bytes.
   Only decides whether the backward-copying L(move_long) path is needed;
   every other case continues into MEMCPY, which follows immediately.
   NOTE(review): ENTRY_ALIGN, DELOUSE, END and libc_hidden_builtin_def are
   macros defined outside this file; the second ENTRY_ALIGN argument is
   presumably a power-of-two alignment (2^6 = 64 bytes) - confirm.  */
ENTRY_ALIGN (MEMMOVE, 6)
|
|
|
|
/* NOTE(review): DELOUSE (n) presumably sanitises argument register n
   (e.g. for ABIs with 32-bit pointers) - confirm against sysdep.h.  */
DELOUSE (0)
|
|
DELOUSE (1)
|
|
DELOUSE (2)
|
|
|
|
/* tmp1 = dstin - src.  Treated as unsigned below, so dstin < src yields
   a very large value.  */
sub tmp1, dstin, src
|
|
cmp count, 128
|
|
/* If count > 128 (hi): compare tmp1 with count.  Otherwise the flags are
   set to NZCV = 0b0010 (carry set), which makes the b.lo below fall
   through.  */
ccmp tmp1, count, 2, hi
|
|
/* Taken only when count > 128 and (unsigned) dstin - src < count, i.e.
   the destination starts inside the source buffer (or equals it), where
   a forward copy would overwrite source bytes before reading them.  */
b.lo L(move_long)
|
|
|
|
/* Common case falls through into memcpy. */
|
|
END (MEMMOVE)
|
|
libc_hidden_builtin_def (MEMMOVE)
|
|
/* MEMCPY entry point; MEMMOVE also falls through to here.
   In:  dstin (x0) = destination, src (x1) = source, count (x2) = bytes.
   Dispatches on count: 0..32 -> L(copy32), 33..64 -> handled inline
   below, 65..128 -> L(copy128), more than 128 -> L(copy_long).  */
ENTRY (MEMCPY)
|
|
|
|
DELOUSE (0)
|
|
DELOUSE (1)
|
|
DELOUSE (2)
|
|
|
|
/* Prefetch hint for the start of the source.  */
prfm PLDL1KEEP, [src]
|
|
/* srcend/dstend = one past the last source/destination byte.  */
add srcend, src, count
|
|
add dstend, dstin, count
|
|
cmp count, 32
|
|
b.ls L(copy32)
|
|
cmp count, 128
|
|
b.hi L(copy_long)
|
|
|
|
/* Medium copies: 33..128 bytes. */
|
|
/* Load the first 32 and the last 32 bytes.  For count < 64 the two
   ranges overlap in the middle, which is harmless.  L(copy128) relies
   on A..D having been loaded here.  */
ldp A_l, A_h, [src]
|
|
ldp B_l, B_h, [src, 16]
|
|
ldp C_l, C_h, [srcend, -32]
|
|
ldp D_l, D_h, [srcend, -16]
|
|
cmp count, 64
|
|
b.hi L(copy128)
|
|
/* 33..64 bytes: every load above precedes the first store, so
   overlapping source and destination buffers are copied correctly.  */
stp A_l, A_h, [dstin]
|
|
stp B_l, B_h, [dstin, 16]
|
|
stp C_l, C_h, [dstend, -32]
|
|
stp D_l, D_h, [dstend, -16]
|
|
ret
|
|
|
|
.p2align 4
|
|
/* Small copies: 0..32 bytes. */
|
|
/* Expects srcend and dstend to be set by the entry code.  Every size
   class below loads from the start and from the end of the source
   before storing anything, so the two accesses may overlap each other
   and the buffers may overlap as well.  */
L(copy32):
|
|
/* 16-32 bytes. */
|
|
cmp count, 16
|
|
b.lo 1f
|
|
/* First 16 and last 16 bytes.  */
ldp A_l, A_h, [src]
|
|
ldp B_l, B_h, [srcend, -16]
|
|
stp A_l, A_h, [dstin]
|
|
stp B_l, B_h, [dstend, -16]
|
|
ret
|
|
.p2align 4
|
|
1:
|
|
/* 8-15 bytes. */
|
|
/* count < 16 here, so bit 3 set means 8 <= count <= 15.  */
tbz count, 3, 1f
|
|
/* First 8 and last 8 bytes.  */
ldr A_l, [src]
|
|
ldr A_h, [srcend, -8]
|
|
str A_l, [dstin]
|
|
str A_h, [dstend, -8]
|
|
ret
|
|
.p2align 4
|
|
1:
|
|
/* 4-7 bytes. */
|
|
/* count < 8 here, so bit 2 set means 4 <= count <= 7.  */
tbz count, 2, 1f
|
|
/* First 4 and last 4 bytes, using the 32-bit register views.  */
ldr A_lw, [src]
|
|
ldr A_hw, [srcend, -4]
|
|
str A_lw, [dstin]
|
|
str A_hw, [dstend, -4]
|
|
ret
|
|
|
|
/* Copy 0..3 bytes. Use a branchless sequence that copies the same
|
|
byte 3 times if count==1, or the 2nd byte twice if count==2. */
|
|
1:
|
|
/* count == 0: nothing to copy.  */
cbz count, 2f
|
|
/* tmp1 = count / 2: 0 for count == 1, 1 for count == 2 or 3.  */
lsr tmp1, count, 1
|
|
/* Bytes at offsets 0, count - 1 and tmp1 cover all of 1..3 bytes.  */
ldrb A_lw, [src]
|
|
ldrb A_hw, [srcend, -1]
|
|
ldrb B_lw, [src, tmp1]
|
|
strb A_lw, [dstin]
|
|
strb B_lw, [dstin, tmp1]
|
|
strb A_hw, [dstend, -1]
|
|
2: ret
|
|
|
|
.p2align 4
|
|
/* Copy 65..128 bytes. Copy 64 bytes from the start and
|
|
64 bytes from the end. */
|
|
/* On entry A/B already hold the first 32 bytes and C/D the last 32
   bytes, loaded by the medium-copy code that branches here.  All 128
   bytes are in registers before the first store, so overlapping buffers
   are copied correctly.  */
L(copy128):
|
|
ldp E_l, E_h, [src, 32]
|
|
ldp F_l, F_h, [src, 48]
|
|
/* G is count/dst: count is not used again on this path.  */
ldp G_l, G_h, [srcend, -64]
|
|
/* H is src/srcend, so this load overwrites its own base register
   (srcend).  It has to remain the last load that uses src or srcend.  */
ldp H_l, H_h, [srcend, -48]
|
|
/* First 64 bytes, addressed from the start of the destination.  */
stp A_l, A_h, [dstin]
|
|
stp B_l, B_h, [dstin, 16]
|
|
stp E_l, E_h, [dstin, 32]
|
|
stp F_l, F_h, [dstin, 48]
|
|
/* Last 64 bytes, addressed from the end of the destination.  */
stp G_l, G_h, [dstend, -64]
|
|
stp H_l, H_h, [dstend, -48]
|
|
stp C_l, C_h, [dstend, -32]
|
|
stp D_l, D_h, [dstend, -16]
|
|
ret
|
|
|
|
/* Align DST to 16 byte alignment so that we don't cross cache line
|
|
boundaries on both loads and stores. There are more than 128 bytes
|
|
to copy, so copy 16 bytes unaligned and then align. The loop
|
|
copies 64 bytes per iteration and loads one iteration ahead. */
|
|
|
|
.p2align 4
|
|
/* Forward copy of more than 128 bytes.  Expects srcend and dstend to be
   set by the entry code.  */
L(copy_long):
|
|
/* tmp1 = misalignment of dstin; dst = dstin rounded down to 16.  */
and tmp1, dstin, 15
|
|
bic dst, dstin, 15
|
|
/* First 16 source bytes, stored unaligned at dstin further down.  */
ldp D_l, D_h, [src]
|
|
/* Move src back by the same amount as dst, so that [src, N] and
   [dst, N] keep referring to the same byte of the copy.  */
sub src, src, tmp1
|
|
add count, count, tmp1 /* Count is now 16 too large. */
|
|
ldp A_l, A_h, [src, 16]
|
|
stp D_l, D_h, [dstin]
|
|
ldp B_l, B_h, [src, 32]
|
|
ldp C_l, C_h, [src, 48]
|
|
/* Pre-indexed: src += 64, then load from the new src.  */
ldp D_l, D_h, [src, 64]!
|
|
/* 16 for the head stored above, 64 for the data now held in A..D and 64
   for the block that L(last64) always copies from the end.  */
subs count, count, 128 + 16 /* Test and readjust count. */
|
|
/* Taken when at most 64 bytes remain beyond those held in A..D.  */
b.ls L(last64)
|
|
/* Each iteration stores the 64 bytes held in A..D and reloads A..D with
   the next 64 bytes; the pre-indexed D accesses advance dst and src.  */
L(loop64):
|
|
stp A_l, A_h, [dst, 16]
|
|
ldp A_l, A_h, [src, 16]
|
|
stp B_l, B_h, [dst, 32]
|
|
ldp B_l, B_h, [src, 32]
|
|
stp C_l, C_h, [dst, 48]
|
|
ldp C_l, C_h, [src, 48]
|
|
stp D_l, D_h, [dst, 64]!
|
|
ldp D_l, D_h, [src, 64]!
|
|
subs count, count, 64
|
|
b.hi L(loop64)
|
|
|
|
/* Write the last full set of 64 bytes. The remainder is at most 64
|
|
bytes, so it is safe to always copy 64 bytes from the end even if
|
|
there is just 1 byte left. */
|
|
/* NOTE(review): the loads from srcend are interleaved with the stores of
   the pending A..D data.  The order looks chosen so that, for the
   overlapping dstin < src memmove case that is routed here, no load
   reads bytes that an earlier store has overwritten - confirm before
   reordering.  E_l reuses tmp1's register; tmp1 is dead by now.  */
L(last64):
|
|
ldp E_l, E_h, [srcend, -64]
|
|
stp A_l, A_h, [dst, 16]
|
|
ldp A_l, A_h, [srcend, -48]
|
|
stp B_l, B_h, [dst, 32]
|
|
ldp B_l, B_h, [srcend, -32]
|
|
stp C_l, C_h, [dst, 48]
|
|
ldp C_l, C_h, [srcend, -16]
|
|
stp D_l, D_h, [dst, 64]
|
|
/* Final 64 bytes, addressed from the end of the destination.  */
stp E_l, E_h, [dstend, -64]
|
|
stp A_l, A_h, [dstend, -48]
|
|
stp B_l, B_h, [dstend, -32]
|
|
stp C_l, C_h, [dstend, -16]
|
|
ret
|
|
|
|
.p2align 4
|
|
/* Backward copy of more than 128 bytes.  Only reached from the b.lo in
   MEMMOVE, i.e. when the destination starts inside the source buffer.
   On entry tmp1 = dstin - src; srcend and dstend are not set yet.  */
L(move_long):
|
|
/* dstin == src: nothing to copy.  */
cbz tmp1, 3f
|
|
|
|
add srcend, src, count
|
|
add dstend, dstin, count
|
|
|
|
/* Align dstend to 16 byte alignment so that we don't cross cache line
|
|
boundaries on both loads and stores. There are more than 128 bytes
|
|
to copy, so copy 16 bytes unaligned and then align. The loop
|
|
copies 64 bytes per iteration and loads one iteration ahead. */
|
|
|
|
/* tmp1 = misalignment of dstend (the old tmp1 value is no longer
   needed).  */
and tmp1, dstend, 15
|
|
/* Last 16 source bytes, stored unaligned at dstend - 16 further down.  */
ldp D_l, D_h, [srcend, -16]
|
|
/* Move srcend back by the same amount as dstend (adjusted below), so
   that equal negative offsets keep referring to the same byte.  */
sub srcend, srcend, tmp1
|
|
sub count, count, tmp1
|
|
ldp A_l, A_h, [srcend, -16]
|
|
/* dstend is still unadjusted here: this writes the true last 16 bytes.  */
stp D_l, D_h, [dstend, -16]
|
|
ldp B_l, B_h, [srcend, -32]
|
|
ldp C_l, C_h, [srcend, -48]
|
|
/* Pre-indexed: srcend -= 64, then load from the new srcend.  */
ldp D_l, D_h, [srcend, -64]!
|
|
/* dstend is now 16-byte aligned.  */
sub dstend, dstend, tmp1
|
|
/* 64 for the data now held in A..D and 64 for the block that the tail
   at 2: always copies from the start.  */
subs count, count, 128
|
|
/* Taken when at most 64 bytes remain beyond those held in A..D.  */
b.ls 2f
|
|
|
|
/* NOTE(review): presumably padding that places the loop label below at
   a preferred address - confirm.  */
nop
|
|
/* Each iteration stores the 64 bytes held in A..D and reloads A..D with
   the 64 bytes below them; the pre-indexed D accesses move dstend and
   srcend downwards.  */
1:
|
|
stp A_l, A_h, [dstend, -16]
|
|
ldp A_l, A_h, [srcend, -16]
|
|
stp B_l, B_h, [dstend, -32]
|
|
ldp B_l, B_h, [srcend, -32]
|
|
stp C_l, C_h, [dstend, -48]
|
|
ldp C_l, C_h, [srcend, -48]
|
|
stp D_l, D_h, [dstend, -64]!
|
|
ldp D_l, D_h, [srcend, -64]!
|
|
subs count, count, 64
|
|
b.hi 1b
|
|
|
|
/* Write the last full set of 64 bytes. The remainder is at most 64
|
|
bytes, so it is safe to always copy 64 bytes from the start even if
|
|
there is just 1 byte left. */
|
|
/* G is count/dst; count is not needed after the loop.  The loads from
   src are interleaved with the stores of the pending A..D data.
   NOTE(review): the order looks chosen so that no load reads bytes an
   earlier store has overwritten - confirm before reordering.  */
2:
|
|
ldp G_l, G_h, [src, 48]
|
|
stp A_l, A_h, [dstend, -16]
|
|
ldp A_l, A_h, [src, 32]
|
|
stp B_l, B_h, [dstend, -32]
|
|
ldp B_l, B_h, [src, 16]
|
|
stp C_l, C_h, [dstend, -48]
|
|
ldp C_l, C_h, [src]
|
|
stp D_l, D_h, [dstend, -64]
|
|
/* First 64 bytes, addressed from the start of the destination.  */
stp G_l, G_h, [dstin, 48]
|
|
stp A_l, A_h, [dstin, 32]
|
|
stp B_l, B_h, [dstin, 16]
|
|
stp C_l, C_h, [dstin]
|
|
3: ret
|
|
|
|
/* L(move_long) above is part of the region closed here, although it is
   only entered from MEMMOVE.  */
END (MEMCPY)
|
|
libc_hidden_builtin_def (MEMCPY)
|