2149da3683
The memcpy optimization (commit 587a1290a1) has a series of mistakes:
- The implementation is incorrect: the chunk size calculation is wrong,
leading to invalid memory accesses.
- It adds ifunc support by default, so --disable-multi-arch does
not work as expected for riscv.
- It mixes Linux files (memcpy ifunc selection which requires the
vDSO/syscall mechanism) with generic support (the memcpy
optimization itself).
- There is no __libc_ifunc_impl_list, which makes testing check only
the selected implementation instead of all the implementations
supported by the system.
This patch also simplifies the required bits to enable ifunc: there
is no need for memcopy.h, nor for Linux-specific files.
The __memcpy_noalignment tail handling now uses a branchless strategy
similar to aarch64 (overlapping 32-bit copies for sizes 4..7 and byte
copies for sizes 1..3).
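As a rough illustration, the tail strategy described above corresponds to the
minimal C sketch below (the helper name copy_tail is hypothetical and not part
of the patch; memcpy stands in for the unaligned 32-bit accesses the assembly
issues directly, which is safe there because this variant is only selected
when the CPU reports RISCV_HWPROBE_MISALIGNED_FAST):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Branchless-style tail for LEN < 8 (the riscv64 case, SZREG == 8): two
   possibly overlapping 32-bit copies cover 4..7 bytes, and first/middle/last
   byte stores cover 1..3 bytes (some of them alias for lengths 1 and 2).  */
void
copy_tail (unsigned char *dst, const unsigned char *src, size_t len)
{
  if (len >= 4)
    {
      uint32_t head, tail;
      memcpy (&head, src, 4);
      memcpy (&tail, src + len - 4, 4);
      memcpy (dst, &head, 4);
      memcpy (dst + len - 4, &tail, 4);
    }
  else if (len > 0)
    {
      dst[0] = src[0];
      dst[len - 1] = src[len - 1];
      dst[len / 2] = src[len / 2];
    }
}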
Checked on riscv64 and riscv32 by explicitly enabling the function
in __libc_ifunc_impl_list on qemu-system.
Changes from v1:
* Implement the memcpy in assembly to correctly handle RISC-V
strict-alignment.
Reviewed-by: Evan Green <evan@rivosinc.com>
Acked-by: Palmer Dabbelt <palmer@rivosinc.com>
/* memcpy for RISC-V, ignoring buffer alignment
   Copyright (C) 2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <sys/asm.h>
/* memcpy optimization for CPUs with fast unaligned support
   (RISCV_HWPROBE_MISALIGNED_FAST).

   Copies are split into 3 main cases: small copies of up to SZREG bytes,
   copies of up to BLOCK_SIZE bytes (128 for 64 bits, 64 for 32 bits), and
   copies larger than BLOCK_SIZE.

   Large copies use a software pipelined loop processing BLOCK_SIZE bytes per
   iteration.  The destination pointer is SZREG-byte aligned to minimize
   unaligned stores.

   The tail is handled with branchless copies.  */
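
/* SZREG (from <sys/asm.h>) is the register size in bytes, 8 on riscv64 and
   4 on riscv32, so BLOCK_SIZE is 128 or 64 bytes respectively.  */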
#define BLOCK_SIZE (16 * SZREG)

        .attribute unaligned_access, 1
ENTRY (__memcpy_noalignment)
        beq     a2, zero, L(ret)

        /* if LEN < SZREG jump to tail handling.  */
        li      a5, SZREG-1
        mv      a6, a0
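        /* The tail code stores through a6, leaving a0 untouched as
           memcpy's return value.  */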
        bleu    a2, a5, L(tail)

        /* Copy the first word, align DEST to word, and adjust DEST/SRC/LEN
           based on the amount adjusted to align DEST.  */
        REG_L   a3, 0(a1)
        andi    a5, a0, SZREG-1
        addi    a2, a2, -SZREG
        li      a4, SZREG
        sub     a4, a4, a5
        REG_S   a3, 0(a0)
        add     a2, a5, a2

        /* If LEN < BLOCK_SIZE jump to word copy.  */
        li      a3, BLOCK_SIZE-1
        add     a5, a0, a4
        add     a1, a1, a4
        bleu    a2, a3, L(word_copy_adjust)
        addi    a7, a2, -BLOCK_SIZE
        andi    a7, a7, -BLOCK_SIZE
        addi    a7, a7, BLOCK_SIZE
        add     a3, a5, a7
        mv      a4, a1
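        /* Copy BLOCK_SIZE bytes per iteration, software pipelined: eight
           loads, then the matching stores, then the next eight loads and
           stores, advancing SRC (a4) and DEST (a5) once per block.  */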
L(block_copy):
        REG_L   a6, 0(a4)
        REG_L   t0, SZREG(a4)
        REG_L   t1, (2*SZREG)(a4)
        REG_L   t2, (3*SZREG)(a4)
        REG_L   t3, (4*SZREG)(a4)
        REG_L   t4, (5*SZREG)(a4)
        REG_L   t5, (6*SZREG)(a4)
        REG_L   t6, (7*SZREG)(a4)
        REG_S   a6, 0(a5)
        REG_S   t0, SZREG(a5)
        REG_S   t1, (2*SZREG)(a5)
        REG_S   t2, (3*SZREG)(a5)
        REG_S   t3, (4*SZREG)(a5)
        REG_S   t4, (5*SZREG)(a5)
        REG_S   t5, (6*SZREG)(a5)
        REG_S   t6, (7*SZREG)(a5)
        REG_L   a6, (8*SZREG)(a4)
        REG_L   t0, (9*SZREG)(a4)
        REG_L   t1, (10*SZREG)(a4)
        REG_L   t2, (11*SZREG)(a4)
        REG_L   t3, (12*SZREG)(a4)
        REG_L   t4, (13*SZREG)(a4)
        REG_L   t5, (14*SZREG)(a4)
        REG_L   t6, (15*SZREG)(a4)
        addi    a4, a4, BLOCK_SIZE
        REG_S   a6, (8*SZREG)(a5)
        REG_S   t0, (9*SZREG)(a5)
        REG_S   t1, (10*SZREG)(a5)
        REG_S   t2, (11*SZREG)(a5)
        REG_S   t3, (12*SZREG)(a5)
        REG_S   t4, (13*SZREG)(a5)
        REG_S   t5, (14*SZREG)(a5)
        REG_S   t6, (15*SZREG)(a5)
        addi    a5, a5, BLOCK_SIZE
        bne     a5, a3, L(block_copy)
        add     a1, a1, a7
        andi    a2, a2, BLOCK_SIZE-1

        /* 0 <= a2/LEN < BLOCK_SIZE.  */
L(word_copy):
        li      a5, SZREG-1
        /* if LEN < SZREG jump to tail handling.  */
        bleu    a2, a5, L(tail_adjust)
        addi    a7, a2, -SZREG
        andi    a7, a7, -SZREG
        addi    a7, a7, SZREG
        add     a6, a3, a7
        mv      a5, a1
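        /* Copy SZREG bytes per iteration: SRC is read through a5 and DEST
           is written through a3 until a3 reaches a6 (DEST plus the
           remaining length rounded down to whole words).  */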
L(word_copy_loop):
        REG_L   a4, 0(a5)
        addi    a3, a3, SZREG
        addi    a5, a5, SZREG
        REG_S   a4, -SZREG(a3)
        bne     a3, a6, L(word_copy_loop)
        add     a1, a1, a7
        andi    a2, a2, SZREG-1

        /* Copy the last word unaligned.  */
        add     a3, a1, a2
        add     a4, a6, a2
        REG_L   t0, -SZREG(a3)
        REG_S   t0, -SZREG(a4)
        ret

L(tail):
        /* Copy 4-7 bytes.  */
        andi    a5, a2, 4
        add     a3, a1, a2
        add     a4, a6, a2
        beq     a5, zero, L(copy_0_3)
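        /* 4 <= LEN <= 7: copy the first and the last 4 bytes; the two
           32-bit accesses overlap when LEN < 8, which is harmless and
           avoids branching on the exact size.  */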
        lw      t0, 0(a1)
        lw      t1, -4(a3)
        sw      t0, 0(a6)
        sw      t1, -4(a4)
        ret

        /* Copy 0-3 bytes.  */
L(copy_0_3):
        beq     a2, zero, L(ret)
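        /* 1 <= LEN <= 3: store the first, middle (LEN/2), and last bytes;
           for lengths 1 and 2 some of these stores simply alias.  */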
        srli    a2, a2, 1
        add     t4, a1, a2
        add     t5, a6, a2
        lbu     t0, 0(a1)
        lbu     t1, -1(a3)
        lbu     t2, 0(t4)
        sb      t0, 0(a6)
        sb      t1, -1(a4)
        sb      t2, 0(t5)
L(ret):
        ret
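        /* Out-of-line stubs: move the expected DEST base into place and
           re-enter the shared tail/word-copy code.  */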
L(tail_adjust):
        mv      a6, a3
        j       L(tail)
L(word_copy_adjust):
        mv      a3, a5
        j       L(word_copy)
END (__memcpy_noalignment)