/* Listing metadata (not part of the original source):
   glibc/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S
   381 lines, 8.9 KiB, labelled "ArmAsm" by the hosting page.  */

/* Optimized memmove_unaligned implementation using basic LoongArch instructions.
Copyright (C) 2023-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include <sys/regdef.h>
#include <sys/asm.h>
#if IS_IN (libc)
/* Symbol name under which the routine below is assembled; used by
   LEAF, END and libc_hidden_builtin_def further down.  */
# define MEMMOVE_NAME __memmove_unaligned
/* LD_64 (reg, n): load the 64 bytes at reg + n .. reg + n + 63 as eight
   doublewords into t0..t7 (t0 holds the lowest address).  REG is not
   modified; t0..t7 are overwritten.  Intended to be paired with ST_64,
   which stores the same eight registers.  */
# define LD_64(reg, n) \
ld.d t0, reg, n; \
ld.d t1, reg, n + 8; \
ld.d t2, reg, n + 16; \
ld.d t3, reg, n + 24; \
ld.d t4, reg, n + 32; \
ld.d t5, reg, n + 40; \
ld.d t6, reg, n + 48; \
ld.d t7, reg, n + 56;
/* ST_64 (reg, n): store t0..t7 as eight doublewords to the 64 bytes at
   reg + n .. reg + n + 63 (t0 goes to the lowest address).  Neither REG
   nor t0..t7 are modified.  Counterpart of LD_64.  */
# define ST_64(reg, n) \
st.d t0, reg, n; \
st.d t1, reg, n + 8; \
st.d t2, reg, n + 16; \
st.d t3, reg, n + 24; \
st.d t4, reg, n + 32; \
st.d t5, reg, n + 40; \
st.d t6, reg, n + 48; \
st.d t7, reg, n + 56;
/* memmove built from base integer loads/stores only.

   Register use, as established by the code below:
     a0  destination pointer on entry; return value on exit
     a1  source pointer
     a2  byte count (later: bytes still to copy)
     a3  a0 + a2, one past the end of the destination
     a4  a1 + a2, one past the end of the source
     t8  copy of the original a0, moved back into a0 before returning
	 from the >= 64 byte paths
     a5, a6, a7, t0..t7  scratch

   Strategy:
     - a0 == a1: nothing to do.
     - fewer than 64 bytes: copy a block anchored at the start and a block
       anchored at the end (they may overlap in the middle).  Every load
       is issued before the first store, so overlapping buffers are safe.
     - 64 bytes or more: pick a direction, copy 1..7 bytes to bring the
       destination to an 8-byte boundary, run a 128-bytes-per-iteration
       loop, then finish with up to 15 doublewords and up to 7 bytes.
       Only the destination is aligned; source accesses may be unaligned.

   The head/tail sequences are entered through computed jumps
   (pcaddi + jr).  The pcaddi immediates (18, 36) are instruction counts
   from the pcaddi to the end of the following table, so NO instruction
   may be added or removed between a pcaddi and the end of its table
   without adjusting the immediate.  */
LEAF(MEMMOVE_NAME, 3)
/* a4 = end of source, a3 = end of destination.  */
add.d a4, a1, a2
add.d a3, a0, a2
/* Same buffer: return immediately with a0 untouched.  */
beq a1, a0, L(less_1bytes)
move t8, a0
/* Dispatch on size class: <16, >=64, 16..31, otherwise 32..63.  */
srai.d a6, a2, 4
beqz a6, L(less_16bytes)
srai.d a6, a2, 6
bnez a6, L(more_64bytes)
srai.d a6, a2, 5
beqz a6, L(less_32bytes)
/* 32..63 bytes: first 32 bytes plus last 32 bytes, loads before stores.  */
ld.d t0, a1, 0
ld.d t1, a1, 8
ld.d t2, a1, 16
ld.d t3, a1, 24
ld.d t4, a4, -32
ld.d t5, a4, -24
ld.d t6, a4, -16
ld.d t7, a4, -8
st.d t0, a0, 0
st.d t1, a0, 8
st.d t2, a0, 16
st.d t3, a0, 24
st.d t4, a3, -32
st.d t5, a3, -24
st.d t6, a3, -16
st.d t7, a3, -8
jr ra
/* 16..31 bytes: first 16 bytes plus last 16 bytes.  */
L(less_32bytes):
ld.d t0, a1, 0
ld.d t1, a1, 8
ld.d t2, a4, -16
ld.d t3, a4, -8
st.d t0, a0, 0
st.d t1, a0, 8
st.d t2, a3, -16
st.d t3, a3, -8
jr ra
/* 8..15 bytes: first and last doubleword.  */
L(less_16bytes):
srai.d a6, a2, 3
beqz a6, L(less_8bytes)
ld.d t0, a1, 0
ld.d t1, a4, -8
st.d t0, a0, 0
st.d t1, a3, -8
jr ra
/* 4..7 bytes: first and last word.  */
L(less_8bytes):
srai.d a6, a2, 2
beqz a6, L(less_4bytes)
ld.w t0, a1, 0
ld.w t1, a4, -4
st.w t0, a0, 0
st.w t1, a3, -4
jr ra
/* 2..3 bytes: first and last halfword.  */
L(less_4bytes):
srai.d a6, a2, 1
beqz a6, L(less_2bytes)
ld.h t0, a1, 0
ld.h t1, a4, -2
st.h t0, a0, 0
st.h t1, a3, -2
jr ra
/* 0 or 1 byte.  */
L(less_2bytes):
beqz a2, L(less_1bytes)
ld.b t0, a1, 0
st.b t0, a0, 0
jr ra
/* Nothing to copy (count is 0, or source and destination coincide).  */
L(less_1bytes):
jr ra
/* >= 64 bytes.  a7 = dst - src; if that is (unsigned) below the count,
   the destination starts inside the source range and a forward copy
   would overwrite source bytes before they are read, so copy backward.
   When dst < src the subtraction wraps to a large unsigned value and
   the forward path is taken.  */
L(more_64bytes):
sub.d a7, a0, a1
bltu a7, a2, L(copy_backward)
L(copy_forward):
/* Round a0 down to a multiple of 8; unchanged means already aligned.  */
srli.d a0, a0, 3
slli.d a0, a0, 3
beq a0, t8, L(all_align)
/* a0 = next 8-byte boundary; a7 = -(number of head bytes), in -7..-1.
   Move a1 forward by the same amount and reduce the remaining count.  */
addi.d a0, a0, 0x8
sub.d a7, t8, a0
sub.d a1, a1, a7
add.d a2, a7, a2
L(start_unalign_proc):
/* t1 = address of L(start_over) (18 instructions after the pcaddi).
   Each byte copy below is 2 instructions = 8 bytes of code, so adding
   a7 * 8 (negative) enters the table exactly -a7 copies before its end.  */
pcaddi t1, 18
slli.d a6, a7, 3
add.d t1, t1, a6
jr t1
ld.b t0, a1, -7
st.b t0, a0, -7
ld.b t0, a1, -6
st.b t0, a0, -6
ld.b t0, a1, -5
st.b t0, a0, -5
ld.b t0, a1, -4
st.b t0, a0, -4
ld.b t0, a1, -3
st.b t0, a0, -3
ld.b t0, a1, -2
st.b t0, a0, -2
ld.b t0, a1, -1
st.b t0, a0, -1
/* a0 is 8-byte aligned from here on.  Bias a2 by -128 so its sign tells
   whether a full 128-byte block is still available.  */
L(start_over):
addi.d a2, a2, -0x80
blt a2, zero, L(end_unalign_proc)
/* Main forward loop: 128 bytes per iteration in two 64-byte halves,
   each half fully loaded before it is stored.  */
L(loop_less):
LD_64(a1, 0)
ST_64(a0, 0)
LD_64(a1, 64)
ST_64(a0, 64)
addi.d a0, a0, 0x80
addi.d a1, a1, 0x80
addi.d a2, a2, -0x80
bge a2, zero, L(loop_less)
L(end_unalign_proc):
/* Undo the bias: a2 = bytes left (0..127).  t2 = a2 & 0x78 is the part
   made of whole doublewords.  Advance both pointers past that part and
   enter the table below t2 bytes of code before its end: one ld.d/st.d
   pair is 8 bytes of code and copies 8 bytes of data, so t2 serves as
   both the data length and the code offset.  The pcaddi target (36
   instructions ahead) is the "andi a2, a2, 0x7" after the table.  */
addi.d a2, a2, 0x80
pcaddi t1, 36
andi t2, a2, 0x78
add.d a1, a1, t2
add.d a0, a0, t2
sub.d t1, t1, t2
jr t1
ld.d t0, a1, -120
st.d t0, a0, -120
ld.d t0, a1, -112
st.d t0, a0, -112
ld.d t0, a1, -104
st.d t0, a0, -104
ld.d t0, a1, -96
st.d t0, a0, -96
ld.d t0, a1, -88
st.d t0, a0, -88
ld.d t0, a1, -80
st.d t0, a0, -80
ld.d t0, a1, -72
st.d t0, a0, -72
ld.d t0, a1, -64
st.d t0, a0, -64
ld.d t0, a1, -56
st.d t0, a0, -56
ld.d t0, a1, -48
st.d t0, a0, -48
ld.d t0, a1, -40
st.d t0, a0, -40
ld.d t0, a1, -32
st.d t0, a0, -32
ld.d t0, a1, -24
st.d t0, a0, -24
ld.d t0, a1, -16
st.d t0, a0, -16
ld.d t0, a1, -8
st.d t0, a0, -8
/* Last 0..7 bytes, addressed from the end pointers a4/a3 computed at
   entry.  t1 = L(end) - 8 * count selects how many byte copies run.  */
andi a2, a2, 0x7
pcaddi t1, 18
slli.d a2, a2, 3
sub.d t1, t1, a2
jr t1
ld.b t0, a4, -7
st.b t0, a3, -7
ld.b t0, a4, -6
st.b t0, a3, -6
ld.b t0, a4, -5
st.b t0, a3, -5
ld.b t0, a4, -4
st.b t0, a3, -4
ld.b t0, a4, -3
st.b t0, a3, -3
ld.b t0, a4, -2
st.b t0, a3, -2
ld.b t0, a4, -1
st.b t0, a3, -1
/* Forward path done: return the original destination pointer.  */
L(end):
move a0, t8
jr ra
/* Forward copy with an already aligned destination: copy one doubleword,
   advance both pointers by 8, and join the common path.  */
L(all_align):
addi.d a1, a1, 0x8
addi.d a0, a0, 0x8
ld.d t0, a1, -8
st.d t0, a0, -8
addi.d a2, a2, -8
b L(start_over)
/* Backward copy with an already aligned destination end: copy the last
   doubleword, step both end pointers back by 8, and join the common
   backward path.  */
L(all_align_back):
addi.d a4, a4, -0x8
addi.d a3, a3, -0x8
ld.d t0, a4, 0
st.d t0, a3, 0
addi.d a2, a2, -8
b L(start_over_back)
/* Backward copy: works down from the end pointers a3 (destination) and
   a4 (source).  a0 and a1 are not modified on this path, so they still
   point at the start of the buffers for the final head bytes.  */
L(copy_backward):
/* Round the destination end down to a multiple of 8.  */
move a5, a3
srli.d a3, a3, 3
slli.d a3, a3, 3
beq a3, a5, L(all_align_back)
/* a7 = -(number of trailing bytes above the boundary), in -7..-1.
   Pull a4 back by the same amount and reduce the remaining count.  */
sub.d a7, a3, a5
add.d a4, a4, a7
add.d a2, a7, a2
/* Computed jump as in the forward head: t1 = L(start_over_back), then
   step back -a7 byte copies (8 bytes of code each).  */
pcaddi t1, 18
slli.d a6, a7, 3
add.d t1, t1, a6
jr t1
ld.b t0, a4, 6
st.b t0, a3, 6
ld.b t0, a4, 5
st.b t0, a3, 5
ld.b t0, a4, 4
st.b t0, a3, 4
ld.b t0, a4, 3
st.b t0, a3, 3
ld.b t0, a4, 2
st.b t0, a3, 2
ld.b t0, a4, 1
st.b t0, a3, 1
ld.b t0, a4, 0
st.b t0, a3, 0
/* a3 is 8-byte aligned from here on.  Same -128 bias as the forward
   path.  */
L(start_over_back):
addi.d a2, a2, -0x80
blt a2, zero, L(end_unalign_proc_back)
/* Main backward loop: 128 bytes per iteration, upper 64-byte half
   first, each half fully loaded before it is stored.  */
L(loop_less_back):
LD_64(a4, -64)
ST_64(a3, -64)
LD_64(a4, -128)
ST_64(a3, -128)
addi.d a4, a4, -0x80
addi.d a3, a3, -0x80
addi.d a2, a2, -0x80
bge a2, zero, L(loop_less_back)
L(end_unalign_proc_back):
/* a2 = bytes left (0..127); t2 = whole-doubleword part.  Move both end
   pointers down by t2 and enter the table t2 bytes of code before its
   end, so the doublewords at offsets t2 - 8 .. 0 are copied from the
   highest address down.  */
addi.d a2, a2, 0x80
pcaddi t1, 36
andi t2, a2, 0x78
sub.d a4, a4, t2
sub.d a3, a3, t2
sub.d t1, t1, t2
jr t1
ld.d t0, a4, 112
st.d t0, a3, 112
ld.d t0, a4, 104
st.d t0, a3, 104
ld.d t0, a4, 96
st.d t0, a3, 96
ld.d t0, a4, 88
st.d t0, a3, 88
ld.d t0, a4, 80
st.d t0, a3, 80
ld.d t0, a4, 72
st.d t0, a3, 72
ld.d t0, a4, 64
st.d t0, a3, 64
ld.d t0, a4, 56
st.d t0, a3, 56
ld.d t0, a4, 48
st.d t0, a3, 48
ld.d t0, a4, 40
st.d t0, a3, 40
ld.d t0, a4, 32
st.d t0, a3, 32
ld.d t0, a4, 24
st.d t0, a3, 24
ld.d t0, a4, 16
st.d t0, a3, 16
ld.d t0, a4, 8
st.d t0, a3, 8
ld.d t0, a4, 0
st.d t0, a3, 0
/* First 0..7 bytes of the buffers, addressed from the start pointers
   a1/a0 and copied from the highest offset down.  The jump target is
   the "move a0, t8" below minus 8 * count.  */
andi a2, a2, 0x7
pcaddi t1, 18
slli.d a2, a2, 3
sub.d t1, t1, a2
jr t1
ld.b t0, a1, 6
st.b t0, a0, 6
ld.b t0, a1, 5
st.b t0, a0, 5
ld.b t0, a1, 4
st.b t0, a0, 4
ld.b t0, a1, 3
st.b t0, a0, 3
ld.b t0, a1, 2
st.b t0, a0, 2
ld.b t0, a1, 1
st.b t0, a0, 1
ld.b t0, a1, 0
st.b t0, a0, 0
/* Backward path done: return the original destination pointer.  */
move a0, t8
jr ra
END(MEMMOVE_NAME)
/* NOTE(review): libc_hidden_builtin_def is defined outside this file;
   presumably it provides the libc-internal hidden definition/alias for
   MEMMOVE_NAME -- confirm against the glibc headers.  */
libc_hidden_builtin_def (MEMMOVE_NAME)
#endif