glibc/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S

Add x86-64 memmove with unaligned load/store and rep movsb

Implement x86-64 memmove with unaligned load/store and rep movsb.
Support 16-byte, 32-byte and 64-byte vector register sizes.  When size
is at most 8 times the vector register size, there is no check for
address overlap between source and destination.  Since the overhead of
the overlap check is small when size exceeds 8 times the vector
register size, memcpy is an alias of memmove.

A single file provides 2 implementations of memmove, one with rep movsb
and the other without.  They share the same code when size is between
2 times the vector register size and REP_MOVSB_THRESHOLD, which is 2KB
for the 16-byte vector register size and scaled up for larger vector
register sizes.

Key features:
1. Use overlapping load and store to avoid branches.
2. For size <= 8 times the vector register size, load all sources into
   registers and store them together.
3. If there is no address overlap between source and destination, copy
   from both ends, 4 times the vector register size at a time.
4. If the destination address > the source address, copy backward
   8 times the vector register size at a time.
5. Otherwise, copy forward 8 times the vector register size at a time.
6. Use rep movsb only for forward copy.  Avoid slow backward rep movsb
   by falling back to the backward copy of 8 times the vector register
   size at a time.
7. Skip the copy when the destination address == the source address.

[BZ #19776]
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memmove-sse2-unaligned-erms, memmove-avx-unaligned-erms and
memmove-avx512-unaligned-erms.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test __memmove_chk_avx512_unaligned_2,
__memmove_chk_avx512_unaligned_erms, __memmove_chk_avx_unaligned_2,
__memmove_chk_avx_unaligned_erms, __memmove_chk_sse2_unaligned_2,
__memmove_chk_sse2_unaligned_erms, __memmove_avx_unaligned_2,
__memmove_avx_unaligned_erms, __memmove_avx512_unaligned_2,
__memmove_avx512_unaligned_erms, __memmove_erms,
__memmove_sse2_unaligned_2, __memmove_sse2_unaligned_erms,
__memcpy_chk_avx512_unaligned_2, __memcpy_chk_avx512_unaligned_erms,
__memcpy_chk_avx_unaligned_2, __memcpy_chk_avx_unaligned_erms,
__memcpy_chk_sse2_unaligned_2, __memcpy_chk_sse2_unaligned_erms,
__memcpy_avx_unaligned_2, __memcpy_avx_unaligned_erms,
__memcpy_avx512_unaligned_2, __memcpy_avx512_unaligned_erms,
__memcpy_sse2_unaligned_2, __memcpy_sse2_unaligned_erms,
__memcpy_erms, __mempcpy_chk_avx512_unaligned_2,
__mempcpy_chk_avx512_unaligned_erms, __mempcpy_chk_avx_unaligned_2,
__mempcpy_chk_avx_unaligned_erms, __mempcpy_chk_sse2_unaligned_2,
__mempcpy_chk_sse2_unaligned_erms, __mempcpy_avx512_unaligned_2,
__mempcpy_avx512_unaligned_erms, __mempcpy_avx_unaligned_2,
__mempcpy_avx_unaligned_erms, __mempcpy_sse2_unaligned_2,
__mempcpy_sse2_unaligned_erms and __mempcpy_erms.
* sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S: New file.
* sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S: Likewise.
* sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S: Likewise.
* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S: Likewise.
2016-03-31 17:04:26 +00:00
/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
Copyright (C) 2016 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
/* memmove/memcpy/mempcpy is implemented as:
1. Use overlapping load and store to avoid branch.
2. Use 8-bit or 32-bit displacements for branches and nop paddings
to avoid long nop between instructions.
3. Load all sources into registers and store them together to avoid
possible address overlap between source and destination.
4. If size is 2 * VEC_SIZE or less, load all sources into registers
and store them together.
5. If there is no address overlap, copy from both ends with
4 * VEC_SIZE at a time.
6. If size is 8 * VEC_SIZE or less, load all sources into registers
and store them together.
7. If address of destination > address of source, backward copy
8 * VEC_SIZE at a time.
8. Otherwise, forward copy 8 * VEC_SIZE at a time. */
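/* As a rough illustration only (not assembled into the build), the
   branchless "overlapping load and store" used when size is between
   VEC_SIZE and 2 * VEC_SIZE can be sketched in C.  The 16-byte vector
   size and the helper name are assumptions made for the sketch:

     #include <stddef.h>
     #include <string.h>

     enum { VEC_SKETCH = 16 };	// stands in for VEC_SIZE

     // Copy N bytes, VEC_SKETCH <= N <= 2 * VEC_SKETCH, possibly
     // overlapping.
     static void
     copy_vec_to_2x_vec (unsigned char *dst, const unsigned char *src,
                         size_t n)
     {
       unsigned char head[VEC_SKETCH], tail[VEC_SKETCH];
       // Both loads complete before either store, so the result is
       // correct even when the regions overlap, and no branch on the
       // exact size is needed.
       memcpy (head, src, sizeof head);
       memcpy (tail, src + n - sizeof tail, sizeof tail);
       memcpy (dst, head, sizeof head);
       memcpy (dst + n - sizeof tail, tail, sizeof tail);
     }
   */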
#if IS_IN (libc)
# include <sysdep.h>
# include "asm-syntax.h"
# ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
# else
# define VZEROUPPER
# endif
# endif
/* Threshold to use Enhanced REP MOVSB. Since there is overhead to set
up REP MOVSB operation, REP MOVSB isn't faster on short data. The
memcpy micro benchmark in glibc shows that 2KB is the approximate
value above which REP MOVSB becomes faster than SSE2 optimization
on processors with Enhanced REP MOVSB. Since larger register size
can move more data with a single load and store, the threshold is
higher with larger register size. */
# ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
# endif
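/* With the default definition above, the threshold works out to
   2048 bytes for 16-byte vectors (2048 * 16 / 16), 4096 bytes for
   32-byte vectors (2048 * 32 / 16) and 8192 bytes for 64-byte
   vectors (2048 * 64 / 16).  */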
# ifndef SECTION
# error SECTION is not defined!
# endif
.section SECTION(.text),"ax",@progbits
# ifdef SHARED
ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
movq %rdi, %rax
addq %rdx, %rax
jmp L(start)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
# endif
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_2))
movq %rdi, %rax
L(start):
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
cmpq $(VEC_SIZE * 2), %rdx
ja L(more_2x_vec)
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU (%rsi), %VEC(0)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
VZEROUPPER
ret
END (MEMMOVE_SYMBOL (__memmove, unaligned_2))
# ifdef SHARED
ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif
ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
movq %rdi, %rax
addq %rdx, %rax
jmp L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
# ifdef SHARED
ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
# endif
# if VEC_SIZE == 16
/* Only used to measure performance of REP MOVSB. */
# ifdef SHARED
ENTRY (__mempcpy_erms)
movq %rdi, %rax
addq %rdx, %rax
jmp L(movsb)
END (__mempcpy_erms)
# endif
ENTRY (__memmove_erms)
movq %rdi, %rax
movq %rdx, %rcx
cmpq %rsi, %rdi
jbe 1f
leaq (%rsi,%rcx), %rdx
cmpq %rdx, %rdi
jb L(movsb_backward)
1:
rep movsb
ret
L(movsb_backward):
leaq -1(%rdi,%rcx), %rdi
leaq -1(%rsi,%rcx), %rsi
std
rep movsb
cld
ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
# endif
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
movq %rdi, %rax
L(start_erms):
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
cmpq $(VEC_SIZE * 2), %rdx
ja L(movsb_more_2x_vec)
L(last_2x_vec):
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU (%rsi), %VEC(0)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
VZEROUPPER
ret
L(movsb):
cmpq %rsi, %rdi
je L(nop)
jb 1f
leaq (%rsi,%rdx), %r9
cmpq %r9, %rdi
/* Avoid slow backward REP MOVSB. */
# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
# error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
# endif
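/* Falling back to the backward copy is safe here: the check above
   guarantees REP_MOVSB_THRESHOLD > 8 * VEC_SIZE, and the regular
   entry points reach L(movsb) only when size is above
   REP_MOVSB_THRESHOLD.  */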
jb L(more_8x_vec_backward)
1:
movq %rdx, %rcx
rep movsb
L(nop):
ret
.p2align 4
L(movsb_more_2x_vec):
cmpq $REP_MOVSB_THRESHOLD, %rdx
/* Force 32-bit displacement to avoid long nop between
instructions. */
ja.d32 L(movsb)
.p2align 4
L(more_2x_vec):
/* More than 2 * VEC. */
cmpq %rsi, %rdi
je L(nop)
jb L(copy_forward)
leaq (%rsi,%rdx), %rcx
cmpq %rcx, %rdi
jb L(more_2x_vec_overlap)
L(copy_forward):
leaq (%rdi,%rdx), %rcx
cmpq %rcx, %rsi
jb L(more_2x_vec_overlap)
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), VEC_SIZE(%rdi)
VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
cmpq $(VEC_SIZE * 4), %rdx
/* Force 32-bit displacement to avoid long nop between
instructions. */
jbe.d32 L(return)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(0)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1)
VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(2)
VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(3)
VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(1), (VEC_SIZE * 3)(%rdi)
VMOVU %VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
VMOVU %VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
cmpq $(VEC_SIZE * 8), %rdx
# if VEC_SIZE == 16
jbe L(return)
# else
/* Use 8-bit displacement to avoid long nop between
instructions. */
jbe L(return_disp8)
# endif
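/* The first and last 4 * VEC_SIZE bytes have already been copied
   above.  Copy the remaining 4 * VEC_SIZE-aligned destination blocks
   with aligned stores: RCX becomes the first 4 * VEC_SIZE-aligned
   address above RDI, RDX the last one not past the end, and R10, R9,
   R8 and R11 the source offsets relative to the destination
   pointer.  */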
leaq (VEC_SIZE * 4)(%rdi), %rcx
addq %rdi, %rdx
andq $-(VEC_SIZE * 4), %rdx
andq $-(VEC_SIZE * 4), %rcx
movq %rcx, %r11
subq %rdi, %r11
addq %r11, %rsi
cmpq %rdx, %rcx
/* Use 8-bit displacement to avoid long nop between
instructions. */
je L(return_disp8)
movq %rsi, %r10
subq %rcx, %r10
leaq VEC_SIZE(%r10), %r9
leaq (VEC_SIZE * 2)(%r10), %r8
leaq (VEC_SIZE * 3)(%r10), %r11
.p2align 4
L(loop):
VMOVU (%rcx,%r10), %VEC(0)
VMOVU (%rcx,%r9), %VEC(1)
VMOVU (%rcx,%r8), %VEC(2)
VMOVU (%rcx,%r11), %VEC(3)
VMOVA %VEC(0), (%rcx)
VMOVA %VEC(1), VEC_SIZE(%rcx)
VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx)
VMOVA %VEC(3), (VEC_SIZE * 3)(%rcx)
addq $(VEC_SIZE * 4), %rcx
cmpq %rcx, %rdx
jne L(loop)
L(return_disp8):
VZEROUPPER
ret
L(less_vec):
/* Less than 1 VEC. */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
# endif
# if VEC_SIZE > 32
cmpb $32, %dl
jae L(between_32_63)
# endif
# if VEC_SIZE > 16
cmpb $16, %dl
jae L(between_16_31)
# endif
cmpb $8, %dl
jae L(between_8_15)
cmpb $4, %dl
jae L(between_4_7)
cmpb $1, %dl
ja L(between_2_3)
jb 1f
movzbl (%rsi), %ecx
movb %cl, (%rdi)
1:
ret
# if VEC_SIZE > 32
L(between_32_63):
/* From 32 to 63. No branch when size == 32. */
vmovdqu (%rsi), %ymm0
vmovdqu -32(%rsi,%rdx), %ymm1
vmovdqu %ymm0, (%rdi)
vmovdqu %ymm1, -32(%rdi,%rdx)
VZEROUPPER
ret
# endif
# if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
L(between_16_31):
vmovdqu (%rsi), %xmm0
vmovdqu -16(%rsi,%rdx), %xmm1
vmovdqu %xmm0, (%rdi)
vmovdqu %xmm1, -16(%rdi,%rdx)
ret
# endif
L(between_8_15):
/* From 8 to 15. No branch when size == 8. */
movq -8(%rsi,%rdx), %rcx
movq (%rsi), %rsi
movq %rcx, -8(%rdi,%rdx)
movq %rsi, (%rdi)
ret
L(between_4_7):
/* From 4 to 7. No branch when size == 4. */
movl -4(%rsi,%rdx), %ecx
movl (%rsi), %esi
movl %ecx, -4(%rdi,%rdx)
movl %esi, (%rdi)
ret
L(between_2_3):
/* From 2 to 3. No branch when size == 2. */
movzwl -2(%rsi,%rdx), %ecx
movzwl (%rsi), %esi
movw %cx, -2(%rdi,%rdx)
movw %si, (%rdi)
ret
# if VEC_SIZE > 16
/* Align to 16 bytes to avoid long nop between instructions. */
.p2align 4
# endif
L(more_2x_vec_overlap):
/* More than 2 * VEC and there is overlap between destination
and source. */
cmpq $(VEC_SIZE * 8), %rdx
ja L(more_8x_vec)
cmpq $(VEC_SIZE * 4), %rdx
jb L(last_4x_vec)
L(between_4x_vec_and_8x_vec):
/* Copy from 4 * VEC to 8 * VEC, inclusively. */
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)
VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), VEC_SIZE(%rdi)
VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
VZEROUPPER
ret
L(last_4x_vec):
/* Copy from 2 * VEC to 4 * VEC. */
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), VEC_SIZE(%rdi)
VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
VZEROUPPER
ret
L(between_0_and_4x_vec):
/* Copy from 0 to 4 * VEC. */
cmpl $(VEC_SIZE * 2), %edx
jae L(last_4x_vec)
/* Copy from 0 to 2 * VEC. */
cmpl $VEC_SIZE, %edx
jae L(last_2x_vec)
/* Copy from 0 to VEC. */
VZEROUPPER
jmp L(less_vec)
L(more_8x_vec):
cmpq %rsi, %rdi
ja L(more_8x_vec_backward)
.p2align 4
L(loop_8x_vec_forward):
/* Copy 8 * VEC at a time forward. */
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
VMOVU (VEC_SIZE * 4)(%rsi), %VEC(4)
VMOVU (VEC_SIZE * 5)(%rsi), %VEC(5)
VMOVU (VEC_SIZE * 6)(%rsi), %VEC(6)
VMOVU (VEC_SIZE * 7)(%rsi), %VEC(7)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), VEC_SIZE(%rdi)
VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
VMOVU %VEC(4), (VEC_SIZE * 4)(%rdi)
VMOVU %VEC(5), (VEC_SIZE * 5)(%rdi)
VMOVU %VEC(6), (VEC_SIZE * 6)(%rdi)
VMOVU %VEC(7), (VEC_SIZE * 7)(%rdi)
addq $(VEC_SIZE * 8), %rdi
addq $(VEC_SIZE * 8), %rsi
subq $(VEC_SIZE * 8), %rdx
cmpq $(VEC_SIZE * 8), %rdx
je L(between_4x_vec_and_8x_vec)
ja L(loop_8x_vec_forward)
/* Less than 8 * VEC to copy. */
cmpq $(VEC_SIZE * 4), %rdx
jb L(between_0_and_4x_vec)
jmp L(between_4x_vec_and_8x_vec)
.p2align 4
L(more_8x_vec_backward):
leaq -VEC_SIZE(%rsi, %rdx), %rcx
leaq -VEC_SIZE(%rdi, %rdx), %r9
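/* RCX and R9 point at the last VEC_SIZE-byte chunk of the source and
   destination respectively; each iteration copies 8 * VEC_SIZE bytes
   moving toward lower addresses.  */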
.p2align 4
L(loop_8x_vec_backward):
/* Copy 8 * VEC at a time backward. */
VMOVU (%rcx), %VEC(0)
VMOVU -VEC_SIZE(%rcx), %VEC(1)
VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
VMOVU -(VEC_SIZE * 4)(%rcx), %VEC(4)
VMOVU -(VEC_SIZE * 5)(%rcx), %VEC(5)
VMOVU -(VEC_SIZE * 6)(%rcx), %VEC(6)
VMOVU -(VEC_SIZE * 7)(%rcx), %VEC(7)
VMOVU %VEC(0), (%r9)
VMOVU %VEC(1), -VEC_SIZE(%r9)
VMOVU %VEC(2), -(VEC_SIZE * 2)(%r9)
VMOVU %VEC(3), -(VEC_SIZE * 3)(%r9)
VMOVU %VEC(4), -(VEC_SIZE * 4)(%r9)
VMOVU %VEC(5), -(VEC_SIZE * 5)(%r9)
VMOVU %VEC(6), -(VEC_SIZE * 6)(%r9)
VMOVU %VEC(7), -(VEC_SIZE * 7)(%r9)
subq $(VEC_SIZE * 8), %rcx
subq $(VEC_SIZE * 8), %r9
subq $(VEC_SIZE * 8), %rdx
cmpq $(VEC_SIZE * 8), %rdx
je L(between_4x_vec_and_8x_vec)
ja L(loop_8x_vec_backward)
/* Less than 8 * VEC to copy. */
cmpq $(VEC_SIZE * 4), %rdx
jb L(between_0_and_4x_vec)
jmp L(between_4x_vec_and_8x_vec)
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
# ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2),
MEMMOVE_SYMBOL (__memcpy, unaligned_2))
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2),
MEMMOVE_SYMBOL (__memcpy_chk, unaligned_2))
# endif
#endif