x86: Optimize memmove-vec-unaligned-erms.S

No bug.

The optimizations are as follows:

1) Always align entry to 64 bytes. This makes behavior more
   predictable and makes other frontend optimizations easier.

2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
   significant benefits in the case that:
        0 < (dst - src) < [256, 512]

3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
   improvement and for FSRM [-10%, 25%].

In addition to these primary changes there is general cleanup
throughout to optimize the aligning routines and control flow logic.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit a6b7502ec0)
This commit is contained in:
Noah Goldstein 2021-11-01 00:49:51 -05:00 committed by Sunil K Pandey
parent 2e64237a87
commit a7392db2ff
6 changed files with 383 additions and 226 deletions

View File

@ -25,7 +25,7 @@
/* Use movups and movaps for smaller code sizes. */
#define VMOVU movups
#define VMOVA movaps
#define MOV_SIZE 3
#define SECTION(p) p
#ifdef USE_MULTIARCH

View File

@ -4,7 +4,7 @@
# define VMOVNT vmovntdq
# define VMOVU vmovdqu
# define VMOVA vmovdqa
# define MOV_SIZE 4
# define ZERO_UPPER_VEC_REGISTERS_RETURN \
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST

View File

@ -4,7 +4,7 @@
# define VMOVNT vmovntdq
# define VMOVU vmovdqu
# define VMOVA vmovdqa
# define MOV_SIZE 4
# define SECTION(p) p##.avx
# define MEMMOVE_SYMBOL(p,s) p##_avx_##s

View File

@ -25,7 +25,7 @@
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
# define VZEROUPPER
# define MOV_SIZE 6
# define SECTION(p) p##.evex512
# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s

View File

@ -25,7 +25,7 @@
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
# define VZEROUPPER
# define MOV_SIZE 6
# define SECTION(p) p##.evex
# define MEMMOVE_SYMBOL(p,s) p##_evex_##s

View File

@ -76,6 +76,25 @@
# endif
#endif
/* Whether to align before `rep movsb`. The alignment target is
MOVSB_ALIGN_TO (64) bytes; with VEC_SIZE == 16 that would take
4x VEC of head loads/stores, which is not considered worth it, so
alignment is only enabled for VEC_SIZE > 16. */
#define ALIGN_MOVSB (VEC_SIZE > 16)
/* Number of bytes to align movsb to. */
#define MOVSB_ALIGN_TO 64
/* Classify MOV_SIZE, which is defined by the variant file that
includes this one (3, 4 and 6 in the variants touched by this
change). NOTE(review): MOV_SIZE is presumably the encoded length
in bytes of one VMOVU instruction -- confirm. */
#define SMALL_MOV_SIZE (MOV_SIZE <= 4)
#define LARGE_MOV_SIZE (MOV_SIZE > 4)
/* Sanity check: exactly one of the two classes must be selected. */
#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
# error MOV_SIZE Unknown
#endif
/* Bias used by the smallest-size cases in L(less_vec). With
LARGE_MOV_SIZE the length in rdx has had 4 subtracted before the
L(between_4_7) branch, so the later compare and the size-relative
displacements add SMALL_SIZE_OFFSET back. */
#if LARGE_MOV_SIZE
# define SMALL_SIZE_OFFSET (4)
#else
# define SMALL_SIZE_OFFSET (0)
#endif
/* Page size used to build the 4k-aliasing masks. Defaults to 4096
unless already defined. */
#ifndef PAGE_SIZE
# define PAGE_SIZE 4096
#endif
@ -199,25 +218,21 @@ L(start):
# endif
cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
/* Load regardless. */
VMOVU (%rsi), %VEC(0)
cmp $(VEC_SIZE * 2), %RDX_LP
ja L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU (%rsi), %VEC(0)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
ret
#if !(defined USE_MULTIARCH && IS_IN (libc))
ZERO_UPPER_VEC_REGISTERS_RETURN
#else
VZEROUPPER_RETURN
#endif
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))
# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
cmp %RDX_LP, %RCX_LP
@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
movq %rdi, %rax
L(start_erms):
# ifdef __ILP32__
@ -298,310 +313,448 @@ L(start_erms):
# endif
cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
/* Load regardless. */
VMOVU (%rsi), %VEC(0)
cmp $(VEC_SIZE * 2), %RDX_LP
ja L(movsb_more_2x_vec)
L(last_2x_vec):
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU (%rsi), %VEC(0)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
*/
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx)
L(return):
#if VEC_SIZE > 16
# if VEC_SIZE > 16
ZERO_UPPER_VEC_REGISTERS_RETURN
#else
ret
#endif
L(movsb):
cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
jae L(more_8x_vec)
cmpq %rsi, %rdi
jb 1f
/* Source == destination is less common. */
je L(nop)
leaq (%rsi,%rdx), %r9
cmpq %r9, %rdi
/* Avoid slow backward REP MOVSB. */
jb L(more_8x_vec_backward)
# if AVOID_SHORT_DISTANCE_REP_MOVSB
testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
jz 3f
movq %rdi, %rcx
subq %rsi, %rcx
jmp 2f
# endif
1:
# if AVOID_SHORT_DISTANCE_REP_MOVSB
testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
jz 3f
movq %rsi, %rcx
subq %rdi, %rcx
2:
/* Avoid "rep movsb" if RCX, the distance between source and destination,
is N*4GB + [1..63] with N >= 0. */
cmpl $63, %ecx
jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */
3:
# endif
mov %RDX_LP, %RCX_LP
rep movsb
L(nop):
# else
ret
# endif
#endif
#if LARGE_MOV_SIZE
/* If LARGE_MOV_SIZE this fits in the aligning bytes between the
ENTRY block and L(less_vec). */
.p2align 4,, 8
L(between_4_7):
/* From 4 to 7. No branch when size == 4. */
movl (%rsi), %ecx
movl (%rsi, %rdx), %esi
movl %ecx, (%rdi)
movl %esi, (%rdi, %rdx)
ret
#endif
.p2align 4
L(less_vec):
/* Less than 1 VEC. */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
cmpb $32, %dl
cmpl $32, %edx
jae L(between_32_63)
#endif
#if VEC_SIZE > 16
cmpb $16, %dl
cmpl $16, %edx
jae L(between_16_31)
#endif
cmpb $8, %dl
cmpl $8, %edx
jae L(between_8_15)
cmpb $4, %dl
#if SMALL_MOV_SIZE
cmpl $4, %edx
#else
subq $4, %rdx
#endif
jae L(between_4_7)
cmpb $1, %dl
ja L(between_2_3)
jb 1f
movzbl (%rsi), %ecx
cmpl $(1 - SMALL_SIZE_OFFSET), %edx
jl L(copy_0)
movb (%rsi), %cl
je L(copy_1)
movzwl (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
movw %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
L(copy_1):
movb %cl, (%rdi)
1:
L(copy_0):
ret
#if SMALL_MOV_SIZE
.p2align 4,, 8
L(between_4_7):
/* From 4 to 7. No branch when size == 4. */
movl -4(%rsi, %rdx), %ecx
movl (%rsi), %esi
movl %ecx, -4(%rdi, %rdx)
movl %esi, (%rdi)
ret
#endif
#if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
.p2align 4,, 8
L(between_16_31):
vmovdqu (%rsi), %xmm0
vmovdqu -16(%rsi, %rdx), %xmm1
vmovdqu %xmm0, (%rdi)
vmovdqu %xmm1, -16(%rdi, %rdx)
/* No ymm registers have been touched. */
ret
#endif
#if VEC_SIZE > 32
.p2align 4,, 10
L(between_32_63):
/* From 32 to 63. No branch when size == 32. */
VMOVU (%rsi), %YMM0
VMOVU -32(%rsi,%rdx), %YMM1
VMOVU -32(%rsi, %rdx), %YMM1
VMOVU %YMM0, (%rdi)
VMOVU %YMM1, -32(%rdi,%rdx)
VZEROUPPER_RETURN
#endif
#if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
L(between_16_31):
VMOVU (%rsi), %XMM0
VMOVU -16(%rsi,%rdx), %XMM1
VMOVU %XMM0, (%rdi)
VMOVU %XMM1, -16(%rdi,%rdx)
VMOVU %YMM1, -32(%rdi, %rdx)
VZEROUPPER_RETURN
#endif
.p2align 4,, 10
L(between_8_15):
/* From 8 to 15. No branch when size == 8. */
movq -8(%rsi,%rdx), %rcx
movq -8(%rsi, %rdx), %rcx
movq (%rsi), %rsi
movq %rcx, -8(%rdi,%rdx)
movq %rsi, (%rdi)
ret
L(between_4_7):
/* From 4 to 7. No branch when size == 4. */
movl -4(%rsi,%rdx), %ecx
movl (%rsi), %esi
movl %ecx, -4(%rdi,%rdx)
movl %esi, (%rdi)
ret
L(between_2_3):
/* From 2 to 3. No branch when size == 2. */
movzwl -2(%rsi,%rdx), %ecx
movzwl (%rsi), %esi
movw %cx, -2(%rdi,%rdx)
movw %si, (%rdi)
movq %rcx, -8(%rdi, %rdx)
ret
.p2align 4,, 10
L(last_4x_vec):
/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
/* VEC(0) and VEC(1) have already been loaded. */
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(2)
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), VEC_SIZE(%rdi)
VMOVU %VEC(2), -VEC_SIZE(%rdi, %rdx)
VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
VZEROUPPER_RETURN
.p2align 4
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
cmp __x86_rep_movsb_threshold(%rip), %RDX_LP
ja L(movsb)
#endif
L(more_2x_vec):
/* More than 2 * VEC and there may be overlap between destination
and source. */
/* More than 2 * VEC and there may be overlap between
destination and source. */
cmpq $(VEC_SIZE * 8), %rdx
ja L(more_8x_vec)
/* Load VEC(1) regardless. VEC(0) has already been loaded. */
VMOVU VEC_SIZE(%rsi), %VEC(1)
cmpq $(VEC_SIZE * 4), %rdx
jbe L(last_4x_vec)
/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)
VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(4)
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), VEC_SIZE(%rdi)
VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
VZEROUPPER_RETURN
L(last_4x_vec):
/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), VEC_SIZE(%rdi)
VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
VMOVU %VEC(4), -VEC_SIZE(%rdi, %rdx)
VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
VZEROUPPER_RETURN
.p2align 4,, 4
L(more_8x_vec):
movq %rdi, %rcx
subq %rsi, %rcx
/* Go to backwards temporal copy if overlap no matter what as
backward REP MOVSB is slow and we don't want to use NT stores if
there is overlap. */
cmpq %rdx, %rcx
/* L(more_8x_vec_backward_check_nop) checks for src == dst. */
jb L(more_8x_vec_backward_check_nop)
/* Check if non-temporal move candidate. */
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
/* Check non-temporal store threshold. */
cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
ja L(large_memcpy_2x)
#endif
/* Entry if rdx is greater than non-temporal threshold but there
is overlap. */
/* To reach this point there cannot be overlap and dst > src. So
check for overlap and src > dst in which case correctness
requires forward copy. Otherwise decide between backward/forward
copy depending on address aliasing. */
/* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
but less than __x86_shared_non_temporal_threshold. */
L(more_8x_vec_check):
cmpq %rsi, %rdi
ja L(more_8x_vec_backward)
/* Source == destination is less common. */
je L(nop)
/* Load the first VEC and last 4 * VEC to support overlapping
addresses. */
VMOVU (%rsi), %VEC(4)
/* rcx contains dst - src. Add back length (rdx). */
leaq (%rcx, %rdx), %r8
/* If r8 has different sign than rcx then there is overlap so we
must do forward copy. */
xorq %rcx, %r8
/* Isolate just sign bit of r8. */
shrq $63, %r8
/* Get 4k difference dst - src. */
andl $(PAGE_SIZE - 256), %ecx
/* If r8 is non-zero must do forward copy for correctness.
Otherwise if ecx is zero there is 4k false aliasing so do
backward copy. */
addl %r8d, %ecx
jz L(more_8x_vec_backward)
/* Entry if rdx is greater than __x86_shared_non_temporal_threshold
but there is overlap, or from short distance movsb. */
L(more_8x_vec_forward):
/* Load first and last 4 * VEC to support overlapping addresses.
*/
/* First vec was already loaded into VEC(0). */
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
/* Save beginning of dst. */
movq %rdi, %rcx
/* Align dst to VEC_SIZE - 1. */
orq $(VEC_SIZE - 1), %rdi
VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
/* Save start and stop of the destination buffer. */
movq %rdi, %r11
leaq -VEC_SIZE(%rdi, %rdx), %rcx
/* Align destination for aligned stores in the loop. Compute
how much destination is misaligned. */
movq %rdi, %r8
andq $(VEC_SIZE - 1), %r8
/* Get the negative of offset for alignment. */
subq $VEC_SIZE, %r8
/* Adjust source. */
subq %r8, %rsi
/* Adjust destination which should be aligned now. */
subq %r8, %rdi
/* Adjust length. */
addq %r8, %rdx
.p2align 4
/* Subtract dst from src. Add back after dst aligned. */
subq %rcx, %rsi
/* Finish aligning dst. */
incq %rdi
/* Restore src adjusted with new value for aligned dst. */
addq %rdi, %rsi
/* Store end of buffer minus tail in rdx. */
leaq (VEC_SIZE * -4)(%rcx, %rdx), %rdx
/* Don't use multi-byte nop to align. */
.p2align 4,, 11
L(loop_4x_vec_forward):
/* Copy 4 * VEC a time forward. */
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
VMOVU (%rsi), %VEC(1)
VMOVU VEC_SIZE(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(3)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(4)
subq $-(VEC_SIZE * 4), %rsi
addq $-(VEC_SIZE * 4), %rdx
VMOVA %VEC(0), (%rdi)
VMOVA %VEC(1), VEC_SIZE(%rdi)
VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
VMOVA %VEC(1), (%rdi)
VMOVA %VEC(2), VEC_SIZE(%rdi)
VMOVA %VEC(3), (VEC_SIZE * 2)(%rdi)
VMOVA %VEC(4), (VEC_SIZE * 3)(%rdi)
subq $-(VEC_SIZE * 4), %rdi
cmpq $(VEC_SIZE * 4), %rdx
cmpq %rdi, %rdx
ja L(loop_4x_vec_forward)
/* Store the last 4 * VEC. */
VMOVU %VEC(5), (%rcx)
VMOVU %VEC(6), -VEC_SIZE(%rcx)
VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
VMOVU %VEC(5), (VEC_SIZE * 3)(%rdx)
VMOVU %VEC(6), (VEC_SIZE * 2)(%rdx)
VMOVU %VEC(7), VEC_SIZE(%rdx)
VMOVU %VEC(8), (%rdx)
/* Store the first VEC. */
VMOVU %VEC(4), (%r11)
VMOVU %VEC(0), (%rcx)
/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
*/
L(nop_backward):
VZEROUPPER_RETURN
.p2align 4,, 8
L(more_8x_vec_backward_check_nop):
/* rcx contains dst - src. Test for dst == src to skip all of
memmove. */
testq %rcx, %rcx
jz L(nop_backward)
L(more_8x_vec_backward):
/* Load the first 4 * VEC and last VEC to support overlapping
addresses. */
VMOVU (%rsi), %VEC(4)
/* First vec was also loaded into VEC(0). */
VMOVU VEC_SIZE(%rsi), %VEC(5)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
/* Beginning of region for 4x backward copy stored in rcx. */
leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
/* Save stop of the destination buffer. */
leaq -VEC_SIZE(%rdi, %rdx), %r11
/* Align destination end for aligned stores in the loop. Compute
how much destination end is misaligned. */
leaq -VEC_SIZE(%rsi, %rdx), %rcx
movq %r11, %r9
movq %r11, %r8
andq $(VEC_SIZE - 1), %r8
/* Adjust source. */
subq %r8, %rcx
/* Adjust the end of destination which should be aligned now. */
subq %r8, %r9
/* Adjust length. */
subq %r8, %rdx
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(8)
/* Subtract dst from src. Add back after dst aligned. */
subq %rdi, %rsi
/* Align dst. */
andq $-(VEC_SIZE), %rcx
/* Restore src. */
addq %rcx, %rsi
.p2align 4
/* Don't use multi-byte nop to align. */
.p2align 4,, 11
L(loop_4x_vec_backward):
/* Copy 4 * VEC a time backward. */
VMOVU (%rcx), %VEC(0)
VMOVU -VEC_SIZE(%rcx), %VEC(1)
VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
addq $-(VEC_SIZE * 4), %rcx
addq $-(VEC_SIZE * 4), %rdx
VMOVA %VEC(0), (%r9)
VMOVA %VEC(1), -VEC_SIZE(%r9)
VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
addq $-(VEC_SIZE * 4), %r9
cmpq $(VEC_SIZE * 4), %rdx
ja L(loop_4x_vec_backward)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 1)(%rsi), %VEC(3)
VMOVU (VEC_SIZE * 0)(%rsi), %VEC(4)
addq $(VEC_SIZE * -4), %rsi
VMOVA %VEC(1), (VEC_SIZE * 3)(%rcx)
VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx)
VMOVA %VEC(3), (VEC_SIZE * 1)(%rcx)
VMOVA %VEC(4), (VEC_SIZE * 0)(%rcx)
addq $(VEC_SIZE * -4), %rcx
cmpq %rcx, %rdi
jb L(loop_4x_vec_backward)
/* Store the first 4 * VEC. */
VMOVU %VEC(4), (%rdi)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(5), VEC_SIZE(%rdi)
VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
/* Store the last VEC. */
VMOVU %VEC(8), (%r11)
VMOVU %VEC(8), -VEC_SIZE(%rdx, %rdi)
VZEROUPPER_RETURN
#if defined USE_MULTIARCH && IS_IN (libc)
/* L(skip_short_movsb_check) is only used with ERMS. Not for
FSRM. */
.p2align 5,, 16
# if ALIGN_MOVSB
L(skip_short_movsb_check):
# if MOVSB_ALIGN_TO > VEC_SIZE
VMOVU VEC_SIZE(%rsi), %VEC(1)
# endif
# if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
# error Unsupported MOVSB_ALIGN_TO
# endif
/* If CPU does not have FSRM two options for aligning. Align src
if dst and src 4k alias. Otherwise align dst. */
testl $(PAGE_SIZE - 512), %ecx
jnz L(movsb_align_dst)
/* Fall through. dst and src 4k alias. It's better to align src
here because the bottleneck will be loads due to the false
dependency on dst. */
/* rcx already has dst - src. */
movq %rcx, %r9
/* Add src to len. Subtract back after src aligned. -1 because
src is initially aligned to MOVSB_ALIGN_TO - 1. */
leaq -1(%rsi, %rdx), %rcx
/* Inclusively align src to MOVSB_ALIGN_TO - 1. */
orq $(MOVSB_ALIGN_TO - 1), %rsi
/* Restore dst and len adjusted with new values for aligned src.
*/
leaq 1(%rsi, %r9), %rdi
subq %rsi, %rcx
/* Finish aligning src. */
incq %rsi
rep movsb
VMOVU %VEC(0), (%r8)
# if MOVSB_ALIGN_TO > VEC_SIZE
VMOVU %VEC(1), VEC_SIZE(%r8)
# endif
VZEROUPPER_RETURN
# endif
.p2align 4,, 12
L(movsb):
movq %rdi, %rcx
subq %rsi, %rcx
/* Go to backwards temporal copy if overlap no matter what as
backward REP MOVSB is slow and we don't want to use NT stores if
there is overlap. */
cmpq %rdx, %rcx
/* L(more_8x_vec_backward_check_nop) checks for src == dst. */
jb L(more_8x_vec_backward_check_nop)
# if ALIGN_MOVSB
/* Save dest for storing aligning VECs later. */
movq %rdi, %r8
# endif
/* If above __x86_rep_movsb_stop_threshold it is most likely a
candidate for NT moves as well. */
cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
jae L(large_memcpy_2x_check)
# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
/* Only avoid short movsb if CPU has FSRM. */
testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
jz L(skip_short_movsb_check)
# if AVOID_SHORT_DISTANCE_REP_MOVSB
/* Avoid "rep movsb" if RCX, the distance between source and
destination, is N*4GB + [1..63] with N >= 0. */
/* ecx contains dst - src. Early check for backward copy
conditions means only case of slow movsb with src = dst + [0,
63] is ecx in [-63, 0]. Use unsigned comparison with -64 check
for that case. */
cmpl $-64, %ecx
ja L(more_8x_vec_forward)
# endif
# endif
# if ALIGN_MOVSB
# if MOVSB_ALIGN_TO > VEC_SIZE
VMOVU VEC_SIZE(%rsi), %VEC(1)
# endif
# if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
# error Unsupported MOVSB_ALIGN_TO
# endif
/* Fall through means cpu has FSRM. In that case exclusively
align destination. */
L(movsb_align_dst):
/* Subtract dst from src. Add back after dst aligned. */
subq %rdi, %rsi
/* Exclusively align dst to MOVSB_ALIGN_TO (64). */
addq $(MOVSB_ALIGN_TO - 1), %rdi
/* Add dst to len. Subtract back after dst aligned. */
leaq (%r8, %rdx), %rcx
/* Finish aligning dst. */
andq $-(MOVSB_ALIGN_TO), %rdi
/* Restore src and len adjusted with new values for aligned dst.
*/
addq %rdi, %rsi
subq %rdi, %rcx
rep movsb
/* Store VECs loaded for aligning. */
VMOVU %VEC(0), (%r8)
# if MOVSB_ALIGN_TO > VEC_SIZE
VMOVU %VEC(1), VEC_SIZE(%r8)
# endif
VZEROUPPER_RETURN
# else /* !ALIGN_MOVSB. */
L(skip_short_movsb_check):
mov %RDX_LP, %RCX_LP
rep movsb
ret
# endif
#endif
.p2align 4,, 10
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
.p2align 4
L(large_memcpy_2x_check):
/* Entry from L(movsb) when size >= __x86_rep_movsb_stop_threshold.
L(movsb) is only reached when size > __x86_rep_movsb_threshold, so
comparing against that threshold again could never take the
branch. The bound that still has to be checked before using
non-temporal stores is __x86_shared_non_temporal_threshold; below
it fall back to the temporal 4x VEC loops. */
cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
jb L(more_8x_vec_check)
L(large_memcpy_2x):
/* Compute absolute value of difference between source and
destination. */
movq %rdi, %r9
subq %rsi, %r9
movq %r9, %r8
leaq -1(%r9), %rcx
sarq $63, %r8
xorq %r8, %r9
subq %r8, %r9
/* Don't use non-temporal store if there is overlap between
destination and source since destination may be in cache when
source is loaded. */
cmpq %r9, %rdx
ja L(more_8x_vec_check)
/* To reach this point it is impossible for dst > src and
overlap. Remaining to check is src > dst and overlap. rcx
already contains dst - src. Negate rcx to get src - dst. If
length > rcx then there is overlap and forward copy is best. */
negq %rcx
cmpq %rcx, %rdx
ja L(more_8x_vec_forward)
/* Cache align destination. First store the first 64 bytes then
adjust alignments. */
VMOVU (%rsi), %VEC(8)
#if VEC_SIZE < 64
VMOVU VEC_SIZE(%rsi), %VEC(9)
#if VEC_SIZE < 32
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11)
#endif
#endif
VMOVU %VEC(8), (%rdi)
#if VEC_SIZE < 64
VMOVU %VEC(9), VEC_SIZE(%rdi)
#if VEC_SIZE < 32
VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi)
#endif
#endif
/* First vec was also loaded into VEC(0). */
# if VEC_SIZE < 64
VMOVU VEC_SIZE(%rsi), %VEC(1)
# if VEC_SIZE < 32
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
# endif
# endif
VMOVU %VEC(0), (%rdi)
# if VEC_SIZE < 64
VMOVU %VEC(1), VEC_SIZE(%rdi)
# if VEC_SIZE < 32
VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
# endif
# endif
/* Adjust source, destination, and size. */
movq %rdi, %r8
andq $63, %r8
@ -614,9 +767,13 @@ L(large_memcpy_2x):
/* Adjust length. */
addq %r8, %rdx
/* Test if source and destination addresses will alias. If they do
the larger pipeline in large_memcpy_4x alleviated the
/* Test if source and destination addresses will alias. If they
do the larger pipeline in large_memcpy_4x alleviates the
performance drop. */
/* ecx contains -(dst - src). not ecx will return dst - src - 1
which works for testing aliasing. */
notl %ecx
testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx
jz L(large_memcpy_4x)
@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):
/* ecx stores inner loop counter. */
movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_4x_inner):
/* Only one prefetch set per page as doing 4 pages give more time
for prefetcher to keep up. */
/* Only one prefetch set per page as doing 4 pages give more
time for prefetcher to keep up. */
PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)