x86: Optimize memmove-vec-unaligned-erms.S
No bug.
The optimizations are as follows:
1) Always align entry to 64 bytes. This makes behavior more
predictable and makes other frontend optimizations easier.
2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
significant benefits in the case that:
0 < (dst - src) < [256, 512]
3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
improvement and for FSRM [-10%, 25%].
In addition to these primary changes there is general cleanup
throughout to optimize the aligning routines and control flow logic.
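
A minimal C sketch of the 4k-aliasing decision in (2) may help; it is
illustrative only (the function name is invented here, and the 256-byte
window comes from the patch's `andl $(PAGE_SIZE - 256), %ecx` check):

	#include <stddef.h>
	#include <stdint.h>

	#define PAGE_SIZE 4096

	/* Sketch of the forward/backward choice for the 8x-VEC temporal
	   copy loops.  A forward loop stores to dst while loading from
	   src = dst - diff; when diff mod PAGE_SIZE is small and positive,
	   each load 4k-aliases a just-issued store and stalls.  Copying
	   backward makes the effective distance PAGE_SIZE - diff, which
	   avoids the false dependency.  */
	static int
	prefer_backward_copy (char *dst, const char *src, size_t len)
	{
	  intptr_t diff = dst - src;
	  /* Overlap with src ahead of dst forces a forward copy for
	     correctness.  */
	  if (diff < 0 && (size_t) -diff < len)
	    return 0;
	  /* Otherwise copy backward when dst - src lands in the
	     4k-aliasing window, i.e. its page offset is below 256.  */
	  return ((uintptr_t) diff & (PAGE_SIZE - 256)) == 0;
	}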
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit a6b7502ec0)
parent 2e64237a87
commit a7392db2ff
sysdeps/x86_64/memmove.S
@@ -25,7 +25,7 @@
 /* Use movups and movaps for smaller code sizes.  */
 #define VMOVU		movups
 #define VMOVA		movaps
-
+#define MOV_SIZE	3
 #define SECTION(p)		p
 
 #ifdef USE_MULTIARCH
sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
@@ -4,7 +4,7 @@
 # define VMOVNT		vmovntdq
 # define VMOVU		vmovdqu
 # define VMOVA		vmovdqa
-
+# define MOV_SIZE	4
 # define ZERO_UPPER_VEC_REGISTERS_RETURN \
   ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
 
sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -4,7 +4,7 @@
 # define VMOVNT		vmovntdq
 # define VMOVU		vmovdqu
 # define VMOVA		vmovdqa
-
+# define MOV_SIZE	4
 # define SECTION(p)		p##.avx
 # define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
 
sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -25,7 +25,7 @@
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
 # define VZEROUPPER
-
+# define MOV_SIZE	6
 # define SECTION(p)		p##.evex512
 # define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
 
sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -25,7 +25,7 @@
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
 # define VZEROUPPER
-
+# define MOV_SIZE	6
 # define SECTION(p)		p##.evex
 # define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
 
sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -76,6 +76,25 @@
 # endif
 #endif
 
+/* Whether to align before movsb.  Ultimately we want 64 byte
+   align and not worth it to load 4x VEC for VEC_SIZE == 16.  */
+#define ALIGN_MOVSB	(VEC_SIZE > 16)
+/* Number of bytes to align movsb to.  */
+#define MOVSB_ALIGN_TO	64
+
+#define SMALL_MOV_SIZE	(MOV_SIZE <= 4)
+#define LARGE_MOV_SIZE	(MOV_SIZE > 4)
+
+#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
+# error MOV_SIZE Unknown
+#endif
+
+#if LARGE_MOV_SIZE
+# define SMALL_SIZE_OFFSET	(4)
+#else
+# define SMALL_SIZE_OFFSET	(0)
+#endif
+
 #ifndef PAGE_SIZE
 # define PAGE_SIZE 4096
 #endif
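
As a rough illustration of what the new ALIGN_MOVSB/MOVSB_ALIGN_TO
machinery buys (optimization 3 above): the rewritten L(movsb) path loads
the first MOVSB_ALIGN_TO bytes into vector registers, rounds dst up to a
64-byte boundary, runs `rep movsb` from there, and stores the saved head
afterwards.  A hedged C equivalent, assuming len >= MOVSB_ALIGN_TO and
non-overlapping buffers (the function name is invented, and memcpy stands
in for `rep movsb`):

	#include <string.h>
	#include <stdint.h>
	#include <stddef.h>

	#define MOVSB_ALIGN_TO 64

	/* Sketch of the ALIGN_MOVSB idea.  The real code keeps the head
	   in VEC(0) (and VEC(1)) rather than on the stack.  */
	static void
	copy_with_aligned_movsb (char *dst, const char *src, size_t len)
	{
	  /* Save the unaligned head before adjusting pointers.  */
	  char head[MOVSB_ALIGN_TO];
	  memcpy (head, src, MOVSB_ALIGN_TO);

	  /* Advance both pointers so the destination of `rep movsb`
	     starts on a 64-byte boundary.  */
	  size_t adjust = -(uintptr_t) dst & (MOVSB_ALIGN_TO - 1);
	  memcpy (dst + adjust, src + adjust, len - adjust);

	  /* Store the saved head afterwards, as the asm does; the two
	     writes overlap in [adjust, 64) with identical bytes.  */
	  memcpy (dst, head, MOVSB_ALIGN_TO);
	}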
@@ -199,25 +218,21 @@ L(start):
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	/* Load regardless.  */
+	VMOVU	(%rsi), %VEC(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(more_2x_vec)
 #if !defined USE_MULTIARCH || !IS_IN (libc)
 L(last_2x_vec):
 #endif
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	(%rsi), %VEC(0)
 	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(nop):
-	ret
+#if !(defined USE_MULTIARCH && IS_IN (libc))
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 #else
 	VZEROUPPER_RETURN
 #endif
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMMOVE_SYMBOL (__memmove, unaligned))
 
 # if VEC_SIZE == 16
 ENTRY (__mempcpy_chk_erms)
 	cmp	%RDX_LP, %RCX_LP
@@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 # endif
 
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
 	movq	%rdi, %rax
 L(start_erms):
 # ifdef __ILP32__
@@ -298,310 +313,448 @@ L(start_erms):
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	/* Load regardless.  */
+	VMOVU	(%rsi), %VEC(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(movsb_more_2x_vec)
 L(last_2x_vec):
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
+	 */
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
 L(return):
-#if VEC_SIZE > 16
+# if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
-#else
-	ret
-#endif
-
-L(movsb):
-	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
-	jae	L(more_8x_vec)
-	cmpq	%rsi, %rdi
-	jb	1f
-	/* Source == destination is less common.  */
-	je	L(nop)
-	leaq	(%rsi,%rdx), %r9
-	cmpq	%r9, %rdi
-	/* Avoid slow backward REP MOVSB.  */
-	jb	L(more_8x_vec_backward)
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
-	jz	3f
-	movq	%rdi, %rcx
-	subq	%rsi, %rcx
-	jmp	2f
-# endif
-1:
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
-	jz	3f
-	movq	%rsi, %rcx
-	subq	%rdi, %rcx
-2:
-	/* Avoid "rep movsb" if RCX, the distance between source and
-	   destination, is N*4GB + [1..63] with N >= 0.  */
-	cmpl	$63, %ecx
-	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
-3:
-# endif
-	mov	%RDX_LP, %RCX_LP
-	rep movsb
-L(nop):
+# else
 	ret
+# endif
 #endif
 
+#if LARGE_MOV_SIZE
+	/* If LARGE_MOV_SIZE this fits in the aligning bytes between the
+	   ENTRY block and L(less_vec).  */
+	.p2align 4,, 8
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	(%rsi), %ecx
+	movl	(%rsi, %rdx), %esi
+	movl	%ecx, (%rdi)
+	movl	%esi, (%rdi, %rdx)
+	ret
+#endif
 
 	.p2align 4
 L(less_vec):
 	/* Less than 1 VEC.  */
 #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 # error Unsupported VEC_SIZE!
 #endif
 #if VEC_SIZE > 32
-	cmpb	$32, %dl
+	cmpl	$32, %edx
 	jae	L(between_32_63)
 #endif
 #if VEC_SIZE > 16
-	cmpb	$16, %dl
+	cmpl	$16, %edx
 	jae	L(between_16_31)
 #endif
-	cmpb	$8, %dl
+	cmpl	$8, %edx
 	jae	L(between_8_15)
-	cmpb	$4, %dl
+#if SMALL_MOV_SIZE
+	cmpl	$4, %edx
+#else
+	subq	$4, %rdx
+#endif
 	jae	L(between_4_7)
-	cmpb	$1, %dl
-	ja	L(between_2_3)
-	jb	1f
-	movzbl	(%rsi), %ecx
+	cmpl	$(1 - SMALL_SIZE_OFFSET), %edx
+	jl	L(copy_0)
+	movb	(%rsi), %cl
+	je	L(copy_1)
+	movzwl	(-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
+	movw	%si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
+L(copy_1):
 	movb	%cl, (%rdi)
-1:
+L(copy_0):
 	ret
 
+#if SMALL_MOV_SIZE
+	.p2align 4,, 8
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	-4(%rsi, %rdx), %ecx
+	movl	(%rsi), %esi
+	movl	%ecx, -4(%rdi, %rdx)
+	movl	%esi, (%rdi)
+	ret
+#endif
+
+#if VEC_SIZE > 16
+	/* From 16 to 31.  No branch when size == 16.  */
+	.p2align 4,, 8
+L(between_16_31):
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	-16(%rsi, %rdx), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -16(%rdi, %rdx)
+	/* No ymm registers have been touched.  */
+	ret
+#endif
 
 #if VEC_SIZE > 32
+	.p2align 4,, 10
 L(between_32_63):
 	/* From 32 to 63.  No branch when size == 32.  */
 	VMOVU	(%rsi), %YMM0
-	VMOVU	-32(%rsi,%rdx), %YMM1
+	VMOVU	-32(%rsi, %rdx), %YMM1
 	VMOVU	%YMM0, (%rdi)
-	VMOVU	%YMM1, -32(%rdi,%rdx)
-	VZEROUPPER_RETURN
-#endif
-#if VEC_SIZE > 16
-	/* From 16 to 31.  No branch when size == 16.  */
-L(between_16_31):
-	VMOVU	(%rsi), %XMM0
-	VMOVU	-16(%rsi,%rdx), %XMM1
-	VMOVU	%XMM0, (%rdi)
-	VMOVU	%XMM1, -16(%rdi,%rdx)
+	VMOVU	%YMM1, -32(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
+	.p2align 4,, 10
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
-	movq	-8(%rsi,%rdx), %rcx
+	movq	-8(%rsi, %rdx), %rcx
 	movq	(%rsi), %rsi
-	movq	%rcx, -8(%rdi,%rdx)
 	movq	%rsi, (%rdi)
-	ret
-L(between_4_7):
-	/* From 4 to 7.  No branch when size == 4.  */
-	movl	-4(%rsi,%rdx), %ecx
-	movl	(%rsi), %esi
-	movl	%ecx, -4(%rdi,%rdx)
-	movl	%esi, (%rdi)
-	ret
-L(between_2_3):
-	/* From 2 to 3.  No branch when size == 2.  */
-	movzwl	-2(%rsi,%rdx), %ecx
-	movzwl	(%rsi), %esi
-	movw	%cx, -2(%rdi,%rdx)
-	movw	%si, (%rdi)
+	movq	%rcx, -8(%rdi, %rdx)
 	ret
 
+	.p2align 4,, 10
+L(last_4x_vec):
+	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
+
+	/* VEC(0) and VEC(1) have already been loaded.  */
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+
 	.p2align 4
 #if defined USE_MULTIARCH && IS_IN (libc)
 L(movsb_more_2x_vec):
 	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
 	ja	L(movsb)
 #endif
 L(more_2x_vec):
-	/* More than 2 * VEC and there may be overlap between destination
-	   and source.  */
+	/* More than 2 * VEC and there may be overlap between
+	   destination and source.  */
 	cmpq	$(VEC_SIZE * 8), %rdx
 	ja	L(more_8x_vec)
+	/* Load VEC(1) regardless.  VEC(0) has already been loaded.  */
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_4x_vec)
-	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), VEC_SIZE(%rdi)
 	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
-	VZEROUPPER_RETURN
-L(last_4x_vec):
-	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
+	.p2align 4,, 4
 L(more_8x_vec):
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	/* Go to backwards temporal copy if overlap no matter what as
+	   backward REP MOVSB is slow and we don't want to use NT stores
+	   if there is overlap.  */
+	cmpq	%rdx, %rcx
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
+	jb	L(more_8x_vec_backward_check_nop)
+	/* Check if non-temporal move candidate.  */
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 	/* Check non-temporal store threshold.  */
 	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
 	ja	L(large_memcpy_2x)
 #endif
-	/* Entry if rdx is greater than non-temporal threshold but there
-	   is overlap.  */
+	/* To reach this point there cannot be overlap and dst > src.  So
+	   check for overlap and src > dst in which case correctness
+	   requires forward copy.  Otherwise decide between
+	   backward/forward copy depending on address aliasing.  */
+
+	/* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
+	   but less than __x86_shared_non_temporal_threshold.  */
 L(more_8x_vec_check):
 	cmpq	%rsi, %rdi
 	ja	L(more_8x_vec_backward)
 	/* Source == destination is less common.  */
 	je	L(nop)
-	/* Load the first VEC and last 4 * VEC to support overlapping
-	   addresses.  */
-	VMOVU	(%rsi), %VEC(4)
+	/* rcx contains dst - src.  Add back length (rdx).  */
+	leaq	(%rcx, %rdx), %r8
+	/* If r8 has different sign than rcx then there is overlap so we
+	   must do forward copy.  */
+	xorq	%rcx, %r8
+	/* Isolate just sign bit of r8.  */
+	shrq	$63, %r8
+	/* Get 4k difference dst - src.  */
+	andl	$(PAGE_SIZE - 256), %ecx
+	/* If r8 is non-zero we must copy forward for correctness.
+	   Otherwise, if ecx is zero, dst - src 4k-aliases and the
+	   backward copy avoids the false dependency.  */
+	addl	%r8d, %ecx
+	jz	L(more_8x_vec_backward)
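
The overlap test above packs two decisions into branch-free arithmetic;
a C sketch of the sign-bit trick, mirroring the rcx/r8 registers
(function name invented here):

	#include <stdint.h>

	#define PAGE_SIZE 4096

	/* diff = dst - src (rcx); len is the copy size (rdx).  */
	static int
	choose_backward (int64_t diff, int64_t len)
	{
	  /* The sign of diff differs from the sign of diff + len exactly
	     when diff is negative but diff + len is not, i.e. the regions
	     overlap with src > dst, so a forward copy is required for
	     correctness (this is the xorq/shrq pair, r8).  */
	  uint64_t must_forward = ((uint64_t) (diff ^ (diff + len))) >> 63;
	  /* Page-offset bits of diff (the 4k-aliasing window).  */
	  uint32_t page_bits = (uint32_t) diff & (PAGE_SIZE - 256);
	  /* jz L(more_8x_vec_backward): go backward only when nothing
	     forces or prefers the forward path.  */
	  return (must_forward + page_bits) == 0;
	}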

+	/* If rdx is greater than __x86_shared_non_temporal_threshold
+	   but there is overlap, or from short distance movsb.  */
 L(more_8x_vec_forward):
-	/* Load first and last 4 * VEC to support overlapping addresses.
-	 */
+
+	/* First vec was already loaded into VEC(0).  */
 	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+	/* Save beginning of dst.  */
+	movq	%rdi, %rcx
+	/* Align dst to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
 	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
 	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
-	/* Save start and stop of the destination buffer.  */
-	movq	%rdi, %r11
-	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
-	/* Align destination for aligned stores in the loop.  Compute
-	   how much destination is misaligned.  */
-	movq	%rdi, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Get the negative of offset for alignment.  */
-	subq	$VEC_SIZE, %r8
-	/* Adjust source.  */
-	subq	%r8, %rsi
-	/* Adjust destination which should be aligned now.  */
-	subq	%r8, %rdi
-	/* Adjust length.  */
-	addq	%r8, %rdx
-
-	.p2align 4
+	/* Subtract dst from src.  Add back after dst aligned.  */
+	subq	%rcx, %rsi
+	/* Finish aligning dst.  */
+	incq	%rdi
+	/* Restore src adjusted with new value for aligned dst.  */
+	addq	%rdi, %rsi
+	/* Store end of buffer minus tail in rdx.  */
+	leaq	(VEC_SIZE * -4)(%rcx, %rdx), %rdx
+
+	/* Don't use multi-byte nop to align.  */
+	.p2align 4,, 11
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
 	subq	$-(VEC_SIZE * 4), %rsi
-	addq	$-(VEC_SIZE * 4), %rdx
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VEC(1), (%rdi)
+	VMOVA	%VEC(2), VEC_SIZE(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
 	subq	$-(VEC_SIZE * 4), %rdi
-	cmpq	$(VEC_SIZE * 4), %rdx
+	cmpq	%rdi, %rdx
 	ja	L(loop_4x_vec_forward)
 	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (%rcx)
-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
+	VMOVU	%VEC(7), VEC_SIZE(%rdx)
+	VMOVU	%VEC(8), (%rdx)
 	/* Store the first VEC.  */
-	VMOVU	%VEC(4), (%r11)
+	VMOVU	%VEC(0), (%rcx)
+	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
+	 */
+L(nop_backward):
 	VZEROUPPER_RETURN
 
+	.p2align 4,, 8
+L(more_8x_vec_backward_check_nop):
+	/* rcx contains dst - src.  Test for dst == src to skip all of
+	   memmove.  */
+	testq	%rcx, %rcx
+	jz	L(nop_backward)
 L(more_8x_vec_backward):
 	/* Load the first 4 * VEC and last VEC to support overlapping
 	   addresses.  */
-	VMOVU	(%rsi), %VEC(4)
+
+	/* First vec was also loaded into VEC(0).  */
 	VMOVU	VEC_SIZE(%rsi), %VEC(5)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
+	/* Beginning of region for 4x backward copy stored in rcx.  */
+	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
-	/* Save stop of the destination buffer.  */
-	leaq	-VEC_SIZE(%rdi, %rdx), %r11
-	/* Align destination end for aligned stores in the loop.  Compute
-	   how much destination end is misaligned.  */
-	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
-	movq	%r11, %r9
-	movq	%r11, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Adjust source.  */
-	subq	%r8, %rcx
-	/* Adjust the end of destination which should be aligned now.  */
-	subq	%r8, %r9
-	/* Adjust length.  */
-	subq	%r8, %rdx
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
+	/* Subtract dst from src.  Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Align dst.  */
+	andq	$-(VEC_SIZE), %rcx
+	/* Restore src.  */
+	addq	%rcx, %rsi
 
-	.p2align 4
+	/* Don't use multi-byte nop to align.  */
+	.p2align 4,, 11
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
-	VMOVU	(%rcx), %VEC(0)
-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	addq	$-(VEC_SIZE * 4), %rcx
-	addq	$-(VEC_SIZE * 4), %rdx
-	VMOVA	%VEC(0), (%r9)
-	VMOVA	%VEC(1), -VEC_SIZE(%r9)
-	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	addq	$-(VEC_SIZE * 4), %r9
-	cmpq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec_backward)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
+	addq	$(VEC_SIZE * -4), %rsi
+	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
+	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
+	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
+	addq	$(VEC_SIZE * -4), %rcx
+	cmpq	%rcx, %rdi
+	jb	L(loop_4x_vec_backward)
 	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(4), (%rdi)
+	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(5), VEC_SIZE(%rdi)
 	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
 	/* Store the last VEC.  */
-	VMOVU	%VEC(8), (%r11)
+	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
 	VZEROUPPER_RETURN
 
+#if defined USE_MULTIARCH && IS_IN (libc)
+	/* L(skip_short_movsb_check) is only used with ERMS.  Not for
+	   FSRM.  */
+	.p2align 5,, 16
+# if ALIGN_MOVSB
+L(skip_short_movsb_check):
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  endif
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+#   error Unsupported MOVSB_ALIGN_TO
+#  endif
+	/* If CPU does not have FSRM two options for aligning.  Align src
+	   if dst and src 4k alias.  Otherwise align dst.  */
+	testl	$(PAGE_SIZE - 512), %ecx
+	jnz	L(movsb_align_dst)
+	/* Fall through.  dst and src 4k alias.  It's better to align src
+	   here because the bottleneck will be loads due to the false
+	   dependency on dst.  */
+
+	/* rcx already has dst - src.  */
+	movq	%rcx, %r9
+	/* Add src to len.  Subtract back after src aligned.  -1 because
+	   src is initially aligned to MOVSB_ALIGN_TO - 1.  */
+	leaq	-1(%rsi, %rdx), %rcx
+	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
+	orq	$(MOVSB_ALIGN_TO - 1), %rsi
+	/* Restore dst and len adjusted with new values for aligned dst.
+	 */
+	leaq	1(%rsi, %r9), %rdi
+	subq	%rsi, %rcx
+	/* Finish aligning src.  */
+	incq	%rsi
+
+	rep	movsb
+
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
+	VZEROUPPER_RETURN
+# endif
+
+	.p2align 4,, 12
+L(movsb):
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	/* Go to backwards temporal copy if overlap no matter what as
+	   backward REP MOVSB is slow and we don't want to use NT stores
+	   if there is overlap.  */
+	cmpq	%rdx, %rcx
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
+	jb	L(more_8x_vec_backward_check_nop)
+# if ALIGN_MOVSB
+	/* Save dest for storing aligning VECs later.  */
+	movq	%rdi, %r8
+# endif
+	/* If above __x86_rep_movsb_stop_threshold most likely is
+	   candidate for NT moves as well.  */
+	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
+	jae	L(large_memcpy_2x_check)
+# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
+	/* Only avoid short movsb if CPU has FSRM.  */
+	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz	L(skip_short_movsb_check)
+#  if AVOID_SHORT_DISTANCE_REP_MOVSB
+	/* Avoid "rep movsb" if RCX, the distance between source and
+	   destination, is N*4GB + [1..63] with N >= 0.  */
+
+	/* ecx contains dst - src.  Early check for backward copy
+	   conditions means only case of slow movsb with src = dst +
+	   [0, 63] is ecx in [-63, 0].  Use unsigned comparison with -64
+	   check for that case.  */
+	cmpl	$-64, %ecx
+	ja	L(more_8x_vec_forward)
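
The unsigned `cmpl $-64, %ecx; ja` idiom above can be read in C as
follows (sketch only; the function name is invented):

	#include <stdint.h>

	/* ecx holds the low 32 bits of dst - src.  As unsigned values,
	   -63..-1 map to the very top of the range, so a single unsigned
	   compare against -64 catches exactly ecx in [-63, -1]; ecx == 0
	   (dst == src) was already handled earlier.  */
	static int
	short_distance_movsb (int32_t ecx)
	{
	  return (uint32_t) ecx > (uint32_t) -64;
	}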
+#  endif
+# endif
+# if ALIGN_MOVSB
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  endif
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+#   error Unsupported MOVSB_ALIGN_TO
+#  endif
+	/* Fall through means cpu has FSRM.  In that case exclusively
+	   align destination.  */
+L(movsb_align_dst):
+	/* Subtract dst from src.  Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
+	addq	$(MOVSB_ALIGN_TO - 1), %rdi
+	/* Add dst to len.  Subtract back after dst aligned.  */
+	leaq	(%r8, %rdx), %rcx
+	/* Finish aligning dst.  */
+	andq	$-(MOVSB_ALIGN_TO), %rdi
+	/* Restore src and len adjusted with new values for aligned dst.
+	 */
+	addq	%rdi, %rsi
+	subq	%rdi, %rcx
+
+	rep	movsb
+
+	/* Store VECs loaded for aligning.  */
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
+	VZEROUPPER_RETURN
+# else	/* !ALIGN_MOVSB.  */
+L(skip_short_movsb_check):
+	mov	%RDX_LP, %RCX_LP
+	rep	movsb
+	ret
+# endif
+#endif
 
+	.p2align 4,, 10
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	.p2align 4
+L(large_memcpy_2x_check):
+	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
+	jb	L(more_8x_vec_check)
 L(large_memcpy_2x):
 	/* Compute absolute value of difference between source and
 	   destination.  */
 	movq	%rdi, %r9
 	subq	%rsi, %r9
 	movq	%r9, %r8
 	leaq	-1(%r9), %rcx
 	sarq	$63, %r8
 	xorq	%r8, %r9
 	subq	%r8, %r9
 	/* Don't use non-temporal store if there is overlap between
 	   destination and source since destination may be in cache when
 	   source is loaded.  */
 	cmpq	%r9, %rdx
 	ja	L(more_8x_vec_check)
+	/* To reach this point it is impossible for dst > src and
+	   overlap.  Remaining to check is src > dst and overlap.  rcx
+	   already contains dst - src.  Negate rcx to get src - dst.  If
+	   length > rcx then there is overlap and forward copy is best.  */
+	negq	%rcx
+	cmpq	%rcx, %rdx
+	ja	L(more_8x_vec_forward)
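
The seven instructions at the top of L(large_memcpy_2x) are the classic
branchless absolute value followed by an overlap check; a C rendering
(sketch only, with invented helper names):

	#include <stdint.h>

	/* Branchless |dst - src|, as computed by the movq/sarq/xorq/subq
	   sequence above.  */
	static uint64_t
	abs_diff (int64_t diff)
	{
	  int64_t mask = diff >> 63;	/* sarq $63: 0 or -1.  */
	  return (uint64_t) ((diff ^ mask) - mask);
	}

	/* NT stores are only considered when the buffers cannot overlap
	   in either direction, i.e. len <= |dst - src|.  */
	static int
	nt_copy_ok (int64_t diff, uint64_t len)
	{
	  return len <= abs_diff (diff);
	}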

 	/* Cache align destination.  First store the first 64 bytes then
 	   adjust alignments.  */
-	VMOVU	(%rsi), %VEC(8)
-#if VEC_SIZE < 64
-	VMOVU	VEC_SIZE(%rsi), %VEC(9)
-#if VEC_SIZE < 32
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
-#endif
-#endif
-	VMOVU	%VEC(8), (%rdi)
-#if VEC_SIZE < 64
-	VMOVU	%VEC(9), VEC_SIZE(%rdi)
-#if VEC_SIZE < 32
-	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
-#endif
-#endif
+
+	/* First vec was also loaded into VEC(0).  */
+# if VEC_SIZE < 64
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  if VEC_SIZE < 32
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+#  endif
+# endif
+	VMOVU	%VEC(0), (%rdi)
+# if VEC_SIZE < 64
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+#  if VEC_SIZE < 32
+	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+#  endif
+# endif
 
 	/* Adjust source, destination, and size.  */
 	movq	%rdi, %r8
 	andq	$63, %r8
@@ -614,9 +767,13 @@ L(large_memcpy_2x):
 	/* Adjust length.  */
 	addq	%r8, %rdx
 
-	/* Test if source and destination addresses will alias. If they do
-	   the larger pipeline in large_memcpy_4x alleviated the
+	/* Test if source and destination addresses will alias.  If they
+	   do, the larger pipeline in large_memcpy_4x alleviates the
 	   performance drop.  */
+
+	/* ecx contains -(dst - src).  not ecx will return dst - src - 1
+	   which works for testing aliasing.  */
+	notl	%ecx
 	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
 	jz	L(large_memcpy_4x)
 
@@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):
 	/* ecx stores inner loop counter.  */
 	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
 L(loop_large_memcpy_4x_inner):
-	/* Only one prefetch set per page as doing 4 pages give more time
-	   for prefetcher to keep up.  */
+	/* Only one prefetch set per page, as doing 4 pages gives more
+	   time for the prefetcher to keep up.  */
 	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
 	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)