x86: Update large memcpy case in memmove-vec-unaligned-erms.S

No Bug. This commit updates the large memcpy case (no overlap). The
update is to perform memcpy on either 2 or 4 contiguous pages at
once. This 1) helps to alleviate the affects of false memory aliasing
when destination and source have a close 4k alignment and 2) In most
cases and for most DRAM units is a modestly more efficient access
pattern. These changes are a clear performance improvement for
VEC_SIZE =16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
pass.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
This commit is contained in:
noah 2021-04-03 04:12:15 -04:00 committed by H.J. Lu
parent 5d61fc2021
commit 1a8605b6cd

View File

@ -35,7 +35,16 @@
__x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
7. If size >= __x86_shared_non_temporal_threshold and there is no
overlap between destination and source, use non-temporal store
instead of aligned store. */
instead of aligned store copying from either 2 or 4 pages at
once.
8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
and source and destination do not page alias, copy from 2 pages
at once using non-temporal stores. Page aliasing in this case is
considered true if destination's page alignment - sources' page
alignment is less than 8 * VEC_SIZE.
9. If size >= 16 * __x86_shared_non_temporal_threshold or source
and destination do page alias copy from 4 pages at once using
non-temporal stores. */
#include <sysdep.h>
@ -67,6 +76,34 @@
# endif
#endif
#ifndef PAGE_SIZE
# define PAGE_SIZE 4096
#endif
#if PAGE_SIZE != 4096
# error Unsupported PAGE_SIZE
#endif
#ifndef LOG_PAGE_SIZE
# define LOG_PAGE_SIZE 12
#endif
#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
# error Invalid LOG_PAGE_SIZE
#endif
/* Byte per page for large_memcpy inner loop. */
#if VEC_SIZE == 64
# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
#else
# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
#endif
/* Amount to shift rdx by to compare for memcpy_large_4x. */
#ifndef LOG_4X_MEMCPY_THRESH
# define LOG_4X_MEMCPY_THRESH 4
#endif
/* Avoid short distance rep movsb only with non-SSE vector. */
#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
@ -106,6 +143,28 @@
# error Unsupported PREFETCH_SIZE!
#endif
#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
VMOVU (offset)base, vec0; \
VMOVU ((offset) + VEC_SIZE)base, vec1;
# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
VMOVNT vec0, (offset)base; \
VMOVNT vec1, ((offset) + VEC_SIZE)base;
#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
VMOVU (offset)base, vec0; \
VMOVU ((offset) + VEC_SIZE)base, vec1; \
VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \
VMOVU ((offset) + VEC_SIZE * 3)base, vec3;
# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
VMOVNT vec0, (offset)base; \
VMOVNT vec1, ((offset) + VEC_SIZE)base; \
VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \
VMOVNT vec3, ((offset) + VEC_SIZE * 3)base;
#else
# error Invalid LARGE_LOAD_SIZE
#endif
#ifndef SECTION
# error SECTION is not defined!
#endif
@ -393,6 +452,15 @@ L(last_4x_vec):
VZEROUPPER_RETURN
L(more_8x_vec):
/* Check if non-temporal move candidate. */
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
/* Check non-temporal store threshold. */
cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
ja L(large_memcpy_2x)
#endif
/* Entry if rdx is greater than non-temporal threshold but there
is overlap. */
L(more_8x_vec_check):
cmpq %rsi, %rdi
ja L(more_8x_vec_backward)
/* Source == destination is less common. */
@ -419,24 +487,21 @@ L(more_8x_vec):
subq %r8, %rdi
/* Adjust length. */
addq %r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
/* Check non-temporal store threshold. */
cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
ja L(large_forward)
#endif
.p2align 4
L(loop_4x_vec_forward):
/* Copy 4 * VEC a time forward. */
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
addq $(VEC_SIZE * 4), %rsi
subq $(VEC_SIZE * 4), %rdx
subq $-(VEC_SIZE * 4), %rsi
addq $-(VEC_SIZE * 4), %rdx
VMOVA %VEC(0), (%rdi)
VMOVA %VEC(1), VEC_SIZE(%rdi)
VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
addq $(VEC_SIZE * 4), %rdi
subq $-(VEC_SIZE * 4), %rdi
cmpq $(VEC_SIZE * 4), %rdx
ja L(loop_4x_vec_forward)
/* Store the last 4 * VEC. */
@ -470,24 +535,21 @@ L(more_8x_vec_backward):
subq %r8, %r9
/* Adjust length. */
subq %r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
/* Check non-temporal store threshold. */
cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
ja L(large_backward)
#endif
.p2align 4
L(loop_4x_vec_backward):
/* Copy 4 * VEC a time backward. */
VMOVU (%rcx), %VEC(0)
VMOVU -VEC_SIZE(%rcx), %VEC(1)
VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
subq $(VEC_SIZE * 4), %rcx
subq $(VEC_SIZE * 4), %rdx
addq $-(VEC_SIZE * 4), %rcx
addq $-(VEC_SIZE * 4), %rdx
VMOVA %VEC(0), (%r9)
VMOVA %VEC(1), -VEC_SIZE(%r9)
VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
subq $(VEC_SIZE * 4), %r9
addq $-(VEC_SIZE * 4), %r9
cmpq $(VEC_SIZE * 4), %rdx
ja L(loop_4x_vec_backward)
/* Store the first 4 * VEC. */
@ -500,72 +562,202 @@ L(loop_4x_vec_backward):
VZEROUPPER_RETURN
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
.p2align 4
L(large_memcpy_2x):
/* Compute absolute value of difference between source and
destination. */
movq %rdi, %r9
subq %rsi, %r9
movq %r9, %r8
leaq -1(%r9), %rcx
sarq $63, %r8
xorq %r8, %r9
subq %r8, %r9
/* Don't use non-temporal store if there is overlap between
destination and source since destination may be in cache
when source is loaded. */
leaq (%rdi, %rdx), %r10
cmpq %r10, %rsi
jb L(loop_4x_vec_forward)
L(loop_large_forward):
destination and source since destination may be in cache when
source is loaded. */
cmpq %r9, %rdx
ja L(more_8x_vec_check)
/* Cache align destination. First store the first 64 bytes then
adjust alignments. */
VMOVU (%rsi), %VEC(8)
#if VEC_SIZE < 64
VMOVU VEC_SIZE(%rsi), %VEC(9)
#if VEC_SIZE < 32
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11)
#endif
#endif
VMOVU %VEC(8), (%rdi)
#if VEC_SIZE < 64
VMOVU %VEC(9), VEC_SIZE(%rdi)
#if VEC_SIZE < 32
VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi)
#endif
#endif
/* Adjust source, destination, and size. */
movq %rdi, %r8
andq $63, %r8
/* Get the negative of offset for alignment. */
subq $64, %r8
/* Adjust source. */
subq %r8, %rsi
/* Adjust destination which should be aligned now. */
subq %r8, %rdi
/* Adjust length. */
addq %r8, %rdx
/* Test if source and destination addresses will alias. If they do
the larger pipeline in large_memcpy_4x alleviated the
performance drop. */
testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx
jz L(large_memcpy_4x)
movq %rdx, %r10
shrq $LOG_4X_MEMCPY_THRESH, %r10
cmp __x86_shared_non_temporal_threshold(%rip), %r10
jae L(large_memcpy_4x)
/* edx will store remainder size for copying tail. */
andl $(PAGE_SIZE * 2 - 1), %edx
/* r10 stores outer loop counter. */
shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
/* Copy 4x VEC at a time from 2 pages. */
.p2align 4
L(loop_large_memcpy_2x_outer):
/* ecx stores inner loop counter. */
movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_2x_inner):
PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
/* Load vectors from rsi. */
LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
subq $-LARGE_LOAD_SIZE, %rsi
/* Non-temporal store vectors to rdi. */
STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
subq $-LARGE_LOAD_SIZE, %rdi
decl %ecx
jnz L(loop_large_memcpy_2x_inner)
addq $PAGE_SIZE, %rdi
addq $PAGE_SIZE, %rsi
decq %r10
jne L(loop_large_memcpy_2x_outer)
sfence
/* Check if only last 4 loads are needed. */
cmpl $(VEC_SIZE * 4), %edx
jbe L(large_memcpy_2x_end)
/* Handle the last 2 * PAGE_SIZE bytes. */
L(loop_large_memcpy_2x_tail):
/* Copy 4 * VEC a time forward with non-temporal stores. */
PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
addq $PREFETCHED_LOAD_SIZE, %rsi
subq $PREFETCHED_LOAD_SIZE, %rdx
VMOVNT %VEC(0), (%rdi)
VMOVNT %VEC(1), VEC_SIZE(%rdi)
VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
addq $PREFETCHED_LOAD_SIZE, %rdi
cmpq $PREFETCHED_LOAD_SIZE, %rdx
ja L(loop_large_forward)
sfence
subq $-(VEC_SIZE * 4), %rsi
addl $-(VEC_SIZE * 4), %edx
VMOVA %VEC(0), (%rdi)
VMOVA %VEC(1), VEC_SIZE(%rdi)
VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
subq $-(VEC_SIZE * 4), %rdi
cmpl $(VEC_SIZE * 4), %edx
ja L(loop_large_memcpy_2x_tail)
L(large_memcpy_2x_end):
/* Store the last 4 * VEC. */
VMOVU %VEC(5), (%rcx)
VMOVU %VEC(6), -VEC_SIZE(%rcx)
VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
/* Store the first VEC. */
VMOVU %VEC(4), (%r11)
VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)
VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
VZEROUPPER_RETURN
L(large_backward):
/* Don't use non-temporal store if there is overlap between
destination and source since destination may be in cache
when source is loaded. */
leaq (%rcx, %rdx), %r10
cmpq %r10, %r9
jb L(loop_4x_vec_backward)
L(loop_large_backward):
/* Copy 4 * VEC a time backward with non-temporal stores. */
PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
VMOVU (%rcx), %VEC(0)
VMOVU -VEC_SIZE(%rcx), %VEC(1)
VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
subq $PREFETCHED_LOAD_SIZE, %rcx
subq $PREFETCHED_LOAD_SIZE, %rdx
VMOVNT %VEC(0), (%r9)
VMOVNT %VEC(1), -VEC_SIZE(%r9)
VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
subq $PREFETCHED_LOAD_SIZE, %r9
cmpq $PREFETCHED_LOAD_SIZE, %rdx
ja L(loop_large_backward)
.p2align 4
L(large_memcpy_4x):
movq %rdx, %r10
/* edx will store remainder size for copying tail. */
andl $(PAGE_SIZE * 4 - 1), %edx
/* r10 stores outer loop counter. */
shrq $(LOG_PAGE_SIZE + 2), %r10
/* Copy 4x VEC at a time from 4 pages. */
.p2align 4
L(loop_large_memcpy_4x_outer):
/* ecx stores inner loop counter. */
movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_4x_inner):
/* Only one prefetch set per page as doing 4 pages give more time
for prefetcher to keep up. */
PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
/* Load vectors from rsi. */
LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
subq $-LARGE_LOAD_SIZE, %rsi
/* Non-temporal store vectors to rdi. */
STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
subq $-LARGE_LOAD_SIZE, %rdi
decl %ecx
jnz L(loop_large_memcpy_4x_inner)
addq $(PAGE_SIZE * 3), %rdi
addq $(PAGE_SIZE * 3), %rsi
decq %r10
jne L(loop_large_memcpy_4x_outer)
sfence
/* Store the first 4 * VEC. */
VMOVU %VEC(4), (%rdi)
VMOVU %VEC(5), VEC_SIZE(%rdi)
VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
/* Store the last VEC. */
VMOVU %VEC(8), (%r11)
/* Check if only last 4 loads are needed. */
cmpl $(VEC_SIZE * 4), %edx
jbe L(large_memcpy_4x_end)
/* Handle the last 4 * PAGE_SIZE bytes. */
L(loop_large_memcpy_4x_tail):
/* Copy 4 * VEC a time forward with non-temporal stores. */
PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
subq $-(VEC_SIZE * 4), %rsi
addl $-(VEC_SIZE * 4), %edx
VMOVA %VEC(0), (%rdi)
VMOVA %VEC(1), VEC_SIZE(%rdi)
VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
subq $-(VEC_SIZE * 4), %rdi
cmpl $(VEC_SIZE * 4), %edx
ja L(loop_large_memcpy_4x_tail)
L(large_memcpy_4x_end):
/* Store the last 4 * VEC. */
VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)
VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
VZEROUPPER_RETURN
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))