x86: Reduce code size of mem{move|pcpy|cpy}-ssse3
The goal is to remove most SSSE3 functions as SSE4, AVX2, and EVEX are generally preferable. memcpy/memmove is one exception where avoiding unaligned loads with `palignr` is important for some targets.

This commit replaces memmove-ssse3 with a better optimized, lower code footprint version, and aliases memcpy to memmove. Aside from this function, all other SSSE3 functions should be safe to remove.

Performance is not changed drastically; it shows overall improvement without any major regressions or gains.

bench-memcpy geometric_mean(N=50) New / Original: 0.957
bench-memcpy-random geometric_mean(N=50) New / Original: 0.912
bench-memcpy-large geometric_mean(N=50) New / Original: 0.892

Benchmarks were run on Zhaoxin KX-6840@2000MHz. See attached numbers for all results.

More importantly, this saves 7246 bytes of code size in memmove and an additional 10741 bytes by reusing the memmove code for memcpy (17987 bytes saved in total), plus another 896 bytes of rodata for the jump table entries.
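Illustrative sketch (not part of the commit): the reason `palignr` keeps an SSSE3 memmove worth having is that it lets the copy loops issue only aligned loads and reconstruct the unaligned source data in registers. Below is a minimal C/SSSE3 intrinsics sketch of that recombination, assuming a fixed misalignment of 5 bytes; the function and parameter names are made up for the example. The assembly instead generates one 64-byte loop per possible shift and jumps to loop_start + shift * 64, which is what removes the 896 bytes of jump-table rodata.

    /* Sketch only: reconstruct 16 unaligned source bytes from two
       aligned 16-byte loads, as the palignr loops do.  The shift (5
       here) must be a compile-time constant.  Build with -mssse3.  */
    #include <tmmintrin.h>

    void
    copy16_misaligned_by_5 (char *dst, const char *src_aligned_block)
    {
      /* src_aligned_block is 16-byte aligned; the bytes we want start
         5 bytes into it.  */
      __m128i lo = _mm_load_si128 ((const __m128i *) src_aligned_block);
      __m128i hi = _mm_load_si128 ((const __m128i *) src_aligned_block + 1);
      /* Concatenate hi:lo and shift right by 5 bytes, yielding source
         bytes 5..20 without any unaligned load.  */
      _mm_storeu_si128 ((__m128i *) dst, _mm_alignr_epi8 (hi, lo, 5));
    }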
This commit is contained in:
parent  d85916e30a
commit  26b2478322
@@ -16,7 +16,6 @@ sysdep_routines += \
  memcmpeq-avx2-rtm \
  memcmpeq-evex \
  memcmpeq-sse2 \
- memcpy-ssse3 \
  memmove-avx-unaligned-erms \
  memmove-avx-unaligned-erms-rtm \
  memmove-avx512-no-vzeroupper \

File diff suppressed because it is too large
@@ -1,4 +1,380 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY __memmove_ssse3
-#define MEMCPY_CHK __memmove_chk_ssse3
-#include "memcpy-ssse3.S"
#include <sysdep.h>

#ifndef MEMMOVE
# define MEMMOVE __memmove_ssse3
# define MEMMOVE_CHK __memmove_chk_ssse3
# define MEMCPY __memcpy_ssse3
# define MEMCPY_CHK __memcpy_chk_ssse3
# define MEMPCPY __mempcpy_ssse3
# define MEMPCPY_CHK __mempcpy_chk_ssse3
#endif

        .section .text.ssse3, "ax", @progbits
ENTRY(MEMPCPY_CHK)
        cmp %RDX_LP, %RCX_LP
        jb HIDDEN_JUMPTARGET(__chk_fail)
END(MEMPCPY_CHK)

ENTRY(MEMPCPY)
        mov %RDI_LP, %RAX_LP
        add %RDX_LP, %RAX_LP
        jmp L(start)
END(MEMPCPY)

ENTRY(MEMMOVE_CHK)
        cmp %RDX_LP, %RCX_LP
        jb HIDDEN_JUMPTARGET(__chk_fail)
END(MEMMOVE_CHK)

ENTRY_P2ALIGN(MEMMOVE, 6)
        movq %rdi, %rax
L(start):
        cmpq $16, %rdx
        jb L(copy_0_15)

        /* These loads are always useful.  */
        movups 0(%rsi), %xmm0
        movups -16(%rsi, %rdx), %xmm7
        cmpq $32, %rdx
        ja L(more_2x_vec)

        movups %xmm0, 0(%rdi)
        movups %xmm7, -16(%rdi, %rdx)
        ret

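        /* Lengths below 16: copy the first and last 8 (or 4) bytes so
           the two stores cover the whole range; they may overlap,
           which is harmless.  Lengths 0-3 get their own path.  */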
        .p2align 4,, 4
L(copy_0_15):
        cmpl $4, %edx
        jb L(copy_0_3)
        cmpl $8, %edx
        jb L(copy_4_7)
        movq 0(%rsi), %rcx
        movq -8(%rsi, %rdx), %rsi
        movq %rcx, 0(%rdi)
        movq %rsi, -8(%rdi, %rdx)
        ret

        .p2align 4,, 4
L(copy_4_7):
        movl 0(%rsi), %ecx
        movl -4(%rsi, %rdx), %esi
        movl %ecx, 0(%rdi)
        movl %esi, -4(%rdi, %rdx)
        ret

        .p2align 4,, 4
L(copy_0_3):
        decl %edx
        jl L(copy_0_0)
        movb (%rsi), %cl
        je L(copy_1_1)

        movzwl -1(%rsi, %rdx), %esi
        movw %si, -1(%rdi, %rdx)
L(copy_1_1):
        movb %cl, (%rdi)
L(copy_0_0):
        ret

        .p2align 4,, 4
L(copy_4x_vec):
        movups 16(%rsi), %xmm1
        movups -32(%rsi, %rdx), %xmm2

        movups %xmm0, 0(%rdi)
        movups %xmm1, 16(%rdi)
        movups %xmm2, -32(%rdi, %rdx)
        movups %xmm7, -16(%rdi, %rdx)
L(nop):
        ret

        .p2align 4
L(more_2x_vec):
        cmpq $64, %rdx
        jbe L(copy_4x_vec)

        /* We use rcx later to get alignr value.  */
        movq %rdi, %rcx

        /* Backward copy for overlap + dst > src for memmove safety.  */
        subq %rsi, %rcx
        cmpq %rdx, %rcx
        jb L(copy_backward)

        /* Load tail.  */

        /* -16(%rsi, %rdx) already loaded into xmm7.  */
        movups -32(%rsi, %rdx), %xmm8
        movups -48(%rsi, %rdx), %xmm9

        /* Get misalignment.  */
        andl $0xf, %ecx

        movq %rsi, %r9
        addq %rcx, %rsi
        andq $-16, %rsi
        /* Get first vec for `palignr`.  */
        movaps (%rsi), %xmm1

        /* We have loaded (%rsi) so safe to do this store before the
           loop.  */
        movups %xmm0, (%rdi)

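        /* Copies larger than half the shared cache size use the
           non-temporal store loops below to limit cache pollution.  */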
#ifdef SHARED_CACHE_SIZE_HALF
        cmp $SHARED_CACHE_SIZE_HALF, %RDX_LP
#else
        cmp __x86_shared_cache_size_half(%rip), %rdx
#endif
        ja L(large_memcpy)

        leaq -64(%rdi, %rdx), %r8
        andq $-16, %rdi
        movl $48, %edx

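        /* Dispatch on the relative misalignment in %ecx: each forward
           loop variant below is exactly 64 bytes of code, so the
           target is L(loop_fwd_start) + misalignment * 64 and no jump
           table is needed.  */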
        leaq L(loop_fwd_start)(%rip), %r9
        sall $6, %ecx
        addq %r9, %rcx
        jmp * %rcx

        .p2align 4,, 8
L(copy_backward):
        testq %rcx, %rcx
        jz L(nop)

        /* Preload tail.  */

        /* (%rsi) already loaded into xmm0.  */
        movups 16(%rsi), %xmm4
        movups 32(%rsi), %xmm5

        movq %rdi, %r8
        subq %rdi, %rsi
        leaq -49(%rdi, %rdx), %rdi
        andq $-16, %rdi
        addq %rdi, %rsi
        andq $-16, %rsi

        movaps 48(%rsi), %xmm6


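        /* Same scheme as the forward path: every backward loop is 64
           bytes, so jump to L(loop_bkwd_start) + misalignment * 64.  */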
        leaq L(loop_bkwd_start)(%rip), %r9
        andl $0xf, %ecx
        sall $6, %ecx
        addq %r9, %rcx
        jmp * %rcx

        .p2align 4,, 8
L(large_memcpy):
        movups -64(%r9, %rdx), %xmm10
        movups -80(%r9, %rdx), %xmm11

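        /* The non-temporal loop variants are spaced 96 bytes apart, so
           the offset is misalignment * 96, computed below as
           (%ecx << 5) * 3.  */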
        sall $5, %ecx
        leal (%rcx, %rcx, 2), %r8d
        leaq -96(%rdi, %rdx), %rcx
        andq $-16, %rdi
        leaq L(large_loop_fwd_start)(%rip), %rdx
        addq %r8, %rdx
        jmp * %rdx


        /* Instead of a typical jump table, all 16 forward loops are
           exactly 64 bytes in size, so we can just jump to the first
           loop + misalignment * 64.  Before modifying any loop ensure
           all their sizes match!  */
        .p2align 6
L(loop_fwd_start):
L(loop_fwd_0x0):
        movaps 16(%rsi), %xmm1
        movaps 32(%rsi), %xmm2
        movaps 48(%rsi), %xmm3
        movaps %xmm1, 16(%rdi)
        movaps %xmm2, 32(%rdi)
        movaps %xmm3, 48(%rdi)
        addq %rdx, %rdi
        addq %rdx, %rsi
        cmpq %rdi, %r8
        ja L(loop_fwd_0x0)
L(end_loop_fwd):
        movups %xmm9, 16(%r8)
        movups %xmm8, 32(%r8)
        movups %xmm7, 48(%r8)
        ret

        /* Exactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding;
           60 bytes otherwise.  */
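        /* Each iteration loads three aligned vectors and splices
           adjacent pairs with `palignr` so the data stored to the
           aligned destination matches the original unaligned source;
           %xmm1 carries the last vector into the next iteration.  */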
#define ALIGNED_LOOP_FWD(align_by); \
        .p2align 6; \
L(loop_fwd_ ## align_by): \
        movaps 16(%rsi), %xmm0; \
        movaps 32(%rsi), %xmm2; \
        movaps 48(%rsi), %xmm3; \
        movaps %xmm3, %xmm4; \
        palignr $align_by, %xmm2, %xmm3; \
        palignr $align_by, %xmm0, %xmm2; \
        palignr $align_by, %xmm1, %xmm0; \
        movaps %xmm4, %xmm1; \
        movaps %xmm0, 16(%rdi); \
        movaps %xmm2, 32(%rdi); \
        movaps %xmm3, 48(%rdi); \
        addq %rdx, %rdi; \
        addq %rdx, %rsi; \
        cmpq %rdi, %r8; \
        ja L(loop_fwd_ ## align_by); \
        jmp L(end_loop_fwd);

        /* Must be in descending order.  */
        ALIGNED_LOOP_FWD (0xf)
        ALIGNED_LOOP_FWD (0xe)
        ALIGNED_LOOP_FWD (0xd)
        ALIGNED_LOOP_FWD (0xc)
        ALIGNED_LOOP_FWD (0xb)
        ALIGNED_LOOP_FWD (0xa)
        ALIGNED_LOOP_FWD (0x9)
        ALIGNED_LOOP_FWD (0x8)
        ALIGNED_LOOP_FWD (0x7)
        ALIGNED_LOOP_FWD (0x6)
        ALIGNED_LOOP_FWD (0x5)
        ALIGNED_LOOP_FWD (0x4)
        ALIGNED_LOOP_FWD (0x3)
        ALIGNED_LOOP_FWD (0x2)
        ALIGNED_LOOP_FWD (0x1)

        .p2align 6
L(large_loop_fwd_start):
L(large_loop_fwd_0x0):
        movaps 16(%rsi), %xmm1
        movaps 32(%rsi), %xmm2
        movaps 48(%rsi), %xmm3
        movaps 64(%rsi), %xmm4
        movaps 80(%rsi), %xmm5
        movntps %xmm1, 16(%rdi)
        movntps %xmm2, 32(%rdi)
        movntps %xmm3, 48(%rdi)
        movntps %xmm4, 64(%rdi)
        movntps %xmm5, 80(%rdi)
        addq $80, %rdi
        addq $80, %rsi
        cmpq %rdi, %rcx
        ja L(large_loop_fwd_0x0)

        /* Ensure no icache line split on tail.  */
        .p2align 4
L(end_large_loop_fwd):
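        /* Make the preceding non-temporal stores visible before the
           ordinary stores on the tail.  */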
        sfence
        movups %xmm11, 16(%rcx)
        movups %xmm10, 32(%rcx)
        movups %xmm9, 48(%rcx)
        movups %xmm8, 64(%rcx)
        movups %xmm7, 80(%rcx)
        ret


        /* Each of these loops is > 64 bytes and <= 96 bytes of code;
           32-byte alignment therefore gives exactly 96-byte spacing
           between them.  */
#define ALIGNED_LARGE_LOOP_FWD(align_by); \
        .p2align 5; \
L(large_loop_fwd_ ## align_by): \
        movaps 16(%rsi), %xmm0; \
        movaps 32(%rsi), %xmm2; \
        movaps 48(%rsi), %xmm3; \
        movaps 64(%rsi), %xmm4; \
        movaps 80(%rsi), %xmm5; \
        movaps %xmm5, %xmm6; \
        palignr $align_by, %xmm4, %xmm5; \
        palignr $align_by, %xmm3, %xmm4; \
        palignr $align_by, %xmm2, %xmm3; \
        palignr $align_by, %xmm0, %xmm2; \
        palignr $align_by, %xmm1, %xmm0; \
        movaps %xmm6, %xmm1; \
        movntps %xmm0, 16(%rdi); \
        movntps %xmm2, 32(%rdi); \
        movntps %xmm3, 48(%rdi); \
        movntps %xmm4, 64(%rdi); \
        movntps %xmm5, 80(%rdi); \
        addq $80, %rdi; \
        addq $80, %rsi; \
        cmpq %rdi, %rcx; \
        ja L(large_loop_fwd_ ## align_by); \
        jmp L(end_large_loop_fwd);

        /* Must be in descending order.  */
        ALIGNED_LARGE_LOOP_FWD (0xf)
        ALIGNED_LARGE_LOOP_FWD (0xe)
        ALIGNED_LARGE_LOOP_FWD (0xd)
        ALIGNED_LARGE_LOOP_FWD (0xc)
        ALIGNED_LARGE_LOOP_FWD (0xb)
        ALIGNED_LARGE_LOOP_FWD (0xa)
        ALIGNED_LARGE_LOOP_FWD (0x9)
        ALIGNED_LARGE_LOOP_FWD (0x8)
        ALIGNED_LARGE_LOOP_FWD (0x7)
        ALIGNED_LARGE_LOOP_FWD (0x6)
        ALIGNED_LARGE_LOOP_FWD (0x5)
        ALIGNED_LARGE_LOOP_FWD (0x4)
        ALIGNED_LARGE_LOOP_FWD (0x3)
        ALIGNED_LARGE_LOOP_FWD (0x2)
        ALIGNED_LARGE_LOOP_FWD (0x1)


        .p2align 6
L(loop_bkwd_start):
L(loop_bkwd_0x0):
        movaps 32(%rsi), %xmm1
        movaps 16(%rsi), %xmm2
        movaps 0(%rsi), %xmm3
        movaps %xmm1, 32(%rdi)
        movaps %xmm2, 16(%rdi)
        movaps %xmm3, 0(%rdi)
        subq $48, %rdi
        subq $48, %rsi
        cmpq %rdi, %r8
        jb L(loop_bkwd_0x0)
L(end_loop_bkwd):
        movups %xmm7, -16(%r8, %rdx)
        movups %xmm0, 0(%r8)
        movups %xmm4, 16(%r8)
        movups %xmm5, 32(%r8)

        ret


        /* Exactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding;
           60 bytes otherwise.  */
#define ALIGNED_LOOP_BKWD(align_by); \
        .p2align 6; \
L(loop_bkwd_ ## align_by): \
        movaps 32(%rsi), %xmm1; \
        movaps 16(%rsi), %xmm2; \
        movaps 0(%rsi), %xmm3; \
        palignr $align_by, %xmm1, %xmm6; \
        palignr $align_by, %xmm2, %xmm1; \
        palignr $align_by, %xmm3, %xmm2; \
        movaps %xmm6, 32(%rdi); \
        movaps %xmm1, 16(%rdi); \
        movaps %xmm2, 0(%rdi); \
        subq $48, %rdi; \
        subq $48, %rsi; \
        movaps %xmm3, %xmm6; \
        cmpq %rdi, %r8; \
        jb L(loop_bkwd_ ## align_by); \
        jmp L(end_loop_bkwd);

        /* Must be in descending order.  */
        ALIGNED_LOOP_BKWD (0xf)
        ALIGNED_LOOP_BKWD (0xe)
        ALIGNED_LOOP_BKWD (0xd)
        ALIGNED_LOOP_BKWD (0xc)
        ALIGNED_LOOP_BKWD (0xb)
        ALIGNED_LOOP_BKWD (0xa)
        ALIGNED_LOOP_BKWD (0x9)
        ALIGNED_LOOP_BKWD (0x8)
        ALIGNED_LOOP_BKWD (0x7)
        ALIGNED_LOOP_BKWD (0x6)
        ALIGNED_LOOP_BKWD (0x5)
        ALIGNED_LOOP_BKWD (0x4)
        ALIGNED_LOOP_BKWD (0x3)
        ALIGNED_LOOP_BKWD (0x2)
        ALIGNED_LOOP_BKWD (0x1)
END(MEMMOVE)

strong_alias (MEMMOVE, MEMCPY)
strong_alias (MEMMOVE_CHK, MEMCPY_CHK)