x86: Reduce code size of mem{move|pcpy|cpy}-ssse3

The goal is to remove most SSSE3 function as SSE4, AVX2, and EVEX are generally preferable. memcpy/memmove is one exception where avoiding unaligned loads with `palignr` is important for some targets. This commit replaces memmove-ssse3 with a better optimized are lower code footprint verion. As well it aliases memcpy to memmove. Aside from this function all other SSSE3 functions should be safe to remove. The performance is not changed drastically although shows overall improvements without any major regressions or gains. bench-memcpy geometric_mean(N=50) New / Original: 0.957 bench-memcpy-random geometric_mean(N=50) New / Original: 0.912 bench-memcpy-large geometric_mean(N=50) New / Original: 0.892 Benchmarks where run on Zhaoxin KX-6840@2000MHz See attached numbers for all results. More important this saves 7246 bytes of code size in memmove an additional 10741 bytes by reusing memmove code for memcpy (total 17987 bytes saves). As well an additional 896 bytes of rodata for the jump table entries.
2024-11-22 13:00:06 +00:00 · 2022-04-14 11:47:40 -05:00 · 2022-04-14 11:47:40 -05:00 · 26b2478322
commit 26b2478322
parent d85916e30a
3 changed files with 380 additions and 3156 deletions
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@ -16,7 +16,6 @@ sysdep_routines += \
  memcmpeq-avx2-rtm \
  memcmpeq-evex \
  memcmpeq-sse2 \
-  memcpy-ssse3 \
  memmove-avx-unaligned-erms \
  memmove-avx-unaligned-erms-rtm \
  memmove-avx512-no-vzeroupper \
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S
@ -1,4 +1,380 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY		__memmove_ssse3
-#define MEMCPY_CHK	__memmove_chk_ssse3
-#include "memcpy-ssse3.S"
+#include <sysdep.h>
+
+#ifndef MEMMOVE
+# define MEMMOVE	__memmove_ssse3
+# define MEMMOVE_CHK	__memmove_chk_ssse3
+# define MEMCPY	__memcpy_ssse3
+# define MEMCPY_CHK	__memcpy_chk_ssse3
+# define MEMPCPY	__mempcpy_ssse3
+# define MEMPCPY_CHK	__mempcpy_chk_ssse3
+#endif
+
+	.section .text.ssse3, "ax", @progbits
+ENTRY(MEMPCPY_CHK)
+	cmp	%RDX_LP, %RCX_LP
+	jb	HIDDEN_JUMPTARGET(__chk_fail)
+END(MEMPCPY_CHK)
+
+ENTRY(MEMPCPY)
+	mov	%RDI_LP, %RAX_LP
+	add	%RDX_LP, %RAX_LP
+	jmp	L(start)
+END(MEMPCPY)
+
+ENTRY(MEMMOVE_CHK)
+	cmp	%RDX_LP, %RCX_LP
+	jb	HIDDEN_JUMPTARGET(__chk_fail)
+END(MEMMOVE_CHK)
+
+ENTRY_P2ALIGN(MEMMOVE, 6)
+	movq	%rdi, %rax
+L(start):
+	cmpq	$16, %rdx
+	jb	L(copy_0_15)
+
+	/* These loads are always useful.  */
+	movups	0(%rsi), %xmm0
+	movups	-16(%rsi, %rdx), %xmm7
+	cmpq	$32, %rdx
+	ja	L(more_2x_vec)
+
+	movups	%xmm0, 0(%rdi)
+	movups	%xmm7, -16(%rdi, %rdx)
+	ret
+
+	.p2align 4,, 4
+L(copy_0_15):
+	cmpl	$4, %edx
+	jb	L(copy_0_3)
+	cmpl	$8, %edx
+	jb	L(copy_4_7)
+	movq	0(%rsi), %rcx
+	movq	-8(%rsi, %rdx), %rsi
+	movq	%rcx, 0(%rdi)
+	movq	%rsi, -8(%rdi, %rdx)
+	ret
+
+	.p2align 4,, 4
+L(copy_4_7):
+	movl	0(%rsi), %ecx
+	movl	-4(%rsi, %rdx), %esi
+	movl	%ecx, 0(%rdi)
+	movl	%esi, -4(%rdi, %rdx)
+	ret
+
+	.p2align 4,, 4
+L(copy_0_3):
+	decl	%edx
+	jl	L(copy_0_0)
+	movb	(%rsi), %cl
+	je	L(copy_1_1)
+
+	movzwl	-1(%rsi, %rdx), %esi
+	movw	%si, -1(%rdi, %rdx)
+L(copy_1_1):
+	movb	%cl, (%rdi)
+L(copy_0_0):
+	ret
+
+	.p2align 4,, 4
+L(copy_4x_vec):
+	movups	16(%rsi), %xmm1
+	movups	-32(%rsi, %rdx), %xmm2
+
+	movups	%xmm0, 0(%rdi)
+	movups	%xmm1, 16(%rdi)
+	movups	%xmm2, -32(%rdi, %rdx)
+	movups	%xmm7, -16(%rdi, %rdx)
+L(nop):
+	ret
+
+	.p2align 4
+L(more_2x_vec):
+	cmpq	$64, %rdx
+	jbe	L(copy_4x_vec)
+
+	/* We use rcx later to get alignr value.  */
+	movq	%rdi, %rcx
+
+	/* Backward copy for overlap + dst > src for memmove safety.  */
+	subq	%rsi, %rcx
+	cmpq	%rdx, %rcx
+	jb	L(copy_backward)
+
+	/* Load tail.  */
+
+	/* -16(%rsi, %rdx) already loaded into xmm7.  */
+	movups	-32(%rsi, %rdx), %xmm8
+	movups	-48(%rsi, %rdx), %xmm9
+
+	/* Get misalignment.  */
+	andl	$0xf, %ecx
+
+	movq	%rsi, %r9
+	addq	%rcx, %rsi
+	andq	$-16, %rsi
+	/* Get first vec for `palignr`.  */
+	movaps	(%rsi), %xmm1
+
+	/* We have loaded (%rsi) so safe to do this store before the
+	   loop.  */
+	movups	%xmm0, (%rdi)
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %RDX_LP
+#else
+	cmp	__x86_shared_cache_size_half(%rip), %rdx
+#endif
+	ja	L(large_memcpy)
+
+	leaq	-64(%rdi, %rdx), %r8
+	andq	$-16, %rdi
+	movl	$48, %edx
+
+	leaq	L(loop_fwd_start)(%rip), %r9
+	sall	$6, %ecx
+	addq	%r9, %rcx
+	jmp	* %rcx
+
+	.p2align 4,, 8
+L(copy_backward):
+	testq	%rcx, %rcx
+	jz	L(nop)
+
+	/* Preload tail.  */
+
+	/* (%rsi) already loaded into xmm0.  */
+	movups	16(%rsi), %xmm4
+	movups	32(%rsi), %xmm5
+
+	movq	%rdi, %r8
+	subq	%rdi, %rsi
+	leaq	-49(%rdi, %rdx), %rdi
+	andq	$-16, %rdi
+	addq	%rdi, %rsi
+	andq	$-16, %rsi
+
+	movaps	48(%rsi), %xmm6
+
+
+	leaq	L(loop_bkwd_start)(%rip), %r9
+	andl	$0xf, %ecx
+	sall	$6, %ecx
+	addq	%r9, %rcx
+	jmp	* %rcx
+
+	.p2align 4,, 8
+L(large_memcpy):
+	movups	-64(%r9, %rdx), %xmm10
+	movups	-80(%r9, %rdx), %xmm11
+
+	sall	$5, %ecx
+	leal	(%rcx, %rcx, 2), %r8d
+	leaq	-96(%rdi, %rdx), %rcx
+	andq	$-16, %rdi
+	leaq	L(large_loop_fwd_start)(%rip), %rdx
+	addq	%r8, %rdx
+	jmp	* %rdx
+
+
+	/* Instead of a typical jump table all 16 loops are exactly
+	   64-bytes in size. So, we can just jump to first loop + r8 *
+	   64. Before modifying any loop ensure all their sizes match!
+	 */
+	.p2align 6
+L(loop_fwd_start):
+L(loop_fwd_0x0):
+	movaps	16(%rsi), %xmm1
+	movaps	32(%rsi), %xmm2
+	movaps	48(%rsi), %xmm3
+	movaps	%xmm1, 16(%rdi)
+	movaps	%xmm2, 32(%rdi)
+	movaps	%xmm3, 48(%rdi)
+	addq	%rdx, %rdi
+	addq	%rdx, %rsi
+	cmpq	%rdi, %r8
+	ja	L(loop_fwd_0x0)
+L(end_loop_fwd):
+	movups	%xmm9, 16(%r8)
+	movups	%xmm8, 32(%r8)
+	movups	%xmm7, 48(%r8)
+	ret
+
+	/* Extactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding.
+	   60 bytes otherwise.  */
+#define ALIGNED_LOOP_FWD(align_by);	\
+	.p2align 6;	\
+L(loop_fwd_ ## align_by):	\
+	movaps	16(%rsi), %xmm0;	\
+	movaps	32(%rsi), %xmm2;	\
+	movaps	48(%rsi), %xmm3;	\
+	movaps	%xmm3, %xmm4;	\
+	palignr	$align_by, %xmm2, %xmm3;	\
+	palignr	$align_by, %xmm0, %xmm2;	\
+	palignr	$align_by, %xmm1, %xmm0;	\
+	movaps	%xmm4, %xmm1;	\
+	movaps	%xmm0, 16(%rdi);	\
+	movaps	%xmm2, 32(%rdi);	\
+	movaps	%xmm3, 48(%rdi);	\
+	addq	%rdx, %rdi;	\
+	addq	%rdx, %rsi;	\
+	cmpq	%rdi, %r8;	\
+	ja	L(loop_fwd_ ## align_by);	\
+	jmp	L(end_loop_fwd);
+
+	/* Must be in descending order.  */
+	ALIGNED_LOOP_FWD (0xf)
+	ALIGNED_LOOP_FWD (0xe)
+	ALIGNED_LOOP_FWD (0xd)
+	ALIGNED_LOOP_FWD (0xc)
+	ALIGNED_LOOP_FWD (0xb)
+	ALIGNED_LOOP_FWD (0xa)
+	ALIGNED_LOOP_FWD (0x9)
+	ALIGNED_LOOP_FWD (0x8)
+	ALIGNED_LOOP_FWD (0x7)
+	ALIGNED_LOOP_FWD (0x6)
+	ALIGNED_LOOP_FWD (0x5)
+	ALIGNED_LOOP_FWD (0x4)
+	ALIGNED_LOOP_FWD (0x3)
+	ALIGNED_LOOP_FWD (0x2)
+	ALIGNED_LOOP_FWD (0x1)
+
+	.p2align 6
+L(large_loop_fwd_start):
+L(large_loop_fwd_0x0):
+	movaps	16(%rsi), %xmm1
+	movaps	32(%rsi), %xmm2
+	movaps	48(%rsi), %xmm3
+	movaps	64(%rsi), %xmm4
+	movaps	80(%rsi), %xmm5
+	movntps	%xmm1, 16(%rdi)
+	movntps	%xmm2, 32(%rdi)
+	movntps	%xmm3, 48(%rdi)
+	movntps	%xmm4, 64(%rdi)
+	movntps	%xmm5, 80(%rdi)
+	addq	$80, %rdi
+	addq	$80, %rsi
+	cmpq	%rdi, %rcx
+	ja	L(large_loop_fwd_0x0)
+
+	/* Ensure no icache line split on tail.  */
+	.p2align 4
+L(end_large_loop_fwd):
+	sfence
+	movups	%xmm11, 16(%rcx)
+	movups	%xmm10, 32(%rcx)
+	movups	%xmm9, 48(%rcx)
+	movups	%xmm8, 64(%rcx)
+	movups	%xmm7, 80(%rcx)
+	ret
+
+
+	/* Size > 64 bytes and <= 96 bytes. 32-byte align between ensure
+	   96-byte spacing between each.  */
+#define ALIGNED_LARGE_LOOP_FWD(align_by);	\
+	.p2align 5;	\
+L(large_loop_fwd_ ## align_by):	\
+	movaps	16(%rsi), %xmm0;	\
+	movaps	32(%rsi), %xmm2;	\
+	movaps	48(%rsi), %xmm3;	\
+	movaps	64(%rsi), %xmm4;	\
+	movaps	80(%rsi), %xmm5;	\
+	movaps	%xmm5, %xmm6;	\
+	palignr	$align_by, %xmm4, %xmm5;	\
+	palignr	$align_by, %xmm3, %xmm4;	\
+	palignr	$align_by, %xmm2, %xmm3;	\
+	palignr	$align_by, %xmm0, %xmm2;	\
+	palignr	$align_by, %xmm1, %xmm0;	\
+	movaps	%xmm6, %xmm1;	\
+	movntps	%xmm0, 16(%rdi);	\
+	movntps	%xmm2, 32(%rdi);	\
+	movntps	%xmm3, 48(%rdi);	\
+	movntps	%xmm4, 64(%rdi);	\
+	movntps	%xmm5, 80(%rdi);	\
+	addq	$80, %rdi;	\
+	addq	$80, %rsi;	\
+	cmpq	%rdi, %rcx;	\
+	ja	L(large_loop_fwd_ ## align_by);	\
+	jmp	L(end_large_loop_fwd);
+
+	/* Must be in descending order.  */
+	ALIGNED_LARGE_LOOP_FWD (0xf)
+	ALIGNED_LARGE_LOOP_FWD (0xe)
+	ALIGNED_LARGE_LOOP_FWD (0xd)
+	ALIGNED_LARGE_LOOP_FWD (0xc)
+	ALIGNED_LARGE_LOOP_FWD (0xb)
+	ALIGNED_LARGE_LOOP_FWD (0xa)
+	ALIGNED_LARGE_LOOP_FWD (0x9)
+	ALIGNED_LARGE_LOOP_FWD (0x8)
+	ALIGNED_LARGE_LOOP_FWD (0x7)
+	ALIGNED_LARGE_LOOP_FWD (0x6)
+	ALIGNED_LARGE_LOOP_FWD (0x5)
+	ALIGNED_LARGE_LOOP_FWD (0x4)
+	ALIGNED_LARGE_LOOP_FWD (0x3)
+	ALIGNED_LARGE_LOOP_FWD (0x2)
+	ALIGNED_LARGE_LOOP_FWD (0x1)
+
+
+	.p2align 6
+L(loop_bkwd_start):
+L(loop_bkwd_0x0):
+	movaps	32(%rsi), %xmm1
+	movaps	16(%rsi), %xmm2
+	movaps	0(%rsi), %xmm3
+	movaps	%xmm1, 32(%rdi)
+	movaps	%xmm2, 16(%rdi)
+	movaps	%xmm3, 0(%rdi)
+	subq	$48, %rdi
+	subq	$48, %rsi
+	cmpq	%rdi, %r8
+	jb	L(loop_bkwd_0x0)
+L(end_loop_bkwd):
+	movups	%xmm7, -16(%r8, %rdx)
+	movups	%xmm0, 0(%r8)
+	movups	%xmm4, 16(%r8)
+	movups	%xmm5, 32(%r8)
+
+	ret
+
+
+	/* Extactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding.
+	   60 bytes otherwise.  */
+#define ALIGNED_LOOP_BKWD(align_by);	\
+	.p2align 6;	\
+L(loop_bkwd_ ## align_by):	\
+	movaps	32(%rsi), %xmm1;	\
+	movaps	16(%rsi), %xmm2;	\
+	movaps	0(%rsi), %xmm3;	\
+	palignr	$align_by, %xmm1, %xmm6;	\
+	palignr	$align_by, %xmm2, %xmm1;	\
+	palignr	$align_by, %xmm3, %xmm2;	\
+	movaps	%xmm6, 32(%rdi);	\
+	movaps	%xmm1, 16(%rdi);	\
+	movaps	%xmm2, 0(%rdi);	\
+	subq	$48, %rdi;	\
+	subq	$48, %rsi;	\
+	movaps	%xmm3, %xmm6;	\
+	cmpq	%rdi, %r8;	\
+	jb	L(loop_bkwd_ ## align_by);	\
+	jmp	L(end_loop_bkwd);
+
+	/* Must be in descending order.  */
+	ALIGNED_LOOP_BKWD (0xf)
+	ALIGNED_LOOP_BKWD (0xe)
+	ALIGNED_LOOP_BKWD (0xd)
+	ALIGNED_LOOP_BKWD (0xc)
+	ALIGNED_LOOP_BKWD (0xb)
+	ALIGNED_LOOP_BKWD (0xa)
+	ALIGNED_LOOP_BKWD (0x9)
+	ALIGNED_LOOP_BKWD (0x8)
+	ALIGNED_LOOP_BKWD (0x7)
+	ALIGNED_LOOP_BKWD (0x6)
+	ALIGNED_LOOP_BKWD (0x5)
+	ALIGNED_LOOP_BKWD (0x4)
+	ALIGNED_LOOP_BKWD (0x3)
+	ALIGNED_LOOP_BKWD (0x2)
+	ALIGNED_LOOP_BKWD (0x1)
+END(MEMMOVE)
+
+strong_alias (MEMMOVE, MEMCPY)
+strong_alias (MEMMOVE_CHK, MEMCPY_CHK)