x86: Improve vec generation in memset-vec-unaligned-erms.S

No bug.

Split vec generation into multiple steps. This allows the
broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
case. This saves an expensive lane-cross instruction and removes
the need for 'vzeroupper'.

For SSE2 replace the 2x 'punpck' + 'pshufd' sequence with a zero-idiom
'pxor' followed by 'pshufb' for the byte broadcast (note that 'pshufb'
is an SSSE3 instruction).

Results for memset-avx2 small (geomean of N = 20 benchset runs).

size, Old Time, New Time, New / Old
   0,    4.100,    3.831,     0.934
   1,    5.074,    4.399,     0.867
   2,    4.433,    4.411,     0.995
   4,    4.487,    4.415,     0.984
   8,    4.454,    4.396,     0.987
  16,    4.502,    4.443,     0.987

All relevant string/wcsmbs tests are passing.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit b62ace2740)
This commit is contained in:
Noah Goldstein 2022-02-06 00:54:18 -06:00 committed by Sunil K Pandey
parent 36766c02af
commit 58596411ad
5 changed files with 152 additions and 87 deletions

View File

@ -28,17 +28,22 @@
#define VMOVU movups
#define VMOVA movaps
#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
movq r, %rax; \
punpcklbw %xmm0, %xmm0; \
punpcklwd %xmm0, %xmm0; \
pshufd $0, %xmm0, %xmm0
pxor %xmm1, %xmm1; \
pshufb %xmm1, %xmm0; \
movq r, %rax
#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
movq r, %rax; \
pshufd $0, %xmm0, %xmm0
pshufd $0, %xmm0, %xmm0; \
movq r, %rax
# define MEMSET_VDUP_TO_VEC0_HIGH()
# define MEMSET_VDUP_TO_VEC0_LOW()
# define WMEMSET_VDUP_TO_VEC0_HIGH()
# define WMEMSET_VDUP_TO_VEC0_LOW()
#define SECTION(p) p

View File

@ -10,15 +10,18 @@
# define VMOVU vmovdqu
# define VMOVA vmovdqa
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
vmovd d, %xmm0; \
movq r, %rax; \
vpbroadcastb %xmm0, %ymm0
movq r, %rax;
# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
vmovd d, %xmm0; \
movq r, %rax; \
vpbroadcastd %xmm0, %ymm0
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
# ifndef SECTION
# define SECTION(p) p##.avx
@ -30,5 +33,6 @@
# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
# endif
# define USE_XMM_LESS_VEC
# include "memset-vec-unaligned-erms.S"
#endif

View File

@ -15,13 +15,19 @@
# define VZEROUPPER
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
movq r, %rax; \
vpbroadcastb d, %VEC0
# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
vpbroadcastb d, %VEC0; \
movq r, %rax
# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
movq r, %rax; \
vpbroadcastd d, %VEC0
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
vpbroadcastd d, %VEC0; \
movq r, %rax
# define MEMSET_VDUP_TO_VEC0_HIGH()
# define MEMSET_VDUP_TO_VEC0_LOW()
# define WMEMSET_VDUP_TO_VEC0_HIGH()
# define WMEMSET_VDUP_TO_VEC0_LOW()
# define SECTION(p) p##.evex512
# define MEMSET_SYMBOL(p,s) p##_avx512_##s

View File

@ -15,13 +15,19 @@
# define VZEROUPPER
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
movq r, %rax; \
vpbroadcastb d, %VEC0
# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
vpbroadcastb d, %VEC0; \
movq r, %rax
# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
movq r, %rax; \
vpbroadcastd d, %VEC0
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
vpbroadcastd d, %VEC0; \
movq r, %rax
# define MEMSET_VDUP_TO_VEC0_HIGH()
# define MEMSET_VDUP_TO_VEC0_LOW()
# define WMEMSET_VDUP_TO_VEC0_HIGH()
# define WMEMSET_VDUP_TO_VEC0_LOW()
# define SECTION(p) p##.evex
# define MEMSET_SYMBOL(p,s) p##_evex_##s

View File

@ -58,8 +58,10 @@
#ifndef MOVQ
# if VEC_SIZE > 16
# define MOVQ vmovq
# define MOVD vmovd
# else
# define MOVQ movq
# define MOVD movd
# endif
#endif
@ -72,9 +74,17 @@
#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
# define END_REG rcx
# define LOOP_REG rdi
# define LESS_VEC_REG rax
#else
# define END_REG rdi
# define LOOP_REG rdx
# define LESS_VEC_REG rdi
#endif
#ifdef USE_XMM_LESS_VEC
# define XMM_SMALL 1
#else
# define XMM_SMALL 0
#endif
#define PAGE_SIZE 4096
@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
shl $2, %RDX_LP
WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
jmp L(entry_from_bzero)
WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
WMEMSET_VDUP_TO_VEC0_LOW()
cmpq $VEC_SIZE, %rdx
jb L(less_vec_no_vdup)
WMEMSET_VDUP_TO_VEC0_HIGH()
jmp L(entry_from_wmemset)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif
@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
mov %edx, %edx
@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
L(entry_from_bzero):
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
MEMSET_VDUP_TO_VEC0_HIGH()
L(entry_from_wmemset):
cmpq $(VEC_SIZE * 2), %rdx
ja L(more_2x_vec)
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif
ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
mov %edx, %edx
# endif
cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
MEMSET_VDUP_TO_VEC0_HIGH ()
cmp $(VEC_SIZE * 2), %RDX_LP
ja L(stosb_more_2x_vec)
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
*/
VMOVU %VEC(0), (%rax)
VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
VZEROUPPER_RETURN
#endif
.p2align 4,, 10
.p2align 4,, 4
L(last_2x_vec):
#ifdef USE_LESS_VEC_MASK_STORE
VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
#else
VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)
VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)
@ -212,6 +228,7 @@ L(last_2x_vec):
#ifdef USE_LESS_VEC_MASK_STORE
.p2align 4,, 10
L(less_vec):
L(less_vec_no_vdup):
/* Less than 1 VEC. */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
and (4x, 8x] jump to target. */
L(more_2x_vec):
/* Two different methods of setting up pointers / compare. The
two methods are based on the fact that EVEX/AVX512 mov
instructions take more bytes than AVX2/SSE2 mov instructions, and
that EVEX/AVX512 machines also have fast LEA_BID. Both methods
set up END_REG to avoid complex address mode. For EVEX/AVX512
this saves code size and keeps a few targets in one fetch block.
For AVX2/SSE2 this helps prevent AGU bottlenecks. */
#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
LOOP_4X_OFFSET) with LEA_BID. */
/* END_REG is rcx for EVEX/AVX512. */
leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
#endif
/* Stores to first 2x VEC before cmp as any path forward will
require it. */
VMOVU %VEC(0), (%rax)
VMOVU %VEC(0), VEC_SIZE(%rax)
/* Store next 2x vec regardless. */
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi)
/* Two different methods of setting up pointers / compare. The two
methods are based on the fact that EVEX/AVX512 mov instructions take
more bytes than AVX2/SSE2 mov instructions, and that EVEX/AVX512
machines also have fast LEA_BID. Both methods set up END_REG to avoid
complex address mode. For EVEX/AVX512 this saves code size and keeps a
few targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
bottlenecks. */
#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
/* If AVX2/SSE2 compute END_REG (rdi) with ALU. */
addq %rdx, %END_REG
@ -292,6 +299,15 @@ L(more_2x_vec):
cmpq $(VEC_SIZE * 4), %rdx
jbe L(last_2x_vec)
#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
LEA_BID. */
/* END_REG is rcx for EVEX/AVX512. */
leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
#endif
/* Store next 2x vec regardless. */
VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)
VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
@ -355,65 +371,93 @@ L(stosb_local):
/* Define L(less_vec) only if not otherwise defined. */
.p2align 4
L(less_vec):
/* Broadcast esi to partial register (i.e. for VEC_SIZE == 32, broadcast
to xmm). This only does anything for AVX2. */
MEMSET_VDUP_TO_VEC0_LOW ()
L(less_vec_no_vdup):
#endif
L(cross_page):
#if VEC_SIZE > 32
cmpl $32, %edx
jae L(between_32_63)
jge L(between_32_63)
#endif
#if VEC_SIZE > 16
cmpl $16, %edx
jae L(between_16_31)
jge L(between_16_31)
#endif
#ifndef USE_XMM_LESS_VEC
MOVQ %XMM0, %rcx
#endif
MOVQ %XMM0, %rdi
cmpl $8, %edx
jae L(between_8_15)
jge L(between_8_15)
cmpl $4, %edx
jae L(between_4_7)
jge L(between_4_7)
cmpl $1, %edx
ja L(between_2_3)
jb L(return)
movb %sil, (%rax)
VZEROUPPER_RETURN
jg L(between_2_3)
jl L(between_0_0)
movb %sil, (%LESS_VEC_REG)
L(between_0_0):
ret
/* Align small targets only if not doing so would cross a fetch
line. */
/* Align small targets only if not doing so would cross a fetch line.
*/
#if VEC_SIZE > 32
.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
/* From 32 to 63. No branch when size == 32. */
L(between_32_63):
VMOVU %YMM0, (%rax)
VMOVU %YMM0, -32(%rax, %rdx)
VMOVU %YMM0, (%LESS_VEC_REG)
VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx)
VZEROUPPER_RETURN
#endif
#if VEC_SIZE >= 32
.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
L(between_16_31):
/* From 16 to 31. No branch when size == 16. */
VMOVU %XMM0, (%rax)
VMOVU %XMM0, -16(%rax, %rdx)
VZEROUPPER_RETURN
VMOVU %XMM0, (%LESS_VEC_REG)
VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx)
ret
#endif
.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
/* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
*/
.p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
L(between_8_15):
/* From 8 to 15. No branch when size == 8. */
movq %rdi, (%rax)
movq %rdi, -8(%rax, %rdx)
VZEROUPPER_RETURN
#ifdef USE_XMM_LESS_VEC
MOVQ %XMM0, (%rdi)
MOVQ %XMM0, -8(%rdi, %rdx)
#else
movq %rcx, (%LESS_VEC_REG)
movq %rcx, -8(%LESS_VEC_REG, %rdx)
#endif
ret
.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
/* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
*/
.p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
L(between_4_7):
/* From 4 to 7. No branch when size == 4. */
movl %edi, (%rax)
movl %edi, -4(%rax, %rdx)
VZEROUPPER_RETURN
#ifdef USE_XMM_LESS_VEC
MOVD %XMM0, (%rdi)
MOVD %XMM0, -4(%rdi, %rdx)
#else
movl %ecx, (%LESS_VEC_REG)
movl %ecx, -4(%LESS_VEC_REG, %rdx)
#endif
ret
.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
/* 4 * XMM_SMALL for the third mov for AVX2. */
.p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
L(between_2_3):
/* From 2 to 3. No branch when size == 2. */
movw %di, (%rax)
movb %dil, -1(%rax, %rdx)
VZEROUPPER_RETURN
#ifdef USE_XMM_LESS_VEC
movb %sil, (%rdi)
movb %sil, 1(%rdi)
movb %sil, -1(%rdi, %rdx)
#else
movw %cx, (%LESS_VEC_REG)
movb %sil, -1(%LESS_VEC_REG, %rdx)
#endif
ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))