x86: Improve vec generation in memset-vec-unaligned-erms.S
No bug.
Split vec generation into multiple steps. This allows the
broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
case. This saves an expensive lane-cross instruction and removes
the need for 'vzeroupper'.
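
As a rough illustration (not part of the commit; %esi holds the fill byte as in the entry points below), the AVX2 change splits the old one-shot 'ymm' broadcast into a low (xmm) step and a high (ymm) step, so the short L(less_vec) path never writes the upper lanes and can return without 'vzeroupper':

	/* Before: always build the full ymm, even when L(less_vec)
	   stores at most 31 bytes.  The lane-crossing broadcast is
	   paid up front, and dirty upper state forces vzeroupper.  */
	vmovd	%esi, %xmm0
	vpbroadcastb %xmm0, %ymm0

	/* After: the broadcast is deferred.  Small sizes stay in
	   xmm (no lane cross, a plain ret suffices)...  */
	vpbroadcastb %xmm0, %xmm0	/* MEMSET_VDUP_TO_VEC0_LOW ()  */
	/* ...and only sizes >= VEC_SIZE widen to the full ymm.  */
	vpbroadcastb %xmm0, %ymm0	/* MEMSET_VDUP_TO_VEC0_HIGH ()  */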
For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
byte broadcast.
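
A sketch of the SSE2 change (paraphrased from the memset.S hunk below): the three-instruction dependent chain becomes a zeroing idiom plus one shuffle, since 'pshufb' with an all-zero control splats byte 0 across the register:

	/* Old: three instructions, each depending on the previous.  */
	movd	%esi, %xmm0
	punpcklbw %xmm0, %xmm0	/* duplicate byte into word.  */
	punpcklwd %xmm0, %xmm0	/* duplicate word into dword.  */
	pshufd	$0, %xmm0, %xmm0	/* splat dword across xmm0.  */

	/* New: pxor of a register with itself is a dependency-breaking
	   zero idiom; the all-zero control makes pshufb broadcast byte
	   0 of xmm0 to all 16 bytes.  */
	movd	%esi, %xmm0
	pxor	%xmm1, %xmm1
	pshufb	%xmm1, %xmm0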
Results for memset-avx2 small (geomean of N = 20 benchset runs).
size, Old Time, New Time, New / Old
   0,    4.100,    3.831,     0.934
   1,    5.074,    4.399,     0.867
   2,    4.433,    4.411,     0.995
   4,    4.487,    4.415,     0.984
   8,    4.454,    4.396,     0.987
  16,    4.502,    4.443,     0.987
All relevant string/wcsmbs tests are passing.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit b62ace2740)
parent 36766c02af
commit 58596411ad
sysdeps/x86_64/memset.S
@@ -28,17 +28,22 @@
 #define VMOVU		movups
 #define VMOVA		movaps
 
-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
-  movq r, %rax; \
-  punpcklbw %xmm0, %xmm0; \
-  punpcklwd %xmm0, %xmm0; \
-  pshufd $0, %xmm0, %xmm0
+  pxor %xmm1, %xmm1; \
+  pshufb %xmm1, %xmm0; \
+  movq r, %rax
 
-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
-  movq r, %rax; \
-  pshufd $0, %xmm0, %xmm0
+  pshufd $0, %xmm0, %xmm0; \
+  movq r, %rax
 
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
+
 #define SECTION(p)		p
 
sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -10,15 +10,18 @@
 # define VMOVU		vmovdqu
 # define VMOVA		vmovdqa
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastb %xmm0, %ymm0
+  movq r, %rax;
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
+
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
 
 # ifndef SECTION
 #  define SECTION(p)		p##.avx
@@ -30,5 +33,6 @@
 #  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
 
+# define USE_XMM_LESS_VEC
 # include "memset-vec-unaligned-erms.S"
 #endif
sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -15,13 +15,19 @@
 
 # define VZEROUPPER
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 # define SECTION(p)		p##.evex512
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -15,13 +15,19 @@
 
 # define VZEROUPPER
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 # define SECTION(p)		p##.evex
 # define MEMSET_SYMBOL(p,s)	p##_evex_##s
sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -58,8 +58,10 @@
 #ifndef MOVQ
 # if VEC_SIZE > 16
 #  define MOVQ				vmovq
+#  define MOVD				vmovd
 # else
 #  define MOVQ				movq
+#  define MOVD				movd
 # endif
 #endif
@@ -72,9 +74,17 @@
 #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
 # define END_REG	rcx
 # define LOOP_REG	rdi
+# define LESS_VEC_REG	rax
 #else
 # define END_REG	rdi
 # define LOOP_REG	rdx
+# define LESS_VEC_REG	rdi
 #endif
 
+#ifdef USE_XMM_LESS_VEC
+# define XMM_SMALL	1
+#else
+# define XMM_SMALL	0
+#endif
+
 #define PAGE_SIZE 4096
@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
 
 ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
 	shl	$2, %RDX_LP
-	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
-	jmp	L(entry_from_bzero)
+	WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+	WMEMSET_VDUP_TO_VEC0_LOW()
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec_no_vdup)
+	WMEMSET_VDUP_TO_VEC0_HIGH()
+	jmp	L(entry_from_wmemset)
 END (WMEMSET_SYMBOL (__wmemset, unaligned))
 #endif
 
@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 #endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
 L(entry_from_bzero):
 	cmpq	$VEC_SIZE, %rdx
 	jb	L(less_vec)
+	MEMSET_VDUP_TO_VEC0_HIGH()
+L(entry_from_wmemset):
 	cmpq	$(VEC_SIZE * 2), %rdx
 	ja	L(more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 # endif
 
 ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	MEMSET_VDUP_TO_VEC0_HIGH ()
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(stosb_more_2x_vec)
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
-	 */
-	VMOVU	%VEC(0), (%rax)
-	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
-	.p2align 4,, 10
+	.p2align 4,, 4
 L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
-	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
-	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
+	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
 #else
 	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
 	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
@@ -212,6 +228,7 @@ L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
 	.p2align 4,, 10
 L(less_vec):
+L(less_vec_no_vdup):
 	/* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 #  error Unsupported VEC_SIZE!
@@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
 	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
 	   and (4x, 8x] jump to target.  */
 L(more_2x_vec):
-
-	/* Two different methods of setting up pointers / compare. The
-	   two methods are based on the fact that EVEX/AVX512 mov
-	   instructions take more bytes then AVX2/SSE2 mov instructions. As
-	   well that EVEX/AVX512 machines also have fast LEA_BID. Both
-	   setup and END_REG to avoid complex address mode. For EVEX/AVX512
-	   this saves code size and keeps a few targets in one fetch block.
-	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
-	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
-	   LOOP_4X_OFFSET) with LEA_BID.  */
-
-	/* END_REG is rcx for EVEX/AVX512.  */
-	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
-#endif
-
-	/* Stores to first 2x VEC before cmp as any path forward will
-	   require it.  */
-	VMOVU	%VEC(0), (%rax)
-	VMOVU	%VEC(0), VEC_SIZE(%rax)
+	/* Store next 2x vec regardless.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * 1)(%rdi)
 
 
+	/* Two different methods of setting up pointers / compare. The two
+	   methods are based on the fact that EVEX/AVX512 mov instructions take
+	   more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512
+	   machines also have fast LEA_BID. Both setup and END_REG to avoid complex
+	   address mode. For EVEX/AVX512 this saves code size and keeps a few
+	   targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
+	   bottlenecks.  */
 #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
 	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
 	addq	%rdx, %END_REG
@@ -292,6 +299,15 @@ L(more_2x_vec):
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_2x_vec)
 
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
+	   LEA_BID.  */
+
+	/* END_REG is rcx for EVEX/AVX512.  */
+	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
 	/* Store next 2x vec regardless.  */
 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
 	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
@@ -355,65 +371,93 @@ L(stosb_local):
 	/* Define L(less_vec) only if not otherwise defined.  */
 	.p2align 4
 L(less_vec):
+	/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
+	   xmm). This is only does anything for AVX2.  */
+	MEMSET_VDUP_TO_VEC0_LOW ()
+L(less_vec_no_vdup):
 #endif
 L(cross_page):
 #if VEC_SIZE > 32
 	cmpl	$32, %edx
-	jae	L(between_32_63)
+	jge	L(between_32_63)
 #endif
 #if VEC_SIZE > 16
 	cmpl	$16, %edx
-	jae	L(between_16_31)
+	jge	L(between_16_31)
 #endif
-	MOVQ	%XMM0, %rdi
+#ifndef USE_XMM_LESS_VEC
+	MOVQ	%XMM0, %rcx
+#endif
 	cmpl	$8, %edx
-	jae	L(between_8_15)
+	jge	L(between_8_15)
 	cmpl	$4, %edx
-	jae	L(between_4_7)
+	jge	L(between_4_7)
 	cmpl	$1, %edx
-	ja	L(between_2_3)
-	jb	L(return)
-	movb	%sil, (%rax)
-	VZEROUPPER_RETURN
+	jg	L(between_2_3)
+	jl	L(between_0_0)
+	movb	%sil, (%LESS_VEC_REG)
+L(between_0_0):
+	ret
 
-	/* Align small targets only if not doing so would cross a fetch
-	   line.  */
+	/* Align small targets only if not doing so would cross a fetch line.
+	 */
 #if VEC_SIZE > 32
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	VMOVU	%YMM0, (%rax)
-	VMOVU	%YMM0, -32(%rax, %rdx)
+	VMOVU	%YMM0, (%LESS_VEC_REG)
+	VMOVU	%YMM0, -32(%LESS_VEC_REG, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
 #if VEC_SIZE >= 32
-	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
 L(between_16_31):
 	/* From 16 to 31.  No branch when size == 16.  */
-	VMOVU	%XMM0, (%rax)
-	VMOVU	%XMM0, -16(%rax, %rdx)
-	VZEROUPPER_RETURN
+	VMOVU	%XMM0, (%LESS_VEC_REG)
+	VMOVU	%XMM0, -16(%LESS_VEC_REG, %rdx)
+	ret
 #endif
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+	/* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	 */
+	.p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
-	movq	%rdi, (%rax)
-	movq	%rdi, -8(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	MOVQ	%XMM0, (%rdi)
+	MOVQ	%XMM0, -8(%rdi, %rdx)
+#else
+	movq	%rcx, (%LESS_VEC_REG)
+	movq	%rcx, -8(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
+	/* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	 */
+	.p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
-	movl	%edi, (%rax)
-	movl	%edi, -4(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	MOVD	%XMM0, (%rdi)
+	MOVD	%XMM0, -4(%rdi, %rdx)
+#else
+	movl	%ecx, (%LESS_VEC_REG)
+	movl	%ecx, -4(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+	/* 4 * XMM_SMALL for the third mov for AVX2.  */
+	.p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
 L(between_2_3):
 	/* From 2 to 3.  No branch when size == 2.  */
-	movw	%di, (%rax)
-	movb	%dil, -1(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	movb	%sil, (%rdi)
+	movb	%sil, 1(%rdi)
+	movb	%sil, -1(%rdi, %rdx)
+#else
+	movw	%cx, (%LESS_VEC_REG)
+	movb	%sil, -1(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 END (MEMSET_SYMBOL (__memset, unaligned_erms))