glibc/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
H.J. Lu 3d9f171bfb x86-64: Optimize bzero
memset with zero as the value to set is by far the majority value (99%+
for Python3 and GCC).

bzero can be slightly more optimized for this case by using a zero-idiom
xor for broadcasting the set value to a register (vector or GPR).

Co-developed-by: Noah Goldstein <goldstein.w.n@gmail.com>
2022-02-08 15:58:56 -08:00

45 lines
1015 B
ArmAsm

#if IS_IN (libc)
# define USE_WITH_AVX2 1
# define VEC_SIZE 32
# define MOV_SIZE 4
# define RET_SIZE 4
# define VEC(i) ymm##i
# define VMOVU vmovdqu
# define VMOVA vmovdqa
# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
vmovd d, %xmm0; \
movq r, %rax;
# define BZERO_ZERO_VEC0() \
vpxor %xmm0, %xmm0, %xmm0
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
# ifndef MEMSET_SYMBOL
# define MEMSET_SYMBOL(p,s) p##_avx2_##s
# endif
# ifndef BZERO_SYMBOL
# define BZERO_SYMBOL(p,s) p##_avx2_##s
# endif
# ifndef WMEMSET_SYMBOL
# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
# endif
# define USE_XMM_LESS_VEC
# include "memset-vec-unaligned-erms.S"
#endif