glibc/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
H.J. Lu 5cb6329652 x86-64: Optimize bzero
Calling memset with zero as the value to set is by far the most common
case (99%+ for Python3 and GCC).

bzero can be optimized slightly further for this case by using a zero-idiom
xor to broadcast the set value to a register (vector or GPR).

Co-developed-by: Noah Goldstein <goldstein.w.n@gmail.com>
(cherry picked from commit 3d9f171bfb)
2022-05-05 09:10:53 -07:00

41 lines
856 B
ArmAsm

/* EVEX configuration for the shared memset template.

   This file contains no instructions of its own outside of macro bodies.
   It defines the register names, instruction mnemonics, symbol/section
   naming and helper macros, then includes "memset-vec-unaligned-erms.S",
   which is expected to consume them.  Everything is compiled only when
   IS_IN (libc) is true.  */
#if IS_IN (libc)
/* Flag identifying this build of the template as the EVEX one.
   NOTE(review): presumably tested by the included file -- confirm there.  */
# define USE_WITH_EVEX 1
/* Vector width in bytes; 32 matches the ymm register used for VEC0.  */
# define VEC_SIZE 32
/* NOTE(review): MOV_SIZE and RET_SIZE look like the encoded byte lengths
   of one vector move and one ret, used by the included file for code
   layout -- confirm against memset-vec-unaligned-erms.S.  */
# define MOV_SIZE 6
# define RET_SIZE 1
/* Vector register 0 is mapped to register number 16 (xmm16 / ymm16),
   which is only encodable with EVEX.  */
# define XMM0 xmm16
# define YMM0 ymm16
# define VEC0 ymm16
/* VEC(i) token-pastes to VECi; only VEC0 is defined in this file, so
   VEC(0) expands to ymm16.  */
# define VEC(i) VEC##i
/* Unaligned and aligned vector move mnemonics (EVEX 64-bit-element forms).  */
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
/* VZEROUPPER expands to nothing in this variant.
   NOTE(review): presumably because only register 16 is used, so no
   vzeroupper is required before returning -- confirm.  */
# define VZEROUPPER
/* memset setup: replicate the byte operand d into every byte of VEC0 and
   copy r into %rax (the return-value register).  */
# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
vpbroadcastb d, %VEC0; \
movq r, %rax
/* bzero setup: clear the vector register by XORing it with itself.  The
   operation is written on XMM0 while VEC0 is the ymm view of the same
   register; an EVEX-encoded write to an xmm register also zeroes the
   upper bits of the full vector register.  Unlike the two *_SET_RETURN
   macros, this one does not touch %rax.  */
# define BZERO_ZERO_VEC0() \
vpxorq %XMM0, %XMM0, %XMM0
/* wmemset setup: replicate the 32-bit operand d into every dword of VEC0
   and copy r into %rax.  */
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
vpbroadcastd d, %VEC0; \
movq r, %rax
/* The four VDUP hooks below expand to nothing: the broadcasts above
   already fill all of VEC0.
   NOTE(review): presumably other variants of the template use these
   hooks for a multi-step broadcast -- confirm in the included file.  */
# define MEMSET_VDUP_TO_VEC0_HIGH()
# define MEMSET_VDUP_TO_VEC0_LOW()
# define WMEMSET_VDUP_TO_VEC0_HIGH()
# define WMEMSET_VDUP_TO_VEC0_LOW()
/* Naming: SECTION(p) appends ".evex" to p; the symbol macros build
   p_evex_s from a prefix p and a suffix s.  */
# define SECTION(p) p##.evex
# define MEMSET_SYMBOL(p,s) p##_evex_##s
# define WMEMSET_SYMBOL(p,s) p##_evex_##s
/* NOTE(review): presumably selects a masked-store path for sizes below
   VEC_SIZE in the included file -- confirm there.  */
# define USE_LESS_VEC_MASK_STORE 1
/* Pull in the shared implementation; all macros above must be defined
   before this point.  */
# include "memset-vec-unaligned-erms.S"
#endif