glibc/sysdeps/x86_64/multiarch/Makefile

ifeq ($(subdir),string)
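# Each entry below is one IFUNC implementation: the suffix names the ISA
# variant (-sse2, -avx2, -evex, -avx512, ...), and the -rtm variants avoid
# VZEROUPPER on return, which can abort Intel TSX (RTM) transactions.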
sysdep_routines += \
  bzero \
  memchr-avx2 \
  memchr-avx2-rtm \
  memchr-evex \
  memchr-evex-rtm \
  memchr-sse2 \
  memcmp-avx2-movbe \
  memcmp-avx2-movbe-rtm \
  memcmp-evex-movbe \
  memcmp-sse2 \
  memcmpeq-avx2 \
  memcmpeq-avx2-rtm \
  memcmpeq-evex \
  memcmpeq-sse2 \
  memmove-avx-unaligned-erms \
  memmove-avx-unaligned-erms-rtm \
  memmove-avx512-no-vzeroupper \
  memmove-avx512-unaligned-erms \
  memmove-evex-unaligned-erms \
  memmove-sse2-unaligned-erms \
  memmove-ssse3 \
  memrchr-avx2 \
  memrchr-avx2-rtm \
  memrchr-evex \
  memrchr-sse2 \
  memset-avx2-unaligned-erms \
  memset-avx2-unaligned-erms-rtm \
  memset-avx512-no-vzeroupper \
  memset-avx512-unaligned-erms \
  memset-evex-unaligned-erms \
  memset-sse2-unaligned-erms \
  rawmemchr-avx2 \
  rawmemchr-avx2-rtm \
  rawmemchr-evex \
  rawmemchr-evex-rtm \
  rawmemchr-sse2 \
  stpcpy-avx2 \
  stpcpy-avx2-rtm \
  stpcpy-evex \
  stpcpy-sse2 \
  stpcpy-sse2-unaligned \
  stpncpy-avx2 \
  stpncpy-avx2-rtm \
  stpncpy-c \
  stpncpy-evex \
  stpncpy-sse2-unaligned \
  strcasecmp_l-avx2 \
  strcasecmp_l-avx2-rtm \
  strcasecmp_l-evex \
  strcasecmp_l-sse2 \
  strcasecmp_l-sse4_2 \
  strcat-avx2 \
  strcat-avx2-rtm \
  strcat-evex \
  strcat-sse2 \
  strcat-sse2-unaligned \
  strchr-avx2 \
  strchr-avx2-rtm \
  strchr-evex \
  strchr-sse2 \
  strchr-sse2-no-bsf \
  strchrnul-avx2 \
  strchrnul-avx2-rtm \
  strchrnul-evex \
  strchrnul-sse2 \
  strcmp-avx2 \
  strcmp-avx2-rtm \
  strcmp-evex \
  strcmp-sse2 \
  strcmp-sse2-unaligned \
  strcmp-sse4_2 \
  strcpy-avx2 \
  strcpy-avx2-rtm \
  strcpy-evex \
  strcpy-sse2 \
  strcpy-sse2-unaligned \
  strcspn-c \
  strcspn-sse2 \
  strlen-avx2 \
  strlen-avx2-rtm \
  strlen-evex \
  strlen-sse2 \
  strncase_l-avx2 \
  strncase_l-avx2-rtm \
  strncase_l-evex \
  strncase_l-sse2 \
  strncase_l-sse4_2 \
  strncat-avx2 \
  strncat-avx2-rtm \
  strncat-c \
  strncat-evex \
  strncat-sse2-unaligned \
  strncmp-avx2 \
  strncmp-avx2-rtm \
  strncmp-evex \
  strncmp-sse2 \
  strncmp-sse4_2 \
  strncpy-avx2 \
  strncpy-avx2-rtm \
  strncpy-c \
  strncpy-evex \
  strncpy-sse2-unaligned \
  strnlen-avx2 \
  strnlen-avx2-rtm \
  strnlen-evex \
  strnlen-sse2 \
  strpbrk-c \
  strpbrk-sse2 \
  strrchr-avx2 \
  strrchr-avx2-rtm \
  strrchr-evex \
  strrchr-sse2 \
  strspn-c \
  strspn-sse2 \
  strstr-sse2-unaligned \
  varshift \
# sysdep_routines
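# These C implementations use SSE4 intrinsics, so they must be compiled
# with -msse4 even though the x86-64 build baseline is plain SSE2.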
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
CFLAGS-strspn-c.c += -msse4
endif
ifeq ($(subdir),wcsmbs)
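# Wide-character (wchar_t) counterparts of the string routines; the same
# ISA-suffix convention as in the string subdir applies.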
sysdep_routines += \
  wcschr-avx2 \
  wcschr-avx2-rtm \
  wcschr-evex \
  wcschr-sse2 \
  wcscmp-avx2 \
  wcscmp-avx2-rtm \
  wcscmp-evex \
  wcscmp-sse2 \
  wcscpy-c \
  wcscpy-ssse3 \
  wcslen-avx2 \
  wcslen-avx2-rtm \
  wcslen-evex \
  wcslen-sse2 \
  wcslen-sse4_1 \
  wcsncmp-avx2 \
  wcsncmp-avx2-rtm \
  wcsncmp-evex \
  wcsncmp-sse2 \
  wcsnlen-avx2 \
  wcsnlen-avx2-rtm \
  wcsnlen-c \
  wcsnlen-evex \
  wcsnlen-sse4_1 \
  wcsrchr-avx2 \
  wcsrchr-avx2-rtm \
  wcsrchr-evex \
  wcsrchr-sse2 \
  wmemchr-avx2 \
  wmemchr-avx2-rtm \
  wmemchr-evex \
  wmemchr-evex-rtm \
  wmemchr-sse2 \
  wmemcmp-avx2-movbe \
  wmemcmp-avx2-movbe-rtm \
  wmemcmp-evex-movbe \
  wmemcmp-sse2 \
# sysdep_routines
endif
ifeq ($(subdir),debug)
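# Fortify-source *_chk routines. The *-nonshared files provide plain
# (non-IFUNC) definitions for builds where IFUNC selection is not used,
# e.g. static libc; see the corresponding *_chk-nonshared.S sources.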
sysdep_routines += \
  memcpy_chk-nonshared \
  memmove_chk-nonshared \
  mempcpy_chk-nonshared \
  memset_chk-nonshared \
  wmemset_chk-nonshared \
# sysdep_routines
endif