x86-64: Add AVX optimized string/memory functions for RTM

Since VZEROUPPER triggers an RTM abort while VZEROALL won't, select AVX-optimized
string/memory functions with

	xtest
	jz	1f
	vzeroall
	ret
1:
	vzeroupper
	ret

at function exit on processors with usable RTM but without 256-bit EVEX
instructions, to avoid executing VZEROUPPER inside a transactionally
executing RTM region.
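
As a sanity check on the idea, here is a minimal user-space sketch (not part of
this commit; it assumes GCC or Clang with -mrtm on RTM-capable hardware) of why
the exit sequence matters: VZEROUPPER unconditionally aborts a transaction
started with XBEGIN, so a string function that returns through plain VZEROUPPER
can never commit when called transactionally, while the xtest/VZEROALL exit
can.  Whether the transaction actually commits also depends on many unrelated
abort conditions, so treat this as illustrative only.

	#include <immintrin.h>
	#include <stdio.h>
	#include <string.h>

	int
	main (void)
	{
	  char src[64] = "rtm test", dst[64];
	  unsigned int status = _xbegin ();

	  if (status == _XBEGIN_STARTED)
	    {
	      /* memcpy may resolve to an AVX implementation; with the *_rtm
		 variants its VZEROALL-based exit need not abort the
		 transaction, whereas a VZEROUPPER exit always would.  */
	      memcpy (dst, src, sizeof src);
	      _xend ();
	      puts ("transaction committed");
	    }
	  else
	    puts ("transaction aborted");

	  return 0;
	}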
H.J. Lu 2021-03-05 07:26:42 -08:00
parent 91264fe357
commit 7ebba91361
52 changed files with 670 additions and 248 deletions

View File

@ -40,6 +40,25 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
memset-sse2-unaligned-erms \
memset-avx2-unaligned-erms \
memset-avx512-unaligned-erms \
memchr-avx2-rtm \
memcmp-avx2-movbe-rtm \
memmove-avx-unaligned-erms-rtm \
memrchr-avx2-rtm \
memset-avx2-unaligned-erms-rtm \
rawmemchr-avx2-rtm \
strchr-avx2-rtm \
strcmp-avx2-rtm \
strchrnul-avx2-rtm \
stpcpy-avx2-rtm \
stpncpy-avx2-rtm \
strcat-avx2-rtm \
strcpy-avx2-rtm \
strlen-avx2-rtm \
strncat-avx2-rtm \
strncmp-avx2-rtm \
strncpy-avx2-rtm \
strnlen-avx2-rtm \
strrchr-avx2-rtm \
memchr-evex \
memcmp-evex-movbe \
memmove-evex-unaligned-erms \
@ -76,6 +95,14 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wcsrchr-sse2 wcsrchr-avx2 \
wcsnlen-sse4_1 wcsnlen-c \
wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
wcschr-avx2-rtm \
wcscmp-avx2-rtm \
wcslen-avx2-rtm \
wcsncmp-avx2-rtm \
wcsnlen-avx2-rtm \
wcsrchr-avx2-rtm \
wmemchr-avx2-rtm \
wmemcmp-avx2-movbe-rtm \
wcschr-evex \
wcscmp-evex \
wcslen-evex \

View File

@ -21,6 +21,7 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
static inline void *
@ -36,6 +37,9 @@ IFUNC_SELECTOR (void)
&& CPU_FEATURE_USABLE_P (cpu_features, BMI2))
return OPTIMIZE (evex);
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
return OPTIMIZE (avx2_rtm);
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx2);
}
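
The same selection order recurs in every ifunc selector touched by this
commit: EVEX first, then the new RTM-safe AVX2 variant, then plain AVX2 unless
Prefer_No_VZEROUPPER is set, with the SSE2 baseline as the final fallback.
A condensed, runnable C sketch of that precedence (the boolean flags are
stand-ins for the CPU_FEATURE_USABLE_P / CPU_FEATURES_ARCH_P checks, and the
real EVEX condition also tests AVX512VL/AVX512BW/BMI2):

	#include <stdbool.h>
	#include <stdio.h>

	static const char *
	pick_memchr (bool avx2, bool evex, bool rtm, bool prefer_no_vzeroupper)
	{
	  if (avx2)
	    {
	      if (evex)
		return "__memchr_evex";
	      if (rtm)
		return "__memchr_avx2_rtm";	/* new in this commit */
	      if (!prefer_no_vzeroupper)
		return "__memchr_avx2";
	    }
	  return "__memchr_sse2";
	}

	int
	main (void)
	{
	  /* An AVX2 machine with RTM but no usable 256-bit EVEX:
	     the RTM variant wins.  */
	  puts (pick_memchr (true, false, true, false));
	  return 0;
	}

The key property is that the RTM check sits between EVEX and plain AVX2:
processors with usable 256-bit EVEX keep the EVEX variants, whose exits need
no VZEROUPPER at all, and only RTM-capable processors without them fall
through to the xtest-guarded AVX2 variants.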

View File

@ -43,6 +43,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memchr,
CPU_FEATURE_USABLE (AVX2),
__memchr_avx2)
IFUNC_IMPL_ADD (array, i, memchr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__memchr_avx2_rtm)
IFUNC_IMPL_ADD (array, i, memchr,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
@ -56,6 +60,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (MOVBE)),
__memcmp_avx2_movbe)
IFUNC_IMPL_ADD (array, i, memcmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (MOVBE)
&& CPU_FEATURE_USABLE (RTM)),
__memcmp_avx2_movbe_rtm)
IFUNC_IMPL_ADD (array, i, memcmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
@ -85,6 +94,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX),
__memmove_chk_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memmove_chk_avx_unaligned_rtm)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memmove_chk_avx_unaligned_erms_rtm)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_evex_unaligned)
@ -113,6 +130,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove,
CPU_FEATURE_USABLE (AVX),
__memmove_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memmove,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memmove_avx_unaligned_rtm)
IFUNC_IMPL_ADD (array, i, memmove,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memmove_avx_unaligned_erms_rtm)
IFUNC_IMPL_ADD (array, i, memmove,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_evex_unaligned)
@ -143,6 +168,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memrchr,
CPU_FEATURE_USABLE (AVX2),
__memrchr_avx2)
IFUNC_IMPL_ADD (array, i, memrchr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__memrchr_avx2_rtm)
IFUNC_IMPL_ADD (array, i, memrchr,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
@ -165,6 +194,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memset_chk,
CPU_FEATURE_USABLE (AVX2),
__memset_chk_avx2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__memset_chk_avx2_unaligned_rtm)
IFUNC_IMPL_ADD (array, i, __memset_chk,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__memset_chk_avx2_unaligned_erms_rtm)
IFUNC_IMPL_ADD (array, i, __memset_chk,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
@ -198,6 +235,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memset,
CPU_FEATURE_USABLE (AVX2),
__memset_avx2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memset,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__memset_avx2_unaligned_rtm)
IFUNC_IMPL_ADD (array, i, memset,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__memset_avx2_unaligned_erms_rtm)
IFUNC_IMPL_ADD (array, i, memset,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
@ -222,6 +267,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, rawmemchr,
CPU_FEATURE_USABLE (AVX2),
__rawmemchr_avx2)
IFUNC_IMPL_ADD (array, i, rawmemchr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__rawmemchr_avx2_rtm)
IFUNC_IMPL_ADD (array, i, rawmemchr,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
@ -234,6 +283,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strlen,
CPU_FEATURE_USABLE (AVX2),
__strlen_avx2)
IFUNC_IMPL_ADD (array, i, strlen,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__strlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strlen,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
@ -245,6 +298,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strnlen,
CPU_FEATURE_USABLE (AVX2),
__strnlen_avx2)
IFUNC_IMPL_ADD (array, i, strnlen,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__strnlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strnlen,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
@ -257,6 +314,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__stpncpy_ssse3)
IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
__stpncpy_avx2)
IFUNC_IMPL_ADD (array, i, stpncpy,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__stpncpy_avx2_rtm)
IFUNC_IMPL_ADD (array, i, stpncpy,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
@ -271,6 +332,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__stpcpy_ssse3)
IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
__stpcpy_avx2)
IFUNC_IMPL_ADD (array, i, stpcpy,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__stpcpy_avx2_rtm)
IFUNC_IMPL_ADD (array, i, stpcpy,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
@ -309,6 +374,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, strcat,
IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (AVX2),
__strcat_avx2)
IFUNC_IMPL_ADD (array, i, strcat,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__strcat_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strcat,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
@ -323,6 +392,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strchr,
CPU_FEATURE_USABLE (AVX2),
__strchr_avx2)
IFUNC_IMPL_ADD (array, i, strchr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__strchr_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strchr,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
@ -336,6 +409,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strchrnul,
CPU_FEATURE_USABLE (AVX2),
__strchrnul_avx2)
IFUNC_IMPL_ADD (array, i, strchrnul,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__strchrnul_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strchrnul,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
@ -348,6 +425,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strrchr,
CPU_FEATURE_USABLE (AVX2),
__strrchr_avx2)
IFUNC_IMPL_ADD (array, i, strrchr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__strrchr_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strrchr,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
@ -359,6 +440,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strcmp,
CPU_FEATURE_USABLE (AVX2),
__strcmp_avx2)
IFUNC_IMPL_ADD (array, i, strcmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__strcmp_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strcmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
@ -375,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, strcpy,
IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (AVX2),
__strcpy_avx2)
IFUNC_IMPL_ADD (array, i, strcpy,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__strcpy_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strcpy,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
@ -422,6 +511,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, strncat,
IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (AVX2),
__strncat_avx2)
IFUNC_IMPL_ADD (array, i, strncat,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__strncat_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strncat,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
@ -436,6 +529,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, strncpy,
IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (AVX2),
__strncpy_avx2)
IFUNC_IMPL_ADD (array, i, strncpy,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__strncpy_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strncpy,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
@ -469,6 +566,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, wcschr,
CPU_FEATURE_USABLE (AVX2),
__wcschr_avx2)
IFUNC_IMPL_ADD (array, i, wcschr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__wcschr_avx2_rtm)
IFUNC_IMPL_ADD (array, i, wcschr,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
@ -481,6 +582,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, wcsrchr,
CPU_FEATURE_USABLE (AVX2),
__wcsrchr_avx2)
IFUNC_IMPL_ADD (array, i, wcsrchr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__wcsrchr_avx2_rtm)
IFUNC_IMPL_ADD (array, i, wcsrchr,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
@ -493,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, wcscmp,
CPU_FEATURE_USABLE (AVX2),
__wcscmp_avx2)
IFUNC_IMPL_ADD (array, i, wcscmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__wcscmp_avx2_rtm)
IFUNC_IMPL_ADD (array, i, wcscmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
@ -505,6 +614,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, wcsncmp,
CPU_FEATURE_USABLE (AVX2),
__wcsncmp_avx2)
IFUNC_IMPL_ADD (array, i, wcsncmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__wcsncmp_avx2_rtm)
IFUNC_IMPL_ADD (array, i, wcsncmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
@ -523,6 +636,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, wcslen,
CPU_FEATURE_USABLE (AVX2),
__wcslen_avx2)
IFUNC_IMPL_ADD (array, i, wcslen,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__wcslen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, wcslen,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
@ -535,6 +652,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, wcsnlen,
CPU_FEATURE_USABLE (AVX2),
__wcsnlen_avx2)
IFUNC_IMPL_ADD (array, i, wcsnlen,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__wcsnlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, wcsnlen,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
@ -550,6 +671,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, wmemchr,
CPU_FEATURE_USABLE (AVX2),
__wmemchr_avx2)
IFUNC_IMPL_ADD (array, i, wmemchr,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__wmemchr_avx2_rtm)
IFUNC_IMPL_ADD (array, i, wmemchr,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
@ -563,6 +688,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (MOVBE)),
__wmemcmp_avx2_movbe)
IFUNC_IMPL_ADD (array, i, wmemcmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (MOVBE)
&& CPU_FEATURE_USABLE (RTM)),
__wmemcmp_avx2_movbe_rtm)
IFUNC_IMPL_ADD (array, i, wmemcmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
@ -581,6 +711,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, wmemset,
CPU_FEATURE_USABLE (AVX2),
__wmemset_avx2_unaligned)
IFUNC_IMPL_ADD (array, i, wmemset,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__wmemset_avx2_unaligned_rtm)
IFUNC_IMPL_ADD (array, i, wmemset,
CPU_FEATURE_USABLE (AVX512VL),
__wmemset_evex_unaligned)
@ -606,6 +740,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX),
__memcpy_chk_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memcpy_chk_avx_unaligned_rtm)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memcpy_chk_avx_unaligned_erms_rtm)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_evex_unaligned)
@ -634,6 +776,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy,
CPU_FEATURE_USABLE (AVX),
__memcpy_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memcpy_avx_unaligned_rtm)
IFUNC_IMPL_ADD (array, i, memcpy,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memcpy_avx_unaligned_erms_rtm)
IFUNC_IMPL_ADD (array, i, memcpy,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_evex_unaligned)
@ -676,6 +826,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX),
__mempcpy_chk_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__mempcpy_chk_avx_unaligned_rtm)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__mempcpy_chk_avx_unaligned_erms_rtm)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_evex_unaligned)
@ -713,6 +871,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX),
__mempcpy_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__mempcpy_avx_unaligned_rtm)
IFUNC_IMPL_ADD (array, i, mempcpy,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__mempcpy_avx_unaligned_erms_rtm)
IFUNC_IMPL_ADD (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_evex_unaligned)
@ -734,6 +900,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strncmp,
CPU_FEATURE_USABLE (AVX2),
__strncmp_avx2)
IFUNC_IMPL_ADD (array, i, strncmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (RTM)),
__strncmp_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strncmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),

View File

@ -23,6 +23,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
static inline void *
@ -38,6 +39,9 @@ IFUNC_SELECTOR (void)
&& CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
return OPTIMIZE (evex_movbe);
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
return OPTIMIZE (avx2_movbe_rtm);
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx2_movbe);
}

View File

@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
@ -71,6 +75,14 @@ IFUNC_SELECTOR (void)
return OPTIMIZE (evex_unaligned);
}
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
{
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
return OPTIMIZE (avx_unaligned_erms_rtm);
return OPTIMIZE (avx_unaligned_rtm);
}
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
{
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
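
For the memmove family the RTM branch additionally chooses between the ERMS
and non-ERMS unaligned variants, mirroring the existing Prefer_No_VZEROUPPER
branch below it, so the rep-movsb fast path stays available under RTM.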

View File

@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms_rtm)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
@ -69,6 +73,14 @@ IFUNC_SELECTOR (void)
return OPTIMIZE (evex_unaligned);
}
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
{
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
return OPTIMIZE (avx2_unaligned_erms_rtm);
return OPTIMIZE (avx2_unaligned_rtm);
}
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
{
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))

View File

@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
static inline void *
@ -39,6 +40,9 @@ IFUNC_SELECTOR (void)
&& CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
return OPTIMIZE (evex);
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
return OPTIMIZE (avx2_rtm);
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx2);
}

View File

@ -20,6 +20,8 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
@ -39,6 +41,9 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
return OPTIMIZE (evex_unaligned);
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
return OPTIMIZE (avx2_unaligned_rtm);
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx2_unaligned);
}

View File

@ -0,0 +1,12 @@
#ifndef MEMCHR
# define MEMCHR __memchr_avx2_rtm
#endif
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
#define SECTION(p) p##.avx.rtm
#include "memchr-avx2.S"
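
This small wrapper is the template for every *-avx2-rtm.S file added here:
redefine the symbol name, route both return macros through the xtest-guarded
sequence, place the code in its own .text.avx.rtm section, and include the
unmodified AVX2 body.  One variation appears later in the diff: the memmove
and memset RTM wrappers define VZEROUPPER_RETURN as a jump to the existing
L(return) label their bodies already funnel exits through, rather than the
new L(return_vzeroupper) label.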

View File

@ -34,9 +34,13 @@
# define VZEROUPPER vzeroupper
# endif
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
# define VEC_SIZE 32
.section .text.avx,"ax",@progbits
.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
@ -107,8 +111,8 @@ L(cros_page_boundary):
# endif
addq %rdi, %rax
addq %rcx, %rax
VZEROUPPER
ret
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
L(aligned_more):
@ -224,8 +228,7 @@ L(last_4x_vec_or_less):
jnz L(first_vec_x3_check)
xorl %eax, %eax
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(last_2x_vec):
@ -243,8 +246,7 @@ L(last_2x_vec):
testl %eax, %eax
jnz L(first_vec_x1_check)
xorl %eax, %eax
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(first_vec_x0_check):
@ -253,8 +255,7 @@ L(first_vec_x0_check):
cmpq %rax, %rdx
jbe L(zero)
addq %rdi, %rax
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(first_vec_x1_check):
@ -264,8 +265,7 @@ L(first_vec_x1_check):
jbe L(zero)
addq $VEC_SIZE, %rax
addq %rdi, %rax
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(first_vec_x2_check):
@ -275,8 +275,7 @@ L(first_vec_x2_check):
jbe L(zero)
addq $(VEC_SIZE * 2), %rax
addq %rdi, %rax
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(first_vec_x3_check):
@ -286,12 +285,14 @@ L(first_vec_x3_check):
jbe L(zero)
addq $(VEC_SIZE * 3), %rax
addq %rdi, %rax
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(zero):
VZEROUPPER
xorl %eax, %eax
jmp L(return_vzeroupper)
.p2align 4
L(null):
xorl %eax, %eax
ret
@ -301,24 +302,21 @@ L(null):
L(first_vec_x0):
tzcntl %eax, %eax
addq %rdi, %rax
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(first_vec_x1):
tzcntl %eax, %eax
addq $VEC_SIZE, %rax
addq %rdi, %rax
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(first_vec_x2):
tzcntl %eax, %eax
addq $(VEC_SIZE * 2), %rax
addq %rdi, %rax
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(4x_vec_end):
@ -337,8 +335,7 @@ L(first_vec_x3):
tzcntl %eax, %eax
addq $(VEC_SIZE * 3), %rax
addq %rdi, %rax
VZEROUPPER
ret
VZEROUPPER_RETURN
END (MEMCHR)
#endif

View File

@ -0,0 +1,12 @@
#ifndef MEMCMP
# define MEMCMP __memcmp_avx2_movbe_rtm
#endif
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
#define SECTION(p) p##.avx.rtm
#include "memcmp-avx2-movbe.S"

View File

@ -47,6 +47,10 @@
# define VZEROUPPER vzeroupper
# endif
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
# define VEC_SIZE 32
# define VEC_MASK ((1 << VEC_SIZE) - 1)
@ -55,7 +59,7 @@
memcmp has to use UNSIGNED comparison for elements.
*/
.section .text.avx,"ax",@progbits
.section SECTION(.text),"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
shl $2, %RDX_LP
@ -123,8 +127,8 @@ ENTRY (MEMCMP)
vptest %ymm0, %ymm5
jnc L(4x_vec_end)
xorl %eax, %eax
VZEROUPPER
ret
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
L(last_2x_vec):
@ -144,8 +148,7 @@ L(last_vec):
vpmovmskb %ymm2, %eax
subl $VEC_MASK, %eax
jnz L(first_vec)
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(first_vec):
@ -164,8 +167,7 @@ L(wmemcmp_return):
movzbl (%rsi, %rcx), %edx
sub %edx, %eax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
# ifdef USE_AS_WMEMCMP
.p2align 4
@ -367,8 +369,7 @@ L(last_4x_vec):
vpmovmskb %ymm2, %eax
subl $VEC_MASK, %eax
jnz L(first_vec)
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(4x_vec_end):
@ -394,8 +395,7 @@ L(4x_vec_end):
movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
sub %edx, %eax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(first_vec_x1):
@ -410,8 +410,7 @@ L(first_vec_x1):
movzbl VEC_SIZE(%rsi, %rcx), %edx
sub %edx, %eax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(first_vec_x2):
@ -426,7 +425,6 @@ L(first_vec_x2):
movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
sub %edx, %eax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
END (MEMCMP)
#endif

View File

@ -0,0 +1,17 @@
#if IS_IN (libc)
# define VEC_SIZE 32
# define VEC(i) ymm##i
# define VMOVNT vmovntdq
# define VMOVU vmovdqu
# define VMOVA vmovdqa
# define ZERO_UPPER_VEC_REGISTERS_RETURN \
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
# define VZEROUPPER_RETURN jmp L(return)
# define SECTION(p) p##.avx.rtm
# define MEMMOVE_SYMBOL(p,s) p##_avx_##s##_rtm
# include "memmove-vec-unaligned-erms.S"
#endif

View File

@ -150,11 +150,12 @@ L(last_2x_vec):
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
ret
#else
VZEROUPPER_RETURN
#endif
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))
@ -247,8 +248,11 @@ L(last_2x_vec):
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
VZEROUPPER
#if VEC_SIZE > 16
ZERO_UPPER_VEC_REGISTERS_RETURN
#else
ret
#endif
L(movsb):
cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
@ -313,8 +317,7 @@ L(between_32_63):
VMOVU -32(%rsi,%rdx), %YMM1
VMOVU %YMM0, (%rdi)
VMOVU %YMM1, -32(%rdi,%rdx)
VZEROUPPER
ret
VZEROUPPER_RETURN
#endif
#if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
@ -323,7 +326,7 @@ L(between_16_31):
VMOVU -16(%rsi,%rdx), %XMM1
VMOVU %XMM0, (%rdi)
VMOVU %XMM1, -16(%rdi,%rdx)
ret
VZEROUPPER_RETURN
#endif
L(between_8_15):
/* From 8 to 15. No branch when size == 8. */
@ -376,8 +379,7 @@ L(more_2x_vec):
VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
VZEROUPPER
ret
VZEROUPPER_RETURN
L(last_4x_vec):
/* Copy from 2 * VEC to 4 * VEC. */
VMOVU (%rsi), %VEC(0)
@ -388,8 +390,7 @@ L(last_4x_vec):
VMOVU %VEC(1), VEC_SIZE(%rdi)
VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
VZEROUPPER
ret
VZEROUPPER_RETURN
L(more_8x_vec):
cmpq %rsi, %rdi
@ -445,8 +446,7 @@ L(loop_4x_vec_forward):
VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
/* Store the first VEC. */
VMOVU %VEC(4), (%r11)
VZEROUPPER
ret
VZEROUPPER_RETURN
L(more_8x_vec_backward):
/* Load the first 4 * VEC and last VEC to support overlapping
@ -497,8 +497,7 @@ L(loop_4x_vec_backward):
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
/* Store the last VEC. */
VMOVU %VEC(8), (%r11)
VZEROUPPER
ret
VZEROUPPER_RETURN
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
@ -533,8 +532,7 @@ L(loop_large_forward):
VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
/* Store the first VEC. */
VMOVU %VEC(4), (%r11)
VZEROUPPER
ret
VZEROUPPER_RETURN
L(large_backward):
/* Don't use non-temporal store if there is overlap between
@ -568,8 +566,7 @@ L(loop_large_backward):
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
/* Store the last VEC. */
VMOVU %VEC(8), (%r11)
VZEROUPPER
ret
VZEROUPPER_RETURN
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

View File

@ -0,0 +1,12 @@
#ifndef MEMRCHR
# define MEMRCHR __memrchr_avx2_rtm
#endif
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
#define SECTION(p) p##.avx.rtm
#include "memrchr-avx2.S"

View File

@ -20,14 +20,22 @@
# include <sysdep.h>
# ifndef MEMRCHR
# define MEMRCHR __memrchr_avx2
# endif
# ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
# define VEC_SIZE 32
.section .text.avx,"ax",@progbits
ENTRY (__memrchr_avx2)
.section SECTION(.text),"ax",@progbits
ENTRY (MEMRCHR)
/* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
vpbroadcastb %xmm0, %ymm0
@ -134,8 +142,8 @@ L(loop_4x_vec):
vpmovmskb %ymm1, %eax
bsrl %eax, %eax
addq %rdi, %rax
VZEROUPPER
ret
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
L(last_4x_vec_or_less):
@ -169,8 +177,7 @@ L(last_4x_vec_or_less):
addq %rax, %rdx
jl L(zero)
addq %rdi, %rax
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(last_2x_vec):
@ -191,31 +198,27 @@ L(last_2x_vec):
jl L(zero)
addl $(VEC_SIZE * 2), %eax
addq %rdi, %rax
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(last_vec_x0):
bsrl %eax, %eax
addq %rdi, %rax
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(last_vec_x1):
bsrl %eax, %eax
addl $VEC_SIZE, %eax
addq %rdi, %rax
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(last_vec_x2):
bsrl %eax, %eax
addl $(VEC_SIZE * 2), %eax
addq %rdi, %rax
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(last_vec_x3):
@ -232,8 +235,7 @@ L(last_vec_x1_check):
jl L(zero)
addl $VEC_SIZE, %eax
addq %rdi, %rax
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(last_vec_x3_check):
@ -243,12 +245,14 @@ L(last_vec_x3_check):
jl L(zero)
addl $(VEC_SIZE * 3), %eax
addq %rdi, %rax
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(zero):
VZEROUPPER
xorl %eax, %eax
VZEROUPPER_RETURN
.p2align 4
L(null):
xorl %eax, %eax
ret
@ -273,8 +277,7 @@ L(last_vec_or_less_aligned):
bsrl %eax, %eax
addq %rdi, %rax
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(last_vec_or_less):
@ -315,8 +318,7 @@ L(last_vec_or_less):
bsrl %eax, %eax
addq %rdi, %rax
addq %r8, %rax
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(last_vec_2x_aligned):
@ -353,7 +355,6 @@ L(last_vec_2x_aligned):
bsrl %eax, %eax
addq %rdi, %rax
addq %r8, %rax
VZEROUPPER
ret
END (__memrchr_avx2)
VZEROUPPER_RETURN
END (MEMRCHR)
#endif

View File

@ -0,0 +1,10 @@
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
#define VZEROUPPER_RETURN jmp L(return)
#define SECTION(p) p##.avx.rtm
#define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm
#define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm
#include "memset-avx2-unaligned-erms.S"

View File

@ -14,9 +14,15 @@
movq r, %rax; \
vpbroadcastd %xmm0, %ymm0
# define SECTION(p) p##.avx
# define MEMSET_SYMBOL(p,s) p##_avx2_##s
# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
# ifndef MEMSET_SYMBOL
# define MEMSET_SYMBOL(p,s) p##_avx2_##s
# endif
# ifndef WMEMSET_SYMBOL
# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
# endif
# include "memset-vec-unaligned-erms.S"
#endif

View File

@ -45,17 +45,14 @@
#ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
# define VZEROUPPER_SHORT_RETURN vzeroupper; ret
# else
# define VZEROUPPER
# endif
#endif
#ifndef VZEROUPPER_SHORT_RETURN
# if VEC_SIZE > 16
# define VZEROUPPER_SHORT_RETURN vzeroupper
# else
# define VZEROUPPER_SHORT_RETURN rep
# endif
# define VZEROUPPER_SHORT_RETURN rep; ret
#endif
#ifndef MOVQ
@ -117,8 +114,7 @@ L(entry_from_bzero):
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(0), (%rdi)
VZEROUPPER
ret
VZEROUPPER_RETURN
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))
@ -141,14 +137,12 @@ ENTRY (__memset_erms)
ENTRY (MEMSET_SYMBOL (__memset, erms))
# endif
L(stosb):
/* Issue vzeroupper before rep stosb. */
VZEROUPPER
mov %RDX_LP, %RCX_LP
movzbl %sil, %eax
mov %RDI_LP, %RDX_LP
rep stosb
mov %RDX_LP, %RAX_LP
ret
VZEROUPPER_RETURN
# if VEC_SIZE == 16
END (__memset_erms)
# else
@ -175,8 +169,7 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(0), (%rdi)
VZEROUPPER
ret
VZEROUPPER_RETURN
L(stosb_more_2x_vec):
cmp __x86_rep_stosb_threshold(%rip), %RDX_LP
@ -190,8 +183,11 @@ L(more_2x_vec):
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
L(return):
VZEROUPPER
#if VEC_SIZE > 16
ZERO_UPPER_VEC_REGISTERS_RETURN
#else
ret
#endif
L(loop_start):
leaq (VEC_SIZE * 4)(%rdi), %rcx
@ -217,7 +213,6 @@ L(loop):
cmpq %rcx, %rdx
jne L(loop)
VZEROUPPER_SHORT_RETURN
ret
L(less_vec):
/* Less than 1 VEC. */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
@ -241,40 +236,34 @@ L(less_vec):
jb 1f
movb %cl, (%rdi)
1:
VZEROUPPER
ret
VZEROUPPER_RETURN
# if VEC_SIZE > 32
/* From 32 to 63. No branch when size == 32. */
L(between_32_63):
VMOVU %YMM0, -32(%rdi,%rdx)
VMOVU %YMM0, (%rdi)
VZEROUPPER
ret
VZEROUPPER_RETURN
# endif
# if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
L(between_16_31):
VMOVU %XMM0, -16(%rdi,%rdx)
VMOVU %XMM0, (%rdi)
VZEROUPPER
ret
VZEROUPPER_RETURN
# endif
/* From 8 to 15. No branch when size == 8. */
L(between_8_15):
movq %rcx, -8(%rdi,%rdx)
movq %rcx, (%rdi)
VZEROUPPER
ret
VZEROUPPER_RETURN
L(between_4_7):
/* From 4 to 7. No branch when size == 4. */
movl %ecx, -4(%rdi,%rdx)
movl %ecx, (%rdi)
VZEROUPPER
ret
VZEROUPPER_RETURN
L(between_2_3):
/* From 2 to 3. No branch when size == 2. */
movw %cx, -2(%rdi,%rdx)
movw %cx, (%rdi)
VZEROUPPER
ret
VZEROUPPER_RETURN
END (MEMSET_SYMBOL (__memset, unaligned_erms))

View File

@ -0,0 +1,4 @@
#define MEMCHR __rawmemchr_avx2_rtm
#define USE_AS_RAWMEMCHR 1
#include "memchr-avx2-rtm.S"

View File

@ -0,0 +1,3 @@
#define USE_AS_STPCPY
#define STRCPY __stpcpy_avx2_rtm
#include "strcpy-avx2-rtm.S"

View File

@ -0,0 +1,4 @@
#define USE_AS_STPCPY
#define USE_AS_STRNCPY
#define STRCPY __stpncpy_avx2_rtm
#include "strcpy-avx2-rtm.S"

View File

@ -0,0 +1,12 @@
#ifndef STRCAT
# define STRCAT __strcat_avx2_rtm
#endif
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
#define SECTION(p) p##.avx.rtm
#include "strcat-avx2.S"

View File

@ -30,7 +30,11 @@
/* Number of bytes in a vector register */
# define VEC_SIZE 32
.section .text.avx,"ax",@progbits
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
.section SECTION(.text),"ax",@progbits
ENTRY (STRCAT)
mov %rdi, %r9
# ifdef USE_AS_STRNCAT

View File

@ -0,0 +1,12 @@
#ifndef STRCHR
# define STRCHR __strchr_avx2_rtm
#endif
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
#define SECTION(p) p##.avx.rtm
#include "strchr-avx2.S"

View File

@ -40,10 +40,14 @@
# define VZEROUPPER vzeroupper
# endif
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
# define VEC_SIZE 32
# define PAGE_SIZE 4096
.section .text.avx,"ax",@progbits
.section SECTION(.text),"ax",@progbits
ENTRY (STRCHR)
movl %edi, %ecx
# ifndef USE_AS_STRCHRNUL
@ -76,8 +80,8 @@ ENTRY (STRCHR)
cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
L(more_vecs):
@ -126,8 +130,7 @@ L(aligned_more):
cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(first_vec_x0):
@ -138,8 +141,7 @@ L(first_vec_x0):
cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(first_vec_x1):
@ -149,8 +151,7 @@ L(first_vec_x1):
cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(first_vec_x2):
@ -161,8 +162,7 @@ L(first_vec_x2):
cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
L(prep_loop_4x):
/* Align data to 4 * VEC_SIZE. */
@ -221,8 +221,7 @@ L(loop_4x_vec):
cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
/* Cold case for crossing page with first load. */
.p2align 4
@ -246,8 +245,7 @@ L(cross_page_boundary):
cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
END (STRCHR)
# endif

View File

@ -29,6 +29,7 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
static inline void *
@ -44,6 +45,9 @@ IFUNC_SELECTOR (void)
&& CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
return OPTIMIZE (evex);
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
return OPTIMIZE (avx2_rtm);
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx2);
}

View File

@ -0,0 +1,3 @@
#define STRCHR __strchrnul_avx2_rtm
#define USE_AS_STRCHRNUL 1
#include "strchr-avx2-rtm.S"

View File

@ -0,0 +1,12 @@
#ifndef STRCMP
# define STRCMP __strcmp_avx2_rtm
#endif
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
#define SECTION(p) p##.avx.rtm
#include "strcmp-avx2.S"

View File

@ -55,6 +55,10 @@
# define VZEROUPPER vzeroupper
# endif
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
/* Warning!
wcscmp/wcsncmp have to use SIGNED comparison for elements.
strcmp/strncmp have to use UNSIGNED comparison for elements.
@ -75,7 +79,7 @@
the maximum offset is reached before a difference is found, zero is
returned. */
.section .text.avx,"ax",@progbits
.section SECTION(.text),"ax",@progbits
ENTRY (STRCMP)
# ifdef USE_AS_STRNCMP
/* Check for simple cases (0 or 1) in offset. */
@ -127,8 +131,8 @@ L(return):
movzbl (%rsi, %rdx), %edx
subl %edx, %eax
# endif
VZEROUPPER
ret
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
L(return_vec_size):
@ -161,8 +165,7 @@ L(return_vec_size):
subl %edx, %eax
# endif
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(return_2_vec_size):
@ -195,8 +198,7 @@ L(return_2_vec_size):
subl %edx, %eax
# endif
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(return_3_vec_size):
@ -229,8 +231,7 @@ L(return_3_vec_size):
subl %edx, %eax
# endif
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(next_3_vectors):
@ -356,8 +357,7 @@ L(back_to_loop):
subl %edx, %eax
# endif
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(test_vec):
@ -400,8 +400,7 @@ L(test_vec):
subl %edx, %eax
# endif
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(test_2_vec):
@ -444,8 +443,7 @@ L(test_2_vec):
subl %edx, %eax
# endif
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(test_3_vec):
@ -486,8 +484,7 @@ L(test_3_vec):
subl %edx, %eax
# endif
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(loop_cross_page):
@ -556,8 +553,7 @@ L(loop_cross_page):
subl %edx, %eax
# endif
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(loop_cross_page_2_vec):
@ -631,8 +627,7 @@ L(loop_cross_page_2_vec):
subl %edx, %eax
# endif
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
# ifdef USE_AS_STRNCMP
L(string_nbyte_offset_check):
@ -674,8 +669,7 @@ L(cross_page_loop):
# ifndef USE_AS_WCSCMP
L(different):
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
# ifdef USE_AS_WCSCMP
.p2align 4
@ -685,16 +679,14 @@ L(different):
setl %al
negl %eax
orl $1, %eax
VZEROUPPER
ret
VZEROUPPER_RETURN
# endif
# ifdef USE_AS_STRNCMP
.p2align 4
L(zero):
xorl %eax, %eax
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(char0):
@ -708,8 +700,7 @@ L(char0):
movzbl (%rdi), %eax
subl %ecx, %eax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
# endif
.p2align 4
@ -734,8 +725,7 @@ L(last_vector):
movzbl (%rsi, %rdx), %edx
subl %edx, %eax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
/* Comparing on a page boundary region requires special treatment:
It must be done one vector at a time, starting with the wider
@ -856,7 +846,6 @@ L(cross_page_4bytes):
testl %eax, %eax
jne L(cross_page_loop)
subl %ecx, %eax
VZEROUPPER
ret
VZEROUPPER_RETURN
END (STRCMP)
#endif

View File

@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
static inline void *
@ -46,6 +47,9 @@ IFUNC_SELECTOR (void)
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
return OPTIMIZE (evex);
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
return OPTIMIZE (avx2_rtm);
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx2);
}

View File

@ -0,0 +1,12 @@
#ifndef STRCPY
# define STRCPY __strcpy_avx2_rtm
#endif
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
#define SECTION(p) p##.avx.rtm
#include "strcpy-avx2.S"

View File

@ -37,6 +37,10 @@
# define VZEROUPPER vzeroupper
# endif
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
/* zero register */
#define xmmZ xmm0
#define ymmZ ymm0
@ -46,7 +50,7 @@
# ifndef USE_AS_STRCAT
.section .text.avx,"ax",@progbits
.section SECTION(.text),"ax",@progbits
ENTRY (STRCPY)
# ifdef USE_AS_STRNCPY
mov %RDX_LP, %R8_LP
@ -369,8 +373,8 @@ L(CopyVecSizeExit):
lea 1(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
VZEROUPPER
ret
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
L(CopyTwoVecSize1):
@ -553,8 +557,7 @@ L(Exit1):
lea 2(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(Exit2):
@ -569,8 +572,7 @@ L(Exit2):
lea 3(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(Exit3):
@ -584,8 +586,7 @@ L(Exit3):
lea 4(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(Exit4_7):
@ -602,8 +603,7 @@ L(Exit4_7):
lea 1(%rdi, %rdx), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(Exit8_15):
@ -620,8 +620,7 @@ L(Exit8_15):
lea 1(%rdi, %rdx), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(Exit16_31):
@ -638,8 +637,7 @@ L(Exit16_31):
lea 1(%rdi, %rdx), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(Exit32_63):
@ -656,8 +654,7 @@ L(Exit32_63):
lea 1(%rdi, %rdx), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
# ifdef USE_AS_STRNCPY
@ -671,8 +668,7 @@ L(StrncpyExit1):
# ifdef USE_AS_STRCAT
movb $0, 1(%rdi)
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(StrncpyExit2):
@ -684,8 +680,7 @@ L(StrncpyExit2):
# ifdef USE_AS_STRCAT
movb $0, 2(%rdi)
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(StrncpyExit3_4):
@ -699,8 +694,7 @@ L(StrncpyExit3_4):
# ifdef USE_AS_STRCAT
movb $0, (%rdi, %r8)
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(StrncpyExit5_8):
@ -714,8 +708,7 @@ L(StrncpyExit5_8):
# ifdef USE_AS_STRCAT
movb $0, (%rdi, %r8)
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(StrncpyExit9_16):
@ -729,8 +722,7 @@ L(StrncpyExit9_16):
# ifdef USE_AS_STRCAT
movb $0, (%rdi, %r8)
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(StrncpyExit17_32):
@ -744,8 +736,7 @@ L(StrncpyExit17_32):
# ifdef USE_AS_STRCAT
movb $0, (%rdi, %r8)
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(StrncpyExit33_64):
@ -760,8 +751,7 @@ L(StrncpyExit33_64):
# ifdef USE_AS_STRCAT
movb $0, (%rdi, %r8)
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(StrncpyExit65):
@ -778,50 +768,43 @@ L(StrncpyExit65):
# ifdef USE_AS_STRCAT
movb $0, 65(%rdi)
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
# ifndef USE_AS_STRCAT
.p2align 4
L(Fill1):
mov %dl, (%rdi)
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(Fill2):
mov %dx, (%rdi)
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(Fill3_4):
mov %dx, (%rdi)
mov %dx, -2(%rdi, %r8)
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(Fill5_8):
mov %edx, (%rdi)
mov %edx, -4(%rdi, %r8)
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(Fill9_16):
mov %rdx, (%rdi)
mov %rdx, -8(%rdi, %r8)
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(Fill17_32):
vmovdqu %xmmZ, (%rdi)
vmovdqu %xmmZ, -16(%rdi, %r8)
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(CopyVecSizeUnalignedVec2):
@ -898,8 +881,7 @@ L(Fill):
cmp $1, %r8d
ja L(Fill2)
je L(Fill1)
VZEROUPPER
ret
VZEROUPPER_RETURN
/* end of ifndef USE_AS_STRCAT */
# endif
@ -929,8 +911,7 @@ L(UnalignedFourVecSizeLeaveCase3):
# ifdef USE_AS_STRCAT
movb $0, (VEC_SIZE * 4)(%rdi)
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(UnalignedFourVecSizeLeaveCase2):
@ -1001,16 +982,14 @@ L(StrncpyExit):
# ifdef USE_AS_STRCAT
movb $0, (%rdi)
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(ExitZero):
# ifndef USE_AS_STRCAT
mov %rdi, %rax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
# endif

View File

@ -0,0 +1,12 @@
#ifndef STRLEN
# define STRLEN __strlen_avx2_rtm
#endif
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
#define SECTION(p) p##.avx.rtm
#include "strlen-avx2.S"

View File

@ -36,9 +36,13 @@
# define VZEROUPPER vzeroupper
# endif
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
# define VEC_SIZE 32
.section .text.avx,"ax",@progbits
.section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
/* Check for zero length. */
@ -111,8 +115,8 @@ L(cros_page_boundary):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER
ret
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
L(aligned_more):
@ -231,8 +235,7 @@ L(last_4x_vec_or_less):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(last_2x_vec):
@ -253,8 +256,7 @@ L(last_2x_vec):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(first_vec_x0_check):
@ -267,8 +269,7 @@ L(first_vec_x0_check):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(first_vec_x1_check):
@ -282,8 +283,7 @@ L(first_vec_x1_check):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(first_vec_x2_check):
@ -297,8 +297,7 @@ L(first_vec_x2_check):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(first_vec_x3_check):
@ -312,8 +311,7 @@ L(first_vec_x3_check):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(max):
@ -321,8 +319,7 @@ L(max):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(zero):
@ -338,8 +335,7 @@ L(first_vec_x0):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(first_vec_x1):
@ -350,8 +346,7 @@ L(first_vec_x1):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(first_vec_x2):
@ -362,8 +357,7 @@ L(first_vec_x2):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(4x_vec_end):
@ -389,8 +383,7 @@ L(first_vec_x3):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER
ret
VZEROUPPER_RETURN
END (STRLEN)
#endif

View File

@ -0,0 +1,3 @@
#define USE_AS_STRNCAT
#define STRCAT __strncat_avx2_rtm
#include "strcat-avx2-rtm.S"

View File

@ -0,0 +1,3 @@
#define STRCMP __strncmp_avx2_rtm
#define USE_AS_STRNCMP 1
#include "strcmp-avx2-rtm.S"

View File

@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
static inline void *
@ -46,6 +47,9 @@ IFUNC_SELECTOR (void)
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
return OPTIMIZE (evex);
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
return OPTIMIZE (avx2_rtm);
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx2);
}

View File

@ -0,0 +1,3 @@
#define USE_AS_STRNCPY
#define STRCPY __strncpy_avx2_rtm
#include "strcpy-avx2-rtm.S"

View File

@ -0,0 +1,4 @@
#define STRLEN __strnlen_avx2_rtm
#define USE_AS_STRNLEN 1
#include "strlen-avx2-rtm.S"

View File

@ -0,0 +1,12 @@
#ifndef STRRCHR
# define STRRCHR __strrchr_avx2_rtm
#endif
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
#define SECTION(p) p##.avx.rtm
#include "strrchr-avx2.S"

View File

@ -36,9 +36,13 @@
# define VZEROUPPER vzeroupper
# endif
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
# define VEC_SIZE 32
.section .text.avx,"ax",@progbits
.section SECTION(.text),"ax",@progbits
ENTRY (STRRCHR)
movd %esi, %xmm4
movl %edi, %ecx
@ -166,8 +170,8 @@ L(return_value):
# endif
bsrl %eax, %eax
leaq -VEC_SIZE(%rdi, %rax), %rax
VZEROUPPER
ret
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
L(match):
@ -198,8 +202,7 @@ L(find_nul):
jz L(return_value)
bsrl %eax, %eax
leaq -VEC_SIZE(%rdi, %rax), %rax
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(char_and_nul):
@ -222,14 +225,12 @@ L(char_and_nul_in_first_vec):
jz L(return_null)
bsrl %eax, %eax
leaq -VEC_SIZE(%rdi, %rax), %rax
VZEROUPPER
ret
VZEROUPPER_RETURN
.p2align 4
L(return_null):
xorl %eax, %eax
VZEROUPPER
ret
VZEROUPPER_RETURN
END (STRRCHR)
#endif

View File

@ -0,0 +1,3 @@
#define STRCHR __wcschr_avx2_rtm
#define USE_AS_WCSCHR 1
#include "strchr-avx2-rtm.S"

View File

@ -0,0 +1,4 @@
#define STRCMP __wcscmp_avx2_rtm
#define USE_AS_WCSCMP 1
#include "strcmp-avx2-rtm.S"

View File

@ -0,0 +1,4 @@
#define STRLEN __wcslen_avx2_rtm
#define USE_AS_WCSLEN 1
#include "strlen-avx2-rtm.S"

View File

@ -0,0 +1,5 @@
#define STRCMP __wcsncmp_avx2_rtm
#define USE_AS_STRNCMP 1
#define USE_AS_WCSCMP 1
#include "strcmp-avx2-rtm.S"

View File

@ -0,0 +1,5 @@
#define STRLEN __wcsnlen_avx2_rtm
#define USE_AS_WCSLEN 1
#define USE_AS_STRNLEN 1
#include "strlen-avx2-rtm.S"

View File

@ -29,6 +29,7 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
static inline void *
@ -44,6 +45,9 @@ IFUNC_SELECTOR (void)
&& CPU_FEATURE_USABLE_P (cpu_features, BMI2))
return OPTIMIZE (evex);
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
return OPTIMIZE (avx2_rtm);
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx2);
}

View File

@ -0,0 +1,3 @@
#define STRRCHR __wcsrchr_avx2_rtm
#define USE_AS_WCSRCHR 1
#include "strrchr-avx2-rtm.S"

View File

@ -0,0 +1,4 @@
#define MEMCHR __wmemchr_avx2_rtm
#define USE_AS_WMEMCHR 1
#include "memchr-avx2-rtm.S"

View File

@ -0,0 +1,4 @@
#define MEMCMP __wmemcmp_avx2_movbe_rtm
#define USE_AS_WMEMCMP 1
#include "memcmp-avx2-movbe-rtm.S"

View File

@ -95,6 +95,28 @@ lose: \
#define R14_LP r14
#define R15_LP r15
/* Zero upper vector registers and return with xtest.  NB: Use VZEROALL
to avoid the RTM abort triggered by VZEROUPPER inside a transactionally
executing RTM region.  */
#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \
xtest; \
jz 1f; \
vzeroall; \
ret; \
1: \
vzeroupper; \
ret
/* Zero upper vector registers and return. */
#ifndef ZERO_UPPER_VEC_REGISTERS_RETURN
# define ZERO_UPPER_VEC_REGISTERS_RETURN \
VZEROUPPER; \
ret
#endif
#ifndef VZEROUPPER_RETURN
# define VZEROUPPER_RETURN VZEROUPPER; ret
#endif
#else /* __ASSEMBLER__ */
/* Long and pointer size in bytes. */
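
To make the macro's branch sense concrete, here is an equivalent expressed as
C with inline assembly — a sketch under the assumption of GCC extended asm on
x86-64 with RTM support; the real macro is assembler-side and returns
directly rather than falling out of a helper, and VZEROALL additionally
clobbers every vector register, which a toy example can ignore:

	#include <stdio.h>

	/* xtest clears ZF when executed inside an RTM transaction, so the
	   macro's "jz 1f" takes the VZEROUPPER path only outside one.  */
	static void
	zero_upper_vec_registers (void)
	{
	  unsigned char in_txn;
	  __asm__ volatile ("xtest; setnz %0" : "=r" (in_txn) : : "cc");
	  if (in_txn)
	    __asm__ volatile ("vzeroall");	/* safe inside a transaction */
	  else
	    __asm__ volatile ("vzeroupper");	/* cheaper outside one */
	}

	int
	main (void)
	{
	  zero_upper_vec_registers ();	/* outside a transaction, so this
					   takes the VZEROUPPER path */
	  puts ("ok");
	  return 0;
	}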