mirror of
https://sourceware.org/git/glibc.git
synced 2024-12-28 13:31:13 +00:00
1457016337
Optimize x86-64 strcmp/wcscmp and strncmp/wcsncmp with AVX2. It uses vector comparison as much as possible. Peak performance observed on a Skylake machine: 9x, 3x, 2.5x and 5.5x for strcmp, strncmp, wcscmp and wcsncmp, respectively. The longer the comparison length, the greater the benefit of using the AVX2 functions, except for strcmp, where the peak is observed at length == 32 bytes. Select AVX2 strcmp/wcscmp on AVX2 machines where vzeroupper is preferred and AVX unaligned load is fast. NB: It uses TZCNT instead of BSF since TZCNT produces the same result as BSF for non-zero input. TZCNT is faster than BSF and is executed as BSF if the machine doesn't support TZCNT. * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add strcmp-avx2, strncmp-avx2, wcscmp-avx2, wcscmp-sse2, wcsncmp-avx2 and wcsncmp-sse2. * sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add tests for __strcmp_avx2, __strncmp_avx2, __wcscmp_avx2, __wcsncmp_avx2, __wcscmp_sse2 and __wcsncmp_sse2. * sysdeps/x86_64/multiarch/strcmp.c (OPTIMIZE (avx2)): (IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX2 machines if AVX unaligned load is fast and vzeroupper is preferred. * sysdeps/x86_64/multiarch/strncmp.c: Likewise. * sysdeps/x86_64/multiarch/strcmp-avx2.S: New file. * sysdeps/x86_64/multiarch/strncmp-avx2.S: Likewise. * sysdeps/x86_64/multiarch/wcscmp-avx2.S: Likewise. * sysdeps/x86_64/multiarch/wcscmp-sse2.S: Likewise. * sysdeps/x86_64/multiarch/wcscmp.c: Likewise. * sysdeps/x86_64/multiarch/wcsncmp-avx2.S: Likewise. * sysdeps/x86_64/multiarch/wcsncmp-sse2.c: Likewise. * sysdeps/x86_64/multiarch/wcsncmp.c: Likewise. * sysdeps/x86_64/wcscmp.S (__wcscmp): Add alias only if __wcscmp is undefined.
61 lines
2.1 KiB
C
61 lines
2.1 KiB
C
/* Multiple versions of strncmp.
|
|
All versions must be listed in ifunc-impl-list.c.
|
|
Copyright (C) 2017-2018 Free Software Foundation, Inc.
|
|
This file is part of the GNU C Library.
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with the GNU C Library; if not, see
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
/* Define multiple versions only for the definition in libc. */
|
|
#if IS_IN (libc)
|
|
/* While <string.h> is being read, every `strncmp' token is rewritten to
   `__redirect_strncmp', so whatever the header declares for strncmp is
   declared under that name instead.  The macro is removed again right
   after the include; the name `__redirect_strncmp' is what gets passed to
   libc_ifunc_redirected and __hidden_ver1 further down.  */
# define strncmp __redirect_strncmp
|
|
# include <string.h>
|
|
# undef strncmp
|
|
|
|
/* SYMBOL_NAME must be set before <init-arch.h> is pulled in.
   NOTE(review): presumably the REDIRECT_NAME and OPTIMIZE macros used
   below are derived from it by the included headers -- confirm there.  */
# define SYMBOL_NAME strncmp
|
|
# include <init-arch.h>
|
|
|
|
/* Declarations of the four candidate implementations that IFUNC_SELECTOR
   can return: sse2, ssse3, sse42 and avx2.  Each is declared with the type
   of REDIRECT_NAME and marked attribute_hidden.  As the file header notes,
   every version must also be listed in ifunc-impl-list.c.
   NOTE(review): REDIRECT_NAME presumably names the redirected strncmp
   prototype obtained from <string.h> -- confirm in the ifunc headers.  */
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
|
|
|
|
static inline void *
|
|
IFUNC_SELECTOR (void)
|
|
{
|
|
const struct cpu_features* cpu_features = __get_cpu_features ();
|
|
|
|
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
|
|
&& CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
|
|
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
|
return OPTIMIZE (avx2);
|
|
|
|
if (CPU_FEATURES_CPU_P (cpu_features, SSE4_2)
|
|
&& !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
|
|
return OPTIMIZE (sse42);
|
|
|
|
if (CPU_FEATURES_CPU_P (cpu_features, SSSE3))
|
|
return OPTIMIZE (ssse3);
|
|
|
|
return OPTIMIZE (sse2);
|
|
}
|
|
|
|
/* Tie the public name `strncmp' to the selector: the macro receives the
   redirected prototype name, the public symbol name, and the value of
   IFUNC_SELECTOR ().
   NOTE(review): presumably this expands to the IFUNC definition of
   strncmp -- confirm against the libc_ifunc_redirected definition.  */
libc_ifunc_redirected (__redirect_strncmp, strncmp, IFUNC_SELECTOR ());
|
|
|
|
/* Only when SHARED is defined: declare the __GI_strncmp version of
   strncmp with hidden visibility.
   NOTE(review): presumably this is the alias used for calls made from
   inside libc itself -- confirm against __hidden_ver1.  */
# ifdef SHARED
|
|
__hidden_ver1 (strncmp, __GI_strncmp, __redirect_strncmp)
|
|
__attribute__ ((visibility ("hidden")));
|
|
# endif
|
|
#endif
|