glibc/sysdeps/x86_64/multiarch/Makefile
Adhemerval Zanella 721314c980 x86_64: Remove avx512 strstr implementation
As indicated in a recent thread, this is a simple brute-force
algorithm that checks the whole needle at a matching character pair
(and does so 1 byte at a time after the first 64 bytes of a needle).
Also it never skips ahead and thus can match at every haystack
position after trying to match all of the needle, which the generic
implementation avoids.

As indicated by Wilco, a 4x larger needle and 16x larger haystack gives
a clear 65x slowdown for both basic_strstr and __strstr_avx512:

  "ifuncs": ["basic_strstr", "twoway_strstr", "__strstr_avx512",
"__strstr_sse2_unaligned", "__strstr_generic"],

    {
     "len_haystack": 65536,
     "len_needle": 1024,
     "align_haystack": 0,
     "align_needle": 0,
     "fail": 1,
     "desc": "Difficult bruteforce needle",
     "timings": [4.0948e+07, 15094.5, 3.20818e+07, 108558, 10839.2]
    },
    {
     "len_haystack": 1048576,
     "len_needle": 4096,
     "align_haystack": 0,
     "align_needle": 0,
     "fail": 1,
     "desc": "Difficult bruteforce needle",
     "timings": [2.69767e+09, 100797, 2.08535e+09, 495706, 82666.9]
    }

PS: I don't have an AVX512 capable machine to verify this issue, but
    skimming through the code it does seem to follow what Wilco has
    described.
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
2024-03-27 13:48:16 -03:00

200 lines
3.8 KiB
Makefile

ifeq ($(subdir),string)
# Objects appended to sysdep_routines when $(subdir) is "string".  Each
# entry is named <function>-<variant> and sits on its own line ending in
# a backslash; the closing "# sysdep_routines" comment line absorbs the
# final continuation, so entries can be added or removed without editing
# their neighbours.
#
# The list is split by function family for readability only.  The chunks
# are appended in order, so sysdep_routines receives one continuous list.
# Do not put comments inside a chunk: a "#" on a continued line would
# swallow every entry after it.

# mem* and rawmemchr variants.
sysdep_routines += \
  memchr-avx2 \
  memchr-avx2-rtm \
  memchr-evex \
  memchr-evex-rtm \
  memchr-evex512 \
  memchr-sse2 \
  memcmp-avx2-movbe \
  memcmp-avx2-movbe-rtm \
  memcmp-evex-movbe \
  memcmp-sse2 \
  memcmpeq-avx2 \
  memcmpeq-avx2-rtm \
  memcmpeq-evex \
  memcmpeq-sse2 \
  memmove-avx-unaligned-erms \
  memmove-avx-unaligned-erms-rtm \
  memmove-avx512-no-vzeroupper \
  memmove-avx512-unaligned-erms \
  memmove-erms \
  memmove-evex-unaligned-erms \
  memmove-sse2-unaligned-erms \
  memmove-ssse3 \
  memrchr-avx2 \
  memrchr-avx2-rtm \
  memrchr-evex \
  memrchr-sse2 \
  memset-avx2-unaligned-erms \
  memset-avx2-unaligned-erms-rtm \
  memset-avx512-no-vzeroupper \
  memset-avx512-unaligned-erms \
  memset-erms \
  memset-evex-unaligned-erms \
  memset-sse2-unaligned-erms \
  rawmemchr-avx2 \
  rawmemchr-avx2-rtm \
  rawmemchr-evex \
  rawmemchr-evex-rtm \
  rawmemchr-evex512 \
  rawmemchr-sse2 \
# sysdep_routines

# stpcpy and stpncpy variants.
sysdep_routines += \
  stpcpy-avx2 \
  stpcpy-avx2-rtm \
  stpcpy-evex \
  stpcpy-sse2 \
  stpcpy-sse2-unaligned \
  stpncpy-avx2 \
  stpncpy-avx2-rtm \
  stpncpy-evex \
  stpncpy-sse2-unaligned \
# sysdep_routines

# str* variants, followed by varshift.
sysdep_routines += \
  strcasecmp_l-avx2 \
  strcasecmp_l-avx2-rtm \
  strcasecmp_l-evex \
  strcasecmp_l-sse2 \
  strcasecmp_l-sse4_2 \
  strcat-avx2 \
  strcat-avx2-rtm \
  strcat-evex \
  strcat-sse2 \
  strcat-sse2-unaligned \
  strchr-avx2 \
  strchr-avx2-rtm \
  strchr-evex \
  strchr-evex512 \
  strchr-sse2 \
  strchr-sse2-no-bsf \
  strchrnul-avx2 \
  strchrnul-avx2-rtm \
  strchrnul-evex \
  strchrnul-evex512 \
  strchrnul-sse2 \
  strcmp-avx2 \
  strcmp-avx2-rtm \
  strcmp-evex \
  strcmp-sse2 \
  strcmp-sse2-unaligned \
  strcmp-sse4_2 \
  strcpy-avx2 \
  strcpy-avx2-rtm \
  strcpy-evex \
  strcpy-sse2 \
  strcpy-sse2-unaligned \
  strcspn-sse4 \
  strlen-avx2 \
  strlen-avx2-rtm \
  strlen-evex \
  strlen-evex512 \
  strlen-sse2 \
  strncase_l-avx2 \
  strncase_l-avx2-rtm \
  strncase_l-evex \
  strncase_l-sse2 \
  strncase_l-sse4_2 \
  strncat-avx2 \
  strncat-avx2-rtm \
  strncat-evex \
  strncat-sse2-unaligned \
  strncmp-avx2 \
  strncmp-avx2-rtm \
  strncmp-evex \
  strncmp-sse2 \
  strncmp-sse4_2 \
  strncpy-avx2 \
  strncpy-avx2-rtm \
  strncpy-evex \
  strncpy-sse2-unaligned \
  strnlen-avx2 \
  strnlen-avx2-rtm \
  strnlen-evex \
  strnlen-evex512 \
  strnlen-sse2 \
  strpbrk-sse4 \
  strrchr-avx2 \
  strrchr-avx2-rtm \
  strrchr-evex \
  strrchr-evex512 \
  strrchr-sse2 \
  strspn-sse4 \
  strstr-sse2-unaligned \
  varshift \
# sysdep_routines

# Append -msse4 to the per-file CFLAGS variables of the three *-sse4.c
# sources listed above.
CFLAGS-strcspn-sse4.c += -msse4
CFLAGS-strpbrk-sse4.c += -msse4
CFLAGS-strspn-sse4.c += -msse4
endif
ifeq ($(subdir),wcsmbs)
# Objects appended to sysdep_routines when $(subdir) is "wcsmbs".  Each
# entry is named <function>-<variant> and sits on its own line ending in
# a backslash; the closing "# sysdep_routines" comment line absorbs the
# final continuation.
#
# The list is split by function family for readability only.  The chunks
# are appended in order, so sysdep_routines receives one continuous list.
# Do not put comments inside a chunk: a "#" on a continued line would
# swallow every entry after it.

# wcpcpy and wcpncpy variants.
sysdep_routines += \
  wcpcpy-avx2 \
  wcpcpy-evex \
  wcpcpy-generic \
  wcpncpy-avx2 \
  wcpncpy-evex \
  wcpncpy-generic \
# sysdep_routines

# wcs* variants.
sysdep_routines += \
  wcscat-avx2 \
  wcscat-evex \
  wcscat-generic \
  wcschr-avx2 \
  wcschr-avx2-rtm \
  wcschr-evex \
  wcschr-evex512 \
  wcschr-sse2 \
  wcscmp-avx2 \
  wcscmp-avx2-rtm \
  wcscmp-evex \
  wcscmp-sse2 \
  wcscpy-avx2 \
  wcscpy-evex \
  wcscpy-generic \
  wcscpy-ssse3 \
  wcslen-avx2 \
  wcslen-avx2-rtm \
  wcslen-evex \
  wcslen-evex512 \
  wcslen-sse2 \
  wcslen-sse4_1 \
  wcsncat-avx2 \
  wcsncat-evex \
  wcsncat-generic \
  wcsncmp-avx2 \
  wcsncmp-avx2-rtm \
  wcsncmp-evex \
  wcsncpy-avx2 \
  wcsncpy-evex \
  wcsncpy-generic \
  wcsnlen-avx2 \
  wcsnlen-avx2-rtm \
  wcsnlen-evex \
  wcsnlen-evex512 \
  wcsnlen-sse4_1 \
  wcsrchr-avx2 \
  wcsrchr-avx2-rtm \
  wcsrchr-evex \
  wcsrchr-evex512 \
  wcsrchr-sse2 \
# sysdep_routines

# wmemchr and wmemcmp variants.
sysdep_routines += \
  wmemchr-avx2 \
  wmemchr-avx2-rtm \
  wmemchr-evex \
  wmemchr-evex-rtm \
  wmemchr-evex512 \
  wmemchr-sse2 \
  wmemcmp-avx2-movbe \
  wmemcmp-avx2-movbe-rtm \
  wmemcmp-evex-movbe \
  wmemcmp-sse2 \
# sysdep_routines
endif
ifeq ($(subdir),debug)
# Objects appended to sysdep_routines when $(subdir) is "debug": the
# *_chk-nonshared entries for memcpy, memmove, mempcpy, memset and
# wmemset.  Every entry ends in a backslash; the closing
# "# sysdep_routines" comment line absorbs the final continuation.
sysdep_routines += \
  memcpy_chk-nonshared \
  memmove_chk-nonshared \
  mempcpy_chk-nonshared \
  memset_chk-nonshared \
  wmemset_chk-nonshared \
# sysdep_routines
endif