mirror of
https://sourceware.org/git/glibc.git
synced 2024-12-27 05:00:15 +00:00
7cbc03d030
Code didn't actually use any sse4 instructions since `ptest` was
removed in:
commit 2f9062d717
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Nov 10 16:18:56 2021 -0600
x86: Shrink memcmp-sse4.S code size
The new memcmp-sse2 implementation is also faster.
geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905
Note there are two regressions preferring SSE2 for Size = 1 and Size =
65.
Size = 1:
size, align0, align1, ret, New Time/Old Time
1, 1, 1, 0, 1.2
1, 1, 1, 1, 1.197
1, 1, 1, -1, 1.2
This is intentional. Size == 1 is significantly less hot, based on
profiles of GCC11 and Python3, than sizes [4, 8] (which are made
hotter).
Python3 Size = 1 -> 13.64%
Python3 Size = [4, 8] -> 60.92%
GCC11 Size = 1 -> 1.29%
GCC11 Size = [4, 8] -> 33.86%
size, align0, align1, ret, New Time/Old Time
4, 4, 4, 0, 0.622
4, 4, 4, 1, 0.797
4, 4, 4, -1, 0.805
5, 5, 5, 0, 0.623
5, 5, 5, 1, 0.777
5, 5, 5, -1, 0.802
6, 6, 6, 0, 0.625
6, 6, 6, 1, 0.813
6, 6, 6, -1, 0.788
7, 7, 7, 0, 0.625
7, 7, 7, 1, 0.799
7, 7, 7, -1, 0.795
8, 8, 8, 0, 0.625
8, 8, 8, 1, 0.848
8, 8, 8, -1, 0.914
9, 9, 9, 0, 0.625
Size = 65:
size, align0, align1, ret, New Time/Old Time
65, 0, 0, 0, 1.103
65, 0, 0, 1, 1.216
65, 0, 0, -1, 1.227
65, 65, 0, 0, 1.091
65, 0, 65, 1, 1.19
65, 65, 65, -1, 1.215
This is because A) the checks in range [65, 96] are now unrolled 2x
and B) because smaller values <= 16 are now given a hotter path. By
contrast the SSE4 version has a branch for Size = 80. The unrolled
version gets better performance for returns which need both
comparisons.
size, align0, align1, ret, New Time/Old Time
128, 4, 8, 0, 0.858
128, 4, 8, 1, 0.879
128, 4, 8, -1, 0.888
As well, in environments outside of microbenchmarks, which are not
fully predictable, the branch will have a real cost.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
178 lines
3.4 KiB
Makefile
178 lines
3.4 KiB
Makefile
# Routines appended to sysdep_routines when $(subdir) is `string'.
# The list is alphabetical, one entry per line.  Every entry, including
# the last, ends in a backslash; the "# sysdep_routines" comment line that
# follows is what terminates the continued assignment, so entries can be
# added or removed without touching their neighbours.
ifeq ($(subdir),string)

sysdep_routines += \
  bzero \
  memchr-avx2 \
  memchr-avx2-rtm \
  memchr-evex \
  memchr-evex-rtm \
  memchr-sse2 \
  memcmp-avx2-movbe \
  memcmp-avx2-movbe-rtm \
  memcmp-evex-movbe \
  memcmp-sse2 \
  memcmpeq-avx2 \
  memcmpeq-avx2-rtm \
  memcmpeq-evex \
  memcmpeq-sse2 \
  memmove-avx-unaligned-erms \
  memmove-avx-unaligned-erms-rtm \
  memmove-avx512-no-vzeroupper \
  memmove-avx512-unaligned-erms \
  memmove-evex-unaligned-erms \
  memmove-sse2-unaligned-erms \
  memmove-ssse3 \
  memrchr-avx2 \
  memrchr-avx2-rtm \
  memrchr-evex \
  memrchr-sse2 \
  memset-avx2-unaligned-erms \
  memset-avx2-unaligned-erms-rtm \
  memset-avx512-no-vzeroupper \
  memset-avx512-unaligned-erms \
  memset-evex-unaligned-erms \
  memset-sse2-unaligned-erms \
  rawmemchr-avx2 \
  rawmemchr-avx2-rtm \
  rawmemchr-evex \
  rawmemchr-evex-rtm \
  rawmemchr-sse2 \
  stpcpy-avx2 \
  stpcpy-avx2-rtm \
  stpcpy-evex \
  stpcpy-sse2 \
  stpcpy-sse2-unaligned \
  stpncpy-avx2 \
  stpncpy-avx2-rtm \
  stpncpy-c \
  stpncpy-evex \
  stpncpy-sse2-unaligned \
  strcasecmp_l-avx2 \
  strcasecmp_l-avx2-rtm \
  strcasecmp_l-evex \
  strcasecmp_l-sse2 \
  strcasecmp_l-sse4_2 \
  strcat-avx2 \
  strcat-avx2-rtm \
  strcat-evex \
  strcat-sse2 \
  strcat-sse2-unaligned \
  strchr-avx2 \
  strchr-avx2-rtm \
  strchr-evex \
  strchr-sse2 \
  strchr-sse2-no-bsf \
  strchrnul-avx2 \
  strchrnul-avx2-rtm \
  strchrnul-evex \
  strchrnul-sse2 \
  strcmp-avx2 \
  strcmp-avx2-rtm \
  strcmp-evex \
  strcmp-sse2 \
  strcmp-sse2-unaligned \
  strcmp-sse4_2 \
  strcpy-avx2 \
  strcpy-avx2-rtm \
  strcpy-evex \
  strcpy-sse2 \
  strcpy-sse2-unaligned \
  strcspn-c \
  strcspn-sse2 \
  strlen-avx2 \
  strlen-avx2-rtm \
  strlen-evex \
  strlen-sse2 \
  strncase_l-avx2 \
  strncase_l-avx2-rtm \
  strncase_l-evex \
  strncase_l-sse2 \
  strncase_l-sse4_2 \
  strncat-avx2 \
  strncat-avx2-rtm \
  strncat-c \
  strncat-evex \
  strncat-sse2-unaligned \
  strncmp-avx2 \
  strncmp-avx2-rtm \
  strncmp-evex \
  strncmp-sse2 \
  strncmp-sse4_2 \
  strncpy-avx2 \
  strncpy-avx2-rtm \
  strncpy-c \
  strncpy-evex \
  strncpy-sse2-unaligned \
  strnlen-avx2 \
  strnlen-avx2-rtm \
  strnlen-evex \
  strnlen-sse2 \
  strpbrk-c \
  strpbrk-sse2 \
  strrchr-avx2 \
  strrchr-avx2-rtm \
  strrchr-evex \
  strrchr-sse2 \
  strspn-c \
  strspn-sse2 \
  strstr-sse2-unaligned \
  varshift \
# sysdep_routines

# Per-file compiler flags: these four C sources get -msse4 appended to
# whatever flags they are otherwise built with.
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
CFLAGS-strspn-c.c += -msse4
endif
|
|
|
|
# Routines appended to sysdep_routines when $(subdir) is `wcsmbs'.
# The list is alphabetical, one entry per line.  Every entry ends in a
# backslash and the trailing "# sysdep_routines" comment line terminates
# the continued assignment.
ifeq ($(subdir),wcsmbs)

sysdep_routines += \
  wcschr-avx2 \
  wcschr-avx2-rtm \
  wcschr-evex \
  wcschr-sse2 \
  wcscmp-avx2 \
  wcscmp-avx2-rtm \
  wcscmp-evex \
  wcscmp-sse2 \
  wcscpy-c \
  wcscpy-ssse3 \
  wcslen-avx2 \
  wcslen-avx2-rtm \
  wcslen-evex \
  wcslen-sse2 \
  wcslen-sse4_1 \
  wcsncmp-avx2 \
  wcsncmp-avx2-rtm \
  wcsncmp-evex \
  wcsncmp-sse2 \
  wcsnlen-avx2 \
  wcsnlen-avx2-rtm \
  wcsnlen-c \
  wcsnlen-evex \
  wcsnlen-sse4_1 \
  wcsrchr-avx2 \
  wcsrchr-avx2-rtm \
  wcsrchr-evex \
  wcsrchr-sse2 \
  wmemchr-avx2 \
  wmemchr-avx2-rtm \
  wmemchr-evex \
  wmemchr-evex-rtm \
  wmemchr-sse2 \
  wmemcmp-avx2-movbe \
  wmemcmp-avx2-movbe-rtm \
  wmemcmp-evex-movbe \
  wmemcmp-sse2 \
# sysdep_routines
endif
|
|
|
|
# Routines appended to sysdep_routines when $(subdir) is `debug'.
# Every entry ends in a backslash and the trailing "# sysdep_routines"
# comment line terminates the continued assignment.
ifeq ($(subdir),debug)

sysdep_routines += \
  memcpy_chk-nonshared \
  memmove_chk-nonshared \
  mempcpy_chk-nonshared \
  memset_chk-nonshared \
  wmemset_chk-nonshared \
# sysdep_routines
endif
|