2009-03-13 23:53:18 +00:00
|
|
|
# Only when make is processing the csu subdirectory: append test-multiarch
# to the list of tests.
ifeq ($(subdir),csu)
|
2012-05-17 03:14:24 +00:00
|
|
|
tests += test-multiarch
|
2009-03-13 23:53:18 +00:00
|
|
|
endif
|
2009-06-23 03:38:41 +00:00
|
|
|
|
|
|
|
# Everything up to the matching endif applies only when $(subdir) is
# "string": extra sysdep_routines entries and per-file CFLAGS additions.
ifeq ($(subdir),string)
|
2011-06-24 19:14:22 +00:00
|
|
|
|
2013-09-03 14:21:38 +00:00
|
|
|
# Extra routines appended to sysdep_routines for the string subdirectory.
# The list is a single logical line: every physical line except the last
# must end in " \" (whitespace, then the continuation backslash), and no
# comment may be placed inside the list.
# NOTE(review): entry names look like ISA-specific variants (sse2, ssse3,
# sse4, avx512); confirm each has a matching source file in this directory.
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
|
|
|
|
strcmp-sse2-unaligned strncmp-ssse3 \
|
2016-06-08 20:57:50 +00:00
|
|
|
memcmp-sse4 memcpy-ssse3 \
|
2016-03-28 20:15:59 +00:00
|
|
|
memmove-ssse3 \
|
2016-06-08 20:57:50 +00:00
|
|
|
memcpy-ssse3-back \
|
2016-03-28 20:13:36 +00:00
|
|
|
memmove-ssse3-back \
|
2016-01-15 21:49:45 +00:00
|
|
|
memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
|
2013-03-18 06:39:12 +00:00
|
|
|
strncase_l-ssse3 strcat-ssse3 strncat-ssse3 \
|
2011-06-24 19:14:22 +00:00
|
|
|
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
|
|
|
|
strcpy-sse2-unaligned strncpy-sse2-unaligned \
|
2011-07-19 21:11:54 +00:00
|
|
|
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
|
|
|
|
strcat-sse2-unaligned strncat-sse2-unaligned \
|
2015-10-06 20:47:40 +00:00
|
|
|
strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
|
2016-06-08 20:55:45 +00:00
|
|
|
strcspn-c strpbrk-c strspn-c varshift \
|
Add x86-64 memmove with unaligned load/store and rep movsb
Implement x86-64 memmove with unaligned load/store and rep movsb.
Support 16-byte, 32-byte and 64-byte vector register sizes. When
size <= 8 times of vector register size, there is no check for
address overlap between source and destination. Since overhead for
overlap check is small when size > 8 times of vector register size,
memcpy is an alias of memmove.
A single file provides 2 implementations of memmove, one with rep movsb
and the other without rep movsb. They share the same codes when size is
between 2 times of vector register size and REP_MOVSB_THRESHOLD which
is 2KB for 16-byte vector register size and scaled up by large vector
register size.
Key features:
1. Use overlapping load and store to avoid branch.
2. For size <= 8 times of vector register size, load all sources into
registers and store them together.
3. If there is no address overlap between source and destination, copy
from both ends with 4 times of vector register size at a time.
4. If address of destination > address of source, backward copy 8 times
of vector register size at a time.
5. Otherwise, forward copy 8 times of vector register size at a time.
6. Use rep movsb only for forward copy. Avoid slow backward rep movsb
by falling back to backward copy 8 times of vector register size at a
time.
7. Skip when address of destination == address of source.
[BZ #19776]
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memmove-sse2-unaligned-erms, memmove-avx-unaligned-erms and
memmove-avx512-unaligned-erms.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test
__memmove_chk_avx512_unaligned_2,
__memmove_chk_avx512_unaligned_erms,
__memmove_chk_avx_unaligned_2, __memmove_chk_avx_unaligned_erms,
__memmove_chk_sse2_unaligned_2,
__memmove_chk_sse2_unaligned_erms, __memmove_avx_unaligned_2,
__memmove_avx_unaligned_erms, __memmove_avx512_unaligned_2,
__memmove_avx512_unaligned_erms, __memmove_erms,
__memmove_sse2_unaligned_2, __memmove_sse2_unaligned_erms,
__memcpy_chk_avx512_unaligned_2,
__memcpy_chk_avx512_unaligned_erms,
__memcpy_chk_avx_unaligned_2, __memcpy_chk_avx_unaligned_erms,
__memcpy_chk_sse2_unaligned_2, __memcpy_chk_sse2_unaligned_erms,
__memcpy_avx_unaligned_2, __memcpy_avx_unaligned_erms,
__memcpy_avx512_unaligned_2, __memcpy_avx512_unaligned_erms,
__memcpy_sse2_unaligned_2, __memcpy_sse2_unaligned_erms,
__memcpy_erms, __mempcpy_chk_avx512_unaligned_2,
__mempcpy_chk_avx512_unaligned_erms,
__mempcpy_chk_avx_unaligned_2, __mempcpy_chk_avx_unaligned_erms,
__mempcpy_chk_sse2_unaligned_2, __mempcpy_chk_sse2_unaligned_erms,
__mempcpy_avx512_unaligned_2, __mempcpy_avx512_unaligned_erms,
__mempcpy_avx_unaligned_2, __mempcpy_avx_unaligned_erms,
__mempcpy_sse2_unaligned_2, __mempcpy_sse2_unaligned_erms and
__mempcpy_erms.
* sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S: New
file.
* sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S:
Likwise.
* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
Likwise.
2016-03-31 17:04:26 +00:00
|
|
|
memset-avx512-no-vzeroupper \
|
|
|
|
memmove-avx-unaligned-erms \
|
Add x86-64 memset with unaligned store and rep stosb
Implement x86-64 memset with unaligned store and rep stosb. Support
16-byte, 32-byte and 64-byte vector register sizes. A single file
provides 2 implementations of memset, one with rep stosb and the other
without rep stosb. They share the same codes when size is between 2
times of vector register size and REP_STOSB_THRESHOLD which defaults
to 2KB.
Key features:
1. Use overlapping store to avoid branch.
2. For size <= 4 times of vector register size, fully unroll the loop.
3. For size > 4 times of vector register size, store 4 times of vector
register size at a time.
[BZ #19881]
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memset-sse2-unaligned-erms, memset-avx2-unaligned-erms and
memset-avx512-unaligned-erms.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test __memset_chk_sse2_unaligned,
__memset_chk_sse2_unaligned_erms, __memset_chk_avx2_unaligned,
__memset_chk_avx2_unaligned_erms, __memset_chk_avx512_unaligned,
__memset_chk_avx512_unaligned_erms, __memset_sse2_unaligned,
__memset_sse2_unaligned_erms, __memset_erms,
__memset_avx2_unaligned, __memset_avx2_unaligned_erms,
__memset_avx512_unaligned_erms and __memset_avx512_unaligned.
* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S: New
file.
* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S:
Likewise.
2016-03-31 17:05:51 +00:00
|
|
|
memmove-avx512-unaligned-erms \
|
|
|
|
memset-avx2-unaligned-erms \
|
|
|
|
memset-avx512-unaligned-erms
|
2010-08-25 19:13:08 +00:00
|
|
|
# Append -msse4 to the per-file compile flags of varshift.c, strcspn-c.c,
# strpbrk-c.c and strspn-c.c.
# NOTE(review): presumably these sources use SSE4 instructions or
# intrinsics and must be compiled with them enabled; confirm in the sources.
CFLAGS-varshift.c += -msse4
|
2009-07-03 09:48:56 +00:00
|
|
|
CFLAGS-strcspn-c.c += -msse4
|
|
|
|
CFLAGS-strpbrk-c.c += -msse4
|
|
|
|
CFLAGS-strspn-c.c += -msse4
|
2009-06-23 03:38:41 +00:00
|
|
|
endif
|
2011-12-17 19:39:23 +00:00
|
|
|
|
|
|
|
# Only when make is processing the wcsmbs subdirectory: append the wmemcmp
# and wcscpy variants listed below to sysdep_routines.
ifeq ($(subdir),wcsmbs)
|
|
|
|
sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
|
|
|
|
endif
|