glibc/include/wchar.h
H.J. Lu ef9c4cb6c7 x86-64: Optimize wmemset with SSE2/AVX2/AVX512
The difference between memset and wmemset is byte vs int.  Add stubs
to SSE2/AVX2/AVX512 memset for wmemset with updated constant and size:

SSE2 wmemset:
	shl    $0x2,%rdx
	movd   %esi,%xmm0
	mov    %rdi,%rax
	pshufd $0x0,%xmm0,%xmm0
	jmp	entry_from_wmemset

SSE2 memset:
	movd   %esi,%xmm0
	mov    %rdi,%rax
	punpcklbw %xmm0,%xmm0
	punpcklwd %xmm0,%xmm0
	pshufd $0x0,%xmm0,%xmm0
entry_from_wmemset:

Since the ERMS versions of wmemset requires "rep stosl" instead of
"rep stosb", only the vector store stubs of SSE2/AVX2/AVX512 wmemset
are added.  The SSE2 wmemset is about 3X faster and the AVX2 wmemset
is about 6X faster on Haswell.

	* include/wchar.h (__wmemset_chk): New.
	* sysdeps/x86_64/memset.S (VDUP_TO_VEC0_AND_SET_RETURN): Renamed
	to MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
	(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
	(WMEMSET_CHK_SYMBOL): Likewise.
	(WMEMSET_SYMBOL): Likewise.
	(__wmemset): Add hidden definition.
	(wmemset): Add weak hidden definition.
	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
	wmemset_chk-nonshared.
	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Add __wmemset_sse2_unaligned,
	__wmemset_avx2_unaligned, __wmemset_avx512_unaligned,
	__wmemset_chk_sse2_unaligned, __wmemset_chk_avx2_unaligned
	and __wmemset_chk_avx512_unaligned.
	* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
	(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
	(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
	(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
	(WMEMSET_SYMBOL): Likewise.
	* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
	(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
	(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
	(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
	(WMEMSET_SYMBOL): Likewise.
	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Updated.
	(WMEMSET_CHK_SYMBOL): New.
	(WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)): Likewise.
	(WMEMSET_SYMBOL (__wmemset, unaligned)): Likewise.
	* sysdeps/x86_64/multiarch/memset.S (WMEMSET_SYMBOL): New.
	(libc_hidden_builtin_def): Also define __GI_wmemset and
	__GI___wmemset.
	(weak_alias): New.
	* sysdeps/x86_64/multiarch/wmemset.c: New file.
	* sysdeps/x86_64/multiarch/wmemset.h: Likewise.
	* sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S: Likewise.
	* sysdeps/x86_64/multiarch/wmemset_chk.c: Likewise.
	* sysdeps/x86_64/wmemset.c: Likewise.
	* sysdeps/x86_64/wmemset_chk.c: Likewise.
2017-06-05 11:09:59 -07:00

229 lines
8.6 KiB
C

#ifndef _WCHAR_H
#include <wcsmbs/wchar.h>
# ifndef _ISOMAC
# ifdef _WCHAR_H
extern __typeof (wcscasecmp_l) __wcscasecmp_l;
extern __typeof (wcsncasecmp_l) __wcsncasecmp_l;
extern __typeof (wcscoll_l) __wcscoll_l;
extern __typeof (wcsxfrm_l) __wcsxfrm_l;
extern __typeof (wcstol_l) __wcstol_l;
extern __typeof (wcstoul_l) __wcstoul_l;
extern __typeof (wcstoll_l) __wcstoll_l;
extern __typeof (wcstoull_l) __wcstoull_l;
extern __typeof (wcstod_l) __wcstod_l;
extern __typeof (wcstof_l) __wcstof_l;
extern __typeof (wcstold_l) __wcstold_l;
extern __typeof (wcsftime_l) __wcsftime_l;
libc_hidden_proto (__wcstol_l)
libc_hidden_proto (__wcstoul_l)
libc_hidden_proto (__wcstoll_l)
libc_hidden_proto (__wcstoull_l)
libc_hidden_proto (__wcstod_l)
libc_hidden_proto (__wcstof_l)
libc_hidden_proto (__wcstold_l)
libc_hidden_proto (__wcsftime_l)
extern double __wcstod_internal (const wchar_t *__restrict __nptr,
wchar_t **__restrict __endptr, int __group)
__THROW;
extern float __wcstof_internal (const wchar_t *__restrict __nptr,
wchar_t **__restrict __endptr, int __group)
__THROW;
extern long double __wcstold_internal (const wchar_t *__restrict __nptr,
wchar_t **__restrict __endptr,
int __group) __THROW;
extern long int __wcstol_internal (const wchar_t *__restrict __nptr,
wchar_t **__restrict __endptr,
int __base, int __group) __THROW;
extern unsigned long int __wcstoul_internal (const wchar_t *__restrict __npt,
wchar_t **__restrict __endptr,
int __base, int __group) __THROW;
__extension__
extern long long int __wcstoll_internal (const wchar_t *__restrict __nptr,
wchar_t **__restrict __endptr,
int __base, int __group) __THROW;
__extension__
extern unsigned long long int __wcstoull_internal (const wchar_t *
__restrict __nptr,
wchar_t **
__restrict __endptr,
int __base,
int __group) __THROW;
extern unsigned long long int ____wcstoull_l_internal (const wchar_t *,
wchar_t **, int, int,
__locale_t);
libc_hidden_proto (__wcstof_internal)
libc_hidden_proto (__wcstod_internal)
libc_hidden_proto (__wcstold_internal)
libc_hidden_proto (__wcstol_internal)
libc_hidden_proto (__wcstoll_internal)
libc_hidden_proto (__wcstoul_internal)
libc_hidden_proto (__wcstoull_internal)
libc_hidden_proto (wcstof)
libc_hidden_proto (wcstod)
libc_hidden_proto (wcstold)
libc_hidden_proto (wcstol)
libc_hidden_proto (wcstoll)
libc_hidden_proto (wcstoul)
libc_hidden_proto (wcstoull)
libc_hidden_proto (__wcscasecmp_l)
libc_hidden_proto (__wcsncasecmp_l)
libc_hidden_proto (__wcscoll_l)
libc_hidden_proto (__wcsxfrm_l)
libc_hidden_proto (fputws_unlocked)
libc_hidden_proto (putwc_unlocked)
libc_hidden_proto (putwc)
libc_hidden_proto (vswscanf)
libc_hidden_proto (mbrtowc)
libc_hidden_proto (wcrtomb)
extern int __wcscmp (const wchar_t *__s1, const wchar_t *__s2)
__THROW __attribute_pure__;
libc_hidden_proto (__wcscmp)
libc_hidden_proto (wcsftime)
libc_hidden_proto (wcsspn)
libc_hidden_proto (wcschr)
/* The C++ overloading of wcschr means we have to repeat the type to
declare __wcschr instead of using typeof, to avoid errors in C++
tests; in addition, __THROW cannot be used with a function type
from typeof in C++. The same applies to __wmemchr and, as regards
__THROW, to __wcscmp and __wcscoll. */
extern wchar_t *__wcschr (const wchar_t *__wcs, wchar_t __wc)
__THROW __attribute_pure__;
libc_hidden_proto (__wcschr)
extern int __wcscoll (const wchar_t *__s1, const wchar_t *__s2) __THROW;
libc_hidden_proto (__wcscoll)
libc_hidden_proto (wcspbrk)
extern __typeof (wmemset) __wmemset;
extern wchar_t *__wmemchr (const wchar_t *__s, wchar_t __c, size_t __n)
__THROW __attribute_pure__;
libc_hidden_proto (wmemchr)
libc_hidden_proto (__wmemchr)
libc_hidden_proto (wmemset)
libc_hidden_proto (__wmemset)
/* Now define the internal interfaces. */
extern int __wcscasecmp (const wchar_t *__s1, const wchar_t *__s2)
__attribute_pure__;
extern int __wcsncasecmp (const wchar_t *__s1, const wchar_t *__s2,
size_t __n)
__attribute_pure__;
extern size_t __wcslen (const wchar_t *__s) __attribute_pure__;
extern size_t __wcsnlen (const wchar_t *__s, size_t __maxlen)
__attribute_pure__;
extern wchar_t *__wcscat (wchar_t *dest, const wchar_t *src);
extern wint_t __btowc (int __c);
extern int __mbsinit (const __mbstate_t *__ps);
extern size_t __mbrtowc (wchar_t *__restrict __pwc,
const char *__restrict __s, size_t __n,
__mbstate_t *__restrict __p);
libc_hidden_proto (__mbrtowc)
libc_hidden_proto (__mbrlen)
extern size_t __wcrtomb (char *__restrict __s, wchar_t __wc,
__mbstate_t *__restrict __ps);
extern size_t __mbsrtowcs (wchar_t *__restrict __dst,
const char **__restrict __src,
size_t __len, __mbstate_t *__restrict __ps);
extern size_t __wcsrtombs (char *__restrict __dst,
const wchar_t **__restrict __src,
size_t __len, __mbstate_t *__restrict __ps);
extern size_t __mbsnrtowcs (wchar_t *__restrict __dst,
const char **__restrict __src, size_t __nmc,
size_t __len, __mbstate_t *__restrict __ps);
extern size_t __wcsnrtombs (char *__restrict __dst,
const wchar_t **__restrict __src,
size_t __nwc, size_t __len,
__mbstate_t *__restrict __ps);
extern wchar_t *__wcsncpy (wchar_t *__restrict __dest,
const wchar_t *__restrict __src, size_t __n);
extern wchar_t *__wcpcpy (wchar_t *__dest, const wchar_t *__src);
extern wchar_t *__wcpncpy (wchar_t *__dest, const wchar_t *__src,
size_t __n);
extern wchar_t *__wmemcpy (wchar_t *__s1, const wchar_t *s2,
size_t __n);
extern wchar_t *__wmempcpy (wchar_t *__restrict __s1,
const wchar_t *__restrict __s2,
size_t __n);
extern wchar_t *__wmemmove (wchar_t *__s1, const wchar_t *__s2,
size_t __n);
extern wchar_t *__wcschrnul (const wchar_t *__s, wchar_t __wc)
__attribute_pure__;
extern wchar_t *__wmemset_chk (wchar_t *__s, wchar_t __c, size_t __n,
size_t __ns) __THROW;
extern int __vfwscanf (__FILE *__restrict __s,
const wchar_t *__restrict __format,
__gnuc_va_list __arg)
/* __attribute__ ((__format__ (__wscanf__, 2, 0)) */;
extern int __vswprintf (wchar_t *__restrict __s, size_t __n,
const wchar_t *__restrict __format,
__gnuc_va_list __arg)
/* __attribute__ ((__format__ (__wprintf__, 3, 0))) */;
extern int __fwprintf (__FILE *__restrict __s,
const wchar_t *__restrict __format, ...)
/* __attribute__ ((__format__ (__wprintf__, 2, 3))) */;
extern int __vfwprintf (__FILE *__restrict __s,
const wchar_t *__restrict __format,
__gnuc_va_list __arg)
/* __attribute__ ((__format__ (__wprintf__, 2, 0))) */;
extern int __vfwprintf_chk (FILE *__restrict __s, int __flag,
const wchar_t *__restrict __format,
__gnuc_va_list __arg)
/* __attribute__ ((__format__ (__wprintf__, 3, 0))) */;
extern int __vswprintf_chk (wchar_t *__restrict __s, size_t __n,
int __flag, size_t __s_len,
const wchar_t *__restrict __format,
__gnuc_va_list __arg)
/* __attribute__ ((__format__ (__wprintf__, 5, 0))) */;
libc_hidden_proto (__vfwprintf_chk)
libc_hidden_proto (__vswprintf_chk)
extern int __isoc99_fwscanf (__FILE *__restrict __stream,
const wchar_t *__restrict __format, ...);
extern int __isoc99_wscanf (const wchar_t *__restrict __format, ...);
extern int __isoc99_swscanf (const wchar_t *__restrict __s,
const wchar_t *__restrict __format, ...)
__THROW;
extern int __isoc99_vfwscanf (__FILE *__restrict __s,
const wchar_t *__restrict __format,
__gnuc_va_list __arg);
extern int __isoc99_vwscanf (const wchar_t *__restrict __format,
__gnuc_va_list __arg);
extern int __isoc99_vswscanf (const wchar_t *__restrict __s,
const wchar_t *__restrict __format,
__gnuc_va_list __arg) __THROW;
extern int __vswscanf (const wchar_t *__restrict __s,
const wchar_t *__restrict __format,
__gnuc_va_list __arg) __THROW;
libc_hidden_proto (__isoc99_vswscanf)
libc_hidden_proto (__vswscanf)
libc_hidden_proto (__isoc99_vfwscanf)
/* Internal functions. */
extern size_t __mbsrtowcs_l (wchar_t *dst, const char **src, size_t len,
mbstate_t *ps, __locale_t l) attribute_hidden;
/* Special version. We know that all uses of mbsinit inside the libc
have a non-NULL parameter. And certainly we can access the
internals of the data structure directly. */
# define mbsinit(state) ((state)->__count == 0)
# define __mbsinit(state) ((state)->__count == 0)
# endif
# endif
#endif
/* Undefine all __need_* constants in case we are included to get those
constants but the whole file was already read. */
#undef __need_mbstate_t
#undef __need_wint_t