x86-64: Optimize wmemset with SSE2/AVX2/AVX512

The difference between memset and wmemset is byte vs int.  Add stubs
to SSE2/AVX2/AVX512 memset for wmemset with updated constant and size:

SSE2 wmemset:
	shl    $0x2,%rdx
	movd   %esi,%xmm0
	mov    %rdi,%rax
	pshufd $0x0,%xmm0,%xmm0
	jmp	entry_from_wmemset

SSE2 memset:
	movd   %esi,%xmm0
	mov    %rdi,%rax
	punpcklbw %xmm0,%xmm0
	punpcklwd %xmm0,%xmm0
	pshufd $0x0,%xmm0,%xmm0
entry_from_wmemset:

Since the ERMS versions of wmemset require "rep stosl" instead of
"rep stosb", only the vector store stubs of SSE2/AVX2/AVX512 wmemset
are added.  The SSE2 wmemset is about 3X faster and the AVX2 wmemset
is about 6X faster on Haswell.

	* include/wchar.h (__wmemset_chk): New.
	* sysdeps/x86_64/memset.S (VDUP_TO_VEC0_AND_SET_RETURN): Renamed
	to MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
	(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
	(WMEMSET_CHK_SYMBOL): Likewise.
	(WMEMSET_SYMBOL): Likewise.
	(__wmemset): Add hidden definition.
	(wmemset): Add weak hidden definition.
	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
	wmemset_chk-nonshared.
	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Add __wmemset_sse2_unaligned,
	__wmemset_avx2_unaligned, __wmemset_avx512_unaligned,
	__wmemset_chk_sse2_unaligned, __wmemset_chk_avx2_unaligned
	and __wmemset_chk_avx512_unaligned.
	* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
	(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
	(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
	(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
	(WMEMSET_SYMBOL): Likewise.
	* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
	(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
	(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
	(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
	(WMEMSET_SYMBOL): Likewise.
	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Updated.
	(WMEMSET_CHK_SYMBOL): New.
	(WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)): Likewise.
	(WMEMSET_SYMBOL (__wmemset, unaligned)): Likewise.
	* sysdeps/x86_64/multiarch/memset.S (WMEMSET_SYMBOL): New.
	(libc_hidden_builtin_def): Also define __GI_wmemset and
	__GI___wmemset.
	(weak_alias): New.
	* sysdeps/x86_64/multiarch/wmemset.c: New file.
	* sysdeps/x86_64/multiarch/wmemset.h: Likewise.
	* sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S: Likewise.
	* sysdeps/x86_64/multiarch/wmemset_chk.c: Likewise.
	* sysdeps/x86_64/wmemset.S: Likewise.
	* sysdeps/x86_64/wmemset_chk.S: Likewise.
This commit is contained in:
H.J. Lu 2017-06-05 11:09:48 -07:00
parent 9cd30491dd
commit ef9c4cb6c7
15 changed files with 295 additions and 9 deletions

View File

@ -1,3 +1,45 @@
2017-06-05 H.J. Lu <hongjiu.lu@intel.com>
* include/wchar.h (__wmemset_chk): New.
* sysdeps/x86_64/memset.S (VDUP_TO_VEC0_AND_SET_RETURN): Renamed
to MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
(WMEMSET_CHK_SYMBOL): Likewise.
(WMEMSET_SYMBOL): Likewise.
(__wmemset): Add hidden definition.
(wmemset): Add weak hidden definition.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
wmemset_chk-nonshared.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add __wmemset_sse2_unaligned,
__wmemset_avx2_unaligned, __wmemset_avx512_unaligned,
__wmemset_chk_sse2_unaligned, __wmemset_chk_avx2_unaligned
and __wmemset_chk_avx512_unaligned.
* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
(WMEMSET_SYMBOL): Likewise.
* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
(WMEMSET_SYMBOL): Likewise.
* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Updated.
(WMEMSET_CHK_SYMBOL): New.
(WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)): Likewise.
(WMEMSET_SYMBOL (__wmemset, unaligned)): Likewise.
* sysdeps/x86_64/multiarch/memset.S (WMEMSET_SYMBOL): New.
(libc_hidden_builtin_def): Also define __GI_wmemset and
__GI___wmemset.
(weak_alias): New.
* sysdeps/x86_64/multiarch/wmemset.c: New file.
* sysdeps/x86_64/multiarch/wmemset.h: Likewise.
* sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S: Likewise.
* sysdeps/x86_64/multiarch/wmemset_chk.c: Likewise.
* sysdeps/x86_64/wmemset.S: Likewise.
* sysdeps/x86_64/wmemset_chk.S: Likewise.
2017-06-05 Adhemerval Zanella <adhemerval.zanella@linaro.org>
H.J. Lu <hongjiu.lu@intel.com>

View File

@ -157,6 +157,9 @@ extern wchar_t *__wmemmove (wchar_t *__s1, const wchar_t *__s2,
extern wchar_t *__wcschrnul (const wchar_t *__s, wchar_t __wc)
__attribute_pure__;
extern wchar_t *__wmemset_chk (wchar_t *__s, wchar_t __c, size_t __n,
size_t __ns) __THROW;
extern int __vfwscanf (__FILE *__restrict __s,
const wchar_t *__restrict __format,
__gnuc_va_list __arg)

View File

@ -26,13 +26,18 @@
#define VMOVU movdqu
#define VMOVA movdqa
#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
movq r, %rax; \
punpcklbw %xmm0, %xmm0; \
punpcklwd %xmm0, %xmm0; \
pshufd $0, %xmm0, %xmm0
#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
movq r, %rax; \
pshufd $0, %xmm0, %xmm0
#define SECTION(p) p
#ifndef MEMSET_SYMBOL
@ -40,10 +45,21 @@
# define MEMSET_SYMBOL(p,s) memset
#endif
#ifndef WMEMSET_SYMBOL
# define WMEMSET_CHK_SYMBOL(p,s) p
# define WMEMSET_SYMBOL(p,s) __wmemset
#endif
#include "multiarch/memset-vec-unaligned-erms.S"
libc_hidden_builtin_def (memset)
#if IS_IN (libc)
libc_hidden_def (__wmemset)
weak_alias (__wmemset, wmemset)
libc_hidden_weak (wmemset)
#endif
#if defined SHARED && IS_IN (libc) && !defined USE_MULTIARCH
strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
.section .gnu.warning.__memset_zero_constant_len_parameter

View File

@ -32,3 +32,7 @@ endif
ifeq ($(subdir),wcsmbs)
sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
endif
ifeq ($(subdir),debug)
sysdep_routines += wmemset_chk-nonshared
endif

View File

@ -300,6 +300,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__wmemcmp_ssse3)
IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
/* Support sysdeps/x86_64/multiarch/wmemset.c. */
IFUNC_IMPL (i, name, wmemset,
IFUNC_IMPL_ADD (array, i, wmemset, 1,
__wmemset_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, wmemset,
HAS_ARCH_FEATURE (AVX2_Usable),
__wmemset_avx2_unaligned)
IFUNC_IMPL_ADD (array, i, wmemset,
HAS_ARCH_FEATURE (AVX512F_Usable),
__wmemset_avx512_unaligned))
#ifdef SHARED
/* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */
IFUNC_IMPL (i, name, __memcpy_chk,
@ -417,6 +428,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
__strncmp_ssse3)
IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
/* Support sysdeps/x86_64/multiarch/wmemset_chk.c. */
IFUNC_IMPL (i, name, __wmemset_chk,
IFUNC_IMPL_ADD (array, i, __wmemset_chk, 1,
__wmemset_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
HAS_ARCH_FEATURE (AVX2_Usable),
__wmemset_chk_avx2_unaligned)
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
__wmemset_chk_avx512_unaligned))
#endif
return i;

View File

@ -4,13 +4,19 @@
# define VMOVU vmovdqu
# define VMOVA vmovdqa
# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
vmovd d, %xmm0; \
movq r, %rax; \
vpbroadcastb %xmm0, %ymm0
# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
vmovd d, %xmm0; \
movq r, %rax; \
vpbroadcastd %xmm0, %ymm0
# define SECTION(p) p##.avx
# define MEMSET_SYMBOL(p,s) p##_avx2_##s
# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
# include "memset-vec-unaligned-erms.S"
#endif

View File

@ -4,14 +4,21 @@
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
vmovd d, %xmm0; \
movq r, %rax; \
vpbroadcastb %xmm0, %xmm0; \
vpbroadcastq %xmm0, %zmm0
# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
vmovd d, %xmm0; \
movq r, %rax; \
vpbroadcastd %xmm0, %xmm0; \
vpbroadcastq %xmm0, %zmm0
# define SECTION(p) p##.avx512
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
# define WMEMSET_SYMBOL(p,s) p##_avx512_##s
# include "memset-vec-unaligned-erms.S"
#endif

View File

@ -30,6 +30,10 @@
# define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s)
#endif
#ifndef WMEMSET_CHK_SYMBOL
# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
#endif
#ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
@ -79,6 +83,21 @@ END (__bzero)
weak_alias (__bzero, bzero)
#endif
#if IS_IN (libc)
# if defined SHARED
/* Checked wmemset entry.  Per the __wmemset_chk prototype and the
   SysV AMD64 argument order, %rdx is the requested element count and
   %rcx is the destination size.  */
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
/* Unsigned compare: fail when the destination size is below the
   requested count.  */
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
/* No ret or jmp here: a passing check continues into the wmemset
   entry that follows.  */
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
# endif
/* wmemset entry stub: rewrite the arguments into the form the shared
   memset body expects, then join that body.  */
ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
/* Scale the length in %rdx by 4 so the byte-oriented memset code can
   use it.  NOTE(review): the shift has no overflow check, so a count
   above SIZE_MAX / 4 wraps -- confirm callers cannot pass one.  */
shlq $2, %rdx
/* Replicate the 32-bit value in %esi across vector register 0 and
   copy the destination pointer from %rdi to %rax as the return
   value.  */
WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
/* Enter the memset body after its own byte-broadcast step.  */
jmp L(entry_from_bzero)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif
#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
cmpq %rdx, %rcx
@ -87,8 +106,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
L(memset_entry):
VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
L(entry_from_bzero):
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
@ -132,7 +150,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif
ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
cmpq $(VEC_SIZE * 2), %rdx

View File

@ -58,6 +58,7 @@ END(memset)
#if IS_IN (libc)
# define MEMSET_SYMBOL(p,s) p##_sse2_##s
# define WMEMSET_SYMBOL(p,s) p##_sse2_##s
# ifdef SHARED
# undef libc_hidden_builtin_def
@ -65,9 +66,15 @@ END(memset)
The speedup we get from using SSE2 instructions is likely eaten away
by the indirect call in the PLT. */
# define libc_hidden_builtin_def(name) \
.globl __GI_memset; __GI_memset = __memset_sse2_unaligned
.globl __GI_memset; __GI_memset = __memset_sse2_unaligned; \
.globl __GI_wmemset; __GI_wmemset = __wmemset_sse2_unaligned; \
.globl __GI___wmemset; __GI___wmemset = __wmemset_sse2_unaligned
# endif
# undef weak_alias
# define weak_alias(original, alias) \
.weak bzero; bzero = __bzero
# undef strong_alias
# define strong_alias(original, alias)
#endif

View File

@ -0,0 +1,33 @@
/* Multiple versions of wmemset.
All versions must be listed in ifunc-impl-list.c.
Copyright (C) 2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
/* Define multiple versions only for the definition in libc. */
#if IS_IN (libc)
/* Rename wmemset and __wmemset while <wchar.h> is included, so that the
   header's declarations of them are emitted under the __redirect_*
   names instead.  */
# define wmemset __redirect_wmemset
# define __wmemset __redirect___wmemset
# include <wchar.h>
# undef wmemset
# undef __wmemset
/* SYMBOL_NAME must be set before including the shared selector header.
   NOTE(review): presumably consumed by the OPTIMIZE and REDIRECT_NAME
   macros, which are defined outside this file -- confirm.  */
# define SYMBOL_NAME wmemset
# include "wmemset.h"
/* Define __wmemset through libc_ifunc_redirected, using the
   __redirect_wmemset declaration and the IFUNC_SELECTOR () expression
   from wmemset.h.  */
libc_ifunc_redirected (__redirect_wmemset, __wmemset, IFUNC_SELECTOR ());
/* Provide the public name wmemset as a weak alias of __wmemset.  */
weak_alias (__wmemset, wmemset)
#endif

View File

@ -0,0 +1,42 @@
/* Common definition for wmemset/wmemset_chk ifunc selections.
Copyright (C) 2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
/* Return the wmemset/wmemset_chk variant to use, chosen from the
   feature bits reported by __get_cpu_features ():
   - sse2_unaligned when Prefer_No_VZEROUPPER is set, or AVX2_Usable
     or AVX_Fast_Unaligned_Load is clear;
   - otherwise avx512_unaligned when AVX512F_Usable is set and
     Prefer_No_AVX512 is clear;
   - otherwise avx2_unaligned.  */
static inline void *
IFUNC_SELECTOR (void)
{
  const struct cpu_features* cpu_features = __get_cpu_features ();

  /* Any one of these conditions rules out both wide-vector variants.  */
  if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
      || !CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
      || !CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
    return OPTIMIZE (sse2_unaligned);

  /* AVX2 is acceptable from here on; use AVX512 only when it is usable
     and not disfavoured.  */
  if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
    return OPTIMIZE (avx512_unaligned);

  return OPTIMIZE (avx2_unaligned);
}

View File

@ -0,0 +1,21 @@
/* Non-shared version of wmemset_chk for x86-64.
Copyright (C) 2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#if IS_IN (libc) && !defined SHARED
# include "../wmemset_chk.S"
#endif

View File

@ -0,0 +1,31 @@
/* Multiple versions of wmemset_chk.
All versions must be listed in ifunc-impl-list.c.
Copyright (C) 2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
/* Define multiple versions only for the definition in libc.so. */
#if IS_IN (libc) && defined SHARED
/* Rename __wmemset_chk while <wchar.h> is included, so that the
   header's declaration of it is emitted as __redirect_wmemset_chk.  */
# define __wmemset_chk __redirect_wmemset_chk
# include <wchar.h>
# undef __wmemset_chk
/* SYMBOL_NAME must be set before including the shared selector header.
   NOTE(review): presumably consumed by the OPTIMIZE and REDIRECT_NAME
   macros, which are defined outside this file -- confirm.  */
# define SYMBOL_NAME wmemset_chk
# include "wmemset.h"
/* Define __wmemset_chk through libc_ifunc_redirected, using the
   redirected declaration and the IFUNC_SELECTOR () expression from
   wmemset.h.  */
libc_ifunc_redirected (__redirect_wmemset_chk, __wmemset_chk,
IFUNC_SELECTOR ());
#endif

1
sysdeps/x86_64/wmemset.S Normal file
View File

@ -0,0 +1 @@
/* Implemented in memset.S. */

View File

@ -0,0 +1,33 @@
/* Checking wmemset for x86-64.
Copyright (C) 2004-2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "asm-syntax.h"
#ifndef SHARED
/* For libc.so this is defined in memset.S.
For libc.a, this is a separate source to avoid
wmemset bringing in __chk_fail and all routines
it calls. */
.text
/* Checked wmemset for the non-shared build.  Per the __wmemset_chk
   prototype and the SysV AMD64 argument order, %rdx is the requested
   element count and %rcx is the destination size.  Calls __chk_fail
   when the destination is too small, otherwise tail-calls wmemset.  */
ENTRY (__wmemset_chk)
/* Unsigned compare of destination size against requested count.  */
cmpq %rdx, %rcx
jb __chk_fail
/* cmpq changed no registers, so wmemset receives the caller's original
   arguments and returns directly to the caller.  */
jmp wmemset
END (__wmemset_chk)
#endif