Add x86-64 memset with unaligned store and rep stosb
Implement x86-64 memset with unaligned store and rep stosb.  Support
16-byte, 32-byte and 64-byte vector register sizes.  A single file
provides 2 implementations of memset, one with rep stosb and the other
without rep stosb.  They share the same code when the size is between 2
times the vector register size and REP_STOSB_THRESHOLD, which defaults
to 2KB.

Key features:

1. Use overlapping store to avoid branch.
2. For size <= 4 times the vector register size, fully unroll the loop.
3. For size > 4 times the vector register size, store 4 times the vector
   register size at a time.

	[BZ #19881]
	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
	memset-sse2-unaligned-erms, memset-avx2-unaligned-erms and
	memset-avx512-unaligned-erms.
	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Test __memset_chk_sse2_unaligned,
	__memset_chk_sse2_unaligned_erms, __memset_chk_avx2_unaligned,
	__memset_chk_avx2_unaligned_erms, __memset_chk_avx512_unaligned,
	__memset_chk_avx512_unaligned_erms, __memset_sse2_unaligned,
	__memset_sse2_unaligned_erms, __memset_erms,
	__memset_avx2_unaligned, __memset_avx2_unaligned_erms,
	__memset_avx512_unaligned_erms and __memset_avx512_unaligned.
	* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S: New file.
	* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S: Likewise.
	* sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S: Likewise.
	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Likewise.
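Key feature 1 in practice: a minimal C sketch of an overlapping store for the
4-to-7-byte case (the helper name and layout are illustrative, not part of the
patch).  One store anchored at the start and one anchored at the end cover
every length in the class without a branch; the two stores simply coincide
when n == 4.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative only: branch-free memset of 4..7 bytes using two
   overlapping 4-byte stores, one at the start of the buffer and one
   flush with its end.  */
static void
memset_4_to_7 (void *dst, int c, size_t n)
{
  uint32_t v = 0x01010101U * (uint8_t) c;   /* broadcast the byte */
  memcpy (dst, &v, 4);                      /* store at the start */
  memcpy ((char *) dst + n - 4, &v, 4);     /* overlapping store at the end */
}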
parent 88b57b8ed4
commit 830566307f

ChangeLog (23 lines changed)
@@ -1,3 +1,26 @@
2016-03-31  H.J. Lu  <hongjiu.lu@intel.com>

	[BZ #19881]
	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
	memset-sse2-unaligned-erms, memset-avx2-unaligned-erms and
	memset-avx512-unaligned-erms.
	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Test __memset_chk_sse2_unaligned,
	__memset_chk_sse2_unaligned_erms, __memset_chk_avx2_unaligned,
	__memset_chk_avx2_unaligned_erms, __memset_chk_avx512_unaligned,
	__memset_chk_avx512_unaligned_erms, __memset_sse2_unaligned,
	__memset_sse2_unaligned_erms, __memset_erms,
	__memset_avx2_unaligned, __memset_avx2_unaligned_erms,
	__memset_avx512_unaligned_erms and __memset_avx512_unaligned.
	* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S: New
	file.
	* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S:
	Likewise.
	* sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S:
	Likewise.
	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S:
	Likewise.

2016-03-31  H.J. Lu  <hongjiu.lu@intel.com>

	[BZ #19776]
sysdeps/x86_64/multiarch/Makefile

@@ -23,7 +23,10 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
		   memset-avx512-no-vzeroupper \
		   memmove-sse2-unaligned-erms \
		   memmove-avx-unaligned-erms \
		   memmove-avx512-unaligned-erms
		   memmove-avx512-unaligned-erms \
		   memset-sse2-unaligned-erms \
		   memset-avx2-unaligned-erms \
		   memset-avx512-unaligned-erms
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
sysdeps/x86_64/multiarch/ifunc-impl-list.c

@@ -118,10 +118,26 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
  IFUNC_IMPL (i, name, __memset_chk,
	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
			      __memset_chk_sse2)
	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
			      __memset_chk_sse2_unaligned)
	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
			      __memset_chk_sse2_unaligned_erms)
	      IFUNC_IMPL_ADD (array, i, __memset_chk,
			      HAS_ARCH_FEATURE (AVX2_Usable),
			      __memset_chk_avx2)
	      IFUNC_IMPL_ADD (array, i, __memset_chk,
			      HAS_ARCH_FEATURE (AVX2_Usable),
			      __memset_chk_avx2_unaligned)
	      IFUNC_IMPL_ADD (array, i, __memset_chk,
			      HAS_ARCH_FEATURE (AVX2_Usable),
			      __memset_chk_avx2_unaligned_erms)
#ifdef HAVE_AVX512_ASM_SUPPORT
	      IFUNC_IMPL_ADD (array, i, __memset_chk,
			      HAS_ARCH_FEATURE (AVX512F_Usable),
			      __memset_chk_avx512_unaligned_erms)
	      IFUNC_IMPL_ADD (array, i, __memset_chk,
			      HAS_ARCH_FEATURE (AVX512F_Usable),
			      __memset_chk_avx512_unaligned)
	      IFUNC_IMPL_ADD (array, i, __memset_chk,
			      HAS_ARCH_FEATURE (AVX512F_Usable),
			      __memset_chk_avx512_no_vzeroupper)

@@ -131,10 +147,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
  /* Support sysdeps/x86_64/multiarch/memset.S.  */
  IFUNC_IMPL (i, name, memset,
	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
	      IFUNC_IMPL_ADD (array, i, memset, 1,
			      __memset_sse2_unaligned)
	      IFUNC_IMPL_ADD (array, i, memset, 1,
			      __memset_sse2_unaligned_erms)
	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
	      IFUNC_IMPL_ADD (array, i, memset,
			      HAS_ARCH_FEATURE (AVX2_Usable),
			      __memset_avx2)
	      IFUNC_IMPL_ADD (array, i, memset,
			      HAS_ARCH_FEATURE (AVX2_Usable),
			      __memset_avx2_unaligned)
	      IFUNC_IMPL_ADD (array, i, memset,
			      HAS_ARCH_FEATURE (AVX2_Usable),
			      __memset_avx2_unaligned_erms)
#ifdef HAVE_AVX512_ASM_SUPPORT
	      IFUNC_IMPL_ADD (array, i, memset,
			      HAS_ARCH_FEATURE (AVX512F_Usable),
			      __memset_avx512_unaligned_erms)
	      IFUNC_IMPL_ADD (array, i, memset,
			      HAS_ARCH_FEATURE (AVX512F_Usable),
			      __memset_avx512_unaligned)
	      IFUNC_IMPL_ADD (array, i, memset,
			      HAS_ARCH_FEATURE (AVX512F_Usable),
			      __memset_avx512_no_vzeroupper)
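These __libc_ifunc_impl_list entries only enumerate the new variants for
glibc's test and benchmark machinery; the runtime choice between them is made
through GNU indirect functions (IFUNC).  A generic, hedged sketch of that
mechanism follows; it is not glibc's resolver code, and every name in it is
illustrative.

#include <stddef.h>

typedef void *(*memset_fn) (void *, int, size_t);

/* Two stand-in implementations; a real build would supply the
   SSE2/AVX2 assembly versions instead.  */
static void *
memset_generic (void *dst, int c, size_t n)
{
  unsigned char *p = dst;
  while (n--)
    *p++ = (unsigned char) c;
  return dst;
}

static void *
memset_fancy (void *dst, int c, size_t n)
{
  return memset_generic (dst, c, n);
}

/* Resolver: runs once at relocation time; the dynamic linker binds all
   later calls of my_memset to whatever function it returns.  */
static memset_fn
resolve_memset (void)
{
  __builtin_cpu_init ();
  return __builtin_cpu_supports ("avx2") ? memset_fancy : memset_generic;
}

void *my_memset (void *, int, size_t)
  __attribute__ ((ifunc ("resolve_memset")));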
sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S (new file, 14 lines)

@@ -0,0 +1,14 @@
#define VEC_SIZE 32
#define VEC(i) ymm##i
#define VMOVU vmovdqu
#define VMOVA vmovdqa

#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
  vmovd d, %xmm0; \
  movq r, %rax; \
  vpbroadcastb %xmm0, %ymm0

#define SECTION(p) p##.avx
#define MEMSET_SYMBOL(p,s) p##_avx2_##s

#include "memset-vec-unaligned-erms.S"
sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S (new file, 17 lines)

@@ -0,0 +1,17 @@
#ifdef HAVE_AVX512_ASM_SUPPORT
# define VEC_SIZE 64
# define VEC(i) zmm##i
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64

# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
  vmovd d, %xmm0; \
  movq r, %rax; \
  vpbroadcastb %xmm0, %xmm0; \
  vpbroadcastq %xmm0, %zmm0

# define SECTION(p) p##.avx512
# define MEMSET_SYMBOL(p,s) p##_avx512_##s

# include "memset-vec-unaligned-erms.S"
#endif
sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S (new file, 16 lines)

@@ -0,0 +1,16 @@
#define VEC_SIZE 16
#define VEC(i) xmm##i
#define VMOVU movdqu
#define VMOVA movdqa

#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
  movd d, %xmm0; \
  movq r, %rax; \
  punpcklbw %xmm0, %xmm0; \
  punpcklwd %xmm0, %xmm0; \
  pshufd $0, %xmm0, %xmm0

#define SECTION(p) p
#define MEMSET_SYMBOL(p,s) p##_sse2_##s

#include "memset-vec-unaligned-erms.S"
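The VDUP_TO_VEC0_AND_SET_RETURN sequence above broadcasts the fill byte into
%xmm0 while moving the destination pointer into %rax as the return value.  A
hedged intrinsics rendering of the SSE2 broadcast half (the same
punpcklbw/punpcklwd/pshufd chain) may make the data movement easier to follow;
the function name is illustrative, not part of the patch.

#include <emmintrin.h>	/* SSE2 intrinsics */
#include <stdint.h>

/* C counterpart of the punpcklbw/punpcklwd/pshufd chain that
   replicates one byte into all 16 lanes of an XMM register.  */
static __m128i
broadcast_byte_sse2 (uint8_t c)
{
  __m128i v = _mm_cvtsi32_si128 (c);   /* movd: byte into lane 0 */
  v = _mm_unpacklo_epi8 (v, v);        /* punpcklbw: byte -> word */
  v = _mm_unpacklo_epi16 (v, v);       /* punpcklwd: word -> dword */
  return _mm_shuffle_epi32 (v, 0);     /* pshufd $0: dword -> all lanes */
}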
sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S (new file, 251 lines)

@@ -0,0 +1,251 @@
/* memset/bzero with unaligned store and rep stosb
   Copyright (C) 2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memset is implemented as:
   1. Use overlapping store to avoid branch.
   2. Force 32-bit displacement for branches to avoid long nop between
      instructions.
   3. If size is less than VEC, use integer register stores.
   4. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
   5. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
   6. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
      4 VEC stores and store 4 * VEC at a time until done.  */

#include <sysdep.h>

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER			vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

#ifndef VZEROUPPER_SHORT_RETURN
# if VEC_SIZE > 16
#  define VZEROUPPER_SHORT_RETURN	vzeroupper
# else
#  define VZEROUPPER_SHORT_RETURN	rep
# endif
#endif

#ifndef MOVQ
# if VEC_SIZE > 16
#  define MOVQ				vmovq
# else
#  define MOVQ				movq
# endif
#endif

/* Threshold to use Enhanced REP STOSB.  Since there is overhead to set
   up REP STOSB operation, REP STOSB isn't faster on short data.  The
   memset micro benchmark in glibc shows that 2KB is the approximate
   value above which REP STOSB becomes faster on processors with
   Enhanced REP STOSB.  Since the stored value is fixed, larger register
   size has minimal impact on threshold.  */
#ifndef REP_STOSB_THRESHOLD
# define REP_STOSB_THRESHOLD		2048
#endif

#ifndef SECTION
# error SECTION is not defined!
#endif

#if !defined USE_MULTIARCH && IS_IN (libc)
	.section SECTION(.text),"ax",@progbits
ENTRY (__bzero)
	movq	%rdi, %rax /* Set return value.  */
	movq	%rsi, %rdx /* Set n.  */
	pxor	%xmm0, %xmm0
	jmp	L(entry_from_bzero)
END (__bzero)
weak_alias (__bzero, bzero)
#endif

#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
#endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned))
L(memset_entry):
	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
L(entry_from_bzero):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret
END (MEMSET_SYMBOL (__memset, unaligned))

#if VEC_SIZE == 16
/* Only used to measure performance of REP STOSB.  */
ENTRY (__memset_erms)
#else
/* Provide a symbol to debugger.  */
ENTRY (MEMSET_SYMBOL (__memset, erms))
#endif
L(stosb):
	movq	%rdx, %rcx
	movzbl	%sil, %eax
	movq	%rdi, %rdx
	rep stosb
	movq	%rdx, %rax
	ret
#if VEC_SIZE == 16
END (__memset_erms)
#else
END (MEMSET_SYMBOL (__memset, erms))
#endif

#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
#endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(stosb_more_2x_vec)
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret

	.p2align 4
L(stosb_more_2x_vec):
	cmpq	$REP_STOSB_THRESHOLD, %rdx
	/* Force 32-bit displacement to avoid long nop between
	   instructions.  */
	ja.d32	L(stosb)
	.p2align 4
L(more_2x_vec):
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_start)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

	.p2align 4
L(loop_start):
	leaq	(VEC_SIZE * 4)(%rdi), %rcx
	VMOVU	%VEC(0), (%rdi)
	andq	$-(VEC_SIZE * 4), %rcx
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
	addq	%rdi, %rdx
	andq	$-(VEC_SIZE * 4), %rdx
	cmpq	%rdx, %rcx
# if VEC_SIZE == 32 || VEC_SIZE == 64
	/* Force 32-bit displacement to avoid long nop between
	   instructions.  */
	je.d32	L(return)
# else
	je	L(return)
# endif
	.p2align 4
L(loop):
	VMOVA	%VEC(0), (%rcx)
	VMOVA	%VEC(0), VEC_SIZE(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
	addq	$(VEC_SIZE * 4), %rcx
	cmpq	%rcx, %rdx
	jne	L(loop)
	VZEROUPPER_SHORT_RETURN
	ret
L(less_vec):
	/* Less than 1 VEC.  */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
#  error Unsupported VEC_SIZE!
# endif
# if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
# endif
# if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
# endif
	MOVQ	%xmm0, %rcx
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movb	%cl, (%rdi)
1:
	VZEROUPPER
	ret
# if VEC_SIZE > 32
	/* From 32 to 63.  No branch when size == 32.  */
L(between_32_63):
	vmovdqu	%ymm0, -32(%rdi,%rdx)
	vmovdqu	%ymm0, (%rdi)
	VZEROUPPER
	ret
# endif
# if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	%xmm0, -16(%rdi,%rdx)
	vmovdqu	%xmm0, (%rdi)
	VZEROUPPER
	ret
# endif
	/* From 8 to 15.  No branch when size == 8.  */
L(between_8_15):
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rcx, (%rdi)
	VZEROUPPER
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	%ecx, -4(%rdi,%rdx)
	movl	%ecx, (%rdi)
	VZEROUPPER
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movw	%cx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	VZEROUPPER
	ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))
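Pulling the pieces together, here is a hedged C sketch of the size dispatch
that the template above implements for the erms variant, following the header
comment's steps and the REP_STOSB_THRESHOLD cut-over.  vec_store and rep_stosb
are stand-ins invented for the sketch, not glibc interfaces, and plain memset
fills in for the paths that are not spelled out.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define VEC_SIZE 32			/* e.g. the AVX2 instantiation */
#define REP_STOSB_THRESHOLD 2048

/* Stand-ins for one unaligned vector store and the rep stosb path.  */
static void vec_store (char *p, int c) { memset (p, c, VEC_SIZE); }
static void rep_stosb (char *p, int c, size_t n) { memset (p, c, n); }

static void *
memset_sketch (void *dst, int c, size_t n)
{
  char *p = dst;
  if (n < VEC_SIZE)
    memset (p, c, n);			/* overlapping integer-register stores (not shown) */
  else if (n <= 2 * VEC_SIZE)
    {					/* 2 overlapping VEC stores */
      vec_store (p, c);
      vec_store (p + n - VEC_SIZE, c);
    }
  else if (n > REP_STOSB_THRESHOLD)
    rep_stosb (p, c, n);		/* erms variant only */
  else if (n <= 4 * VEC_SIZE)
    {					/* 4 overlapping VEC stores */
      vec_store (p, c);
      vec_store (p + VEC_SIZE, c);
      vec_store (p + n - VEC_SIZE, c);
      vec_store (p + n - 2 * VEC_SIZE, c);
    }
  else
    {					/* unaligned head/tail, then aligned 4*VEC loop */
      vec_store (p, c);
      vec_store (p + VEC_SIZE, c);
      vec_store (p + 2 * VEC_SIZE, c);
      vec_store (p + 3 * VEC_SIZE, c);
      vec_store (p + n - VEC_SIZE, c);
      vec_store (p + n - 2 * VEC_SIZE, c);
      vec_store (p + n - 3 * VEC_SIZE, c);
      vec_store (p + n - 4 * VEC_SIZE, c);
      uintptr_t mask = -(uintptr_t) (4 * VEC_SIZE);
      char *cur = (char *) (((uintptr_t) p + 4 * VEC_SIZE) & mask);
      char *end = (char *) (((uintptr_t) p + n) & mask);
      for (; cur != end; cur += 4 * VEC_SIZE)
	memset (cur, c, 4 * VEC_SIZE);	/* 4 aligned VEC stores per iteration */
    }
  return dst;
}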