mirror of
https://sourceware.org/git/glibc.git
synced 2024-12-26 04:31:03 +00:00
ef9c4cb6c7
The difference between memset and wmemset is byte vs int. Add stubs to SSE2/AVX2/AVX512 memset for wmemset with updated constant and size: SSE2 wmemset: shl $0x2,%rdx movd %esi,%xmm0 mov %rdi,%rax pshufd $0x0,%xmm0,%xmm0 jmp entry_from_wmemset SSE2 memset: movd %esi,%xmm0 mov %rdi,%rax punpcklbw %xmm0,%xmm0 punpcklwd %xmm0,%xmm0 pshufd $0x0,%xmm0,%xmm0 entry_from_wmemset: Since the ERMS versions of wmemset requires "rep stosl" instead of "rep stosb", only the vector store stubs of SSE2/AVX2/AVX512 wmemset are added. The SSE2 wmemset is about 3X faster and the AVX2 wmemset is about 6X faster on Haswell. * include/wchar.h (__wmemset_chk): New. * sysdeps/x86_64/memset.S (VDUP_TO_VEC0_AND_SET_RETURN): Renamed to MEMSET_VDUP_TO_VEC0_AND_SET_RETURN. (WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New. (WMEMSET_CHK_SYMBOL): Likewise. (WMEMSET_SYMBOL): Likewise. (__wmemset): Add hidden definition. (wmemset): Add weak hidden definition. * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add wmemset_chk-nonshared. * sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add __wmemset_sse2_unaligned, __wmemset_avx2_unaligned, __wmemset_avx512_unaligned, __wmemset_chk_sse2_unaligned, __wmemset_chk_avx2_unaligned and __wmemset_chk_avx512_unaligned. * sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S (VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ... (MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This. (WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New. (WMEMSET_SYMBOL): Likewise. * sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S (VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ... (MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This. (WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New. (WMEMSET_SYMBOL): Likewise. * sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Updated. (WMEMSET_CHK_SYMBOL): New. (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)): Likewise. (WMEMSET_SYMBOL (__wmemset, unaligned)): Likewise. * sysdeps/x86_64/multiarch/memset.S (WMEMSET_SYMBOL): New. 
(libc_hidden_builtin_def): Also define __GI_wmemset and __GI___wmemset. (weak_alias): New. * sysdeps/x86_64/multiarch/wmemset.c: New file. * sysdeps/x86_64/multiarch/wmemset.h: Likewise. * sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S: Likewise. * sysdeps/x86_64/multiarch/wmemset_chk.c: Likewise. * sysdeps/x86_64/wmemset.c: Likewise. * sysdeps/x86_64/wmemset_chk.c: Likewise.
264 lines
6.6 KiB
ArmAsm
264 lines
6.6 KiB
ArmAsm
/* memset/bzero with unaligned store and rep stosb
|
|
Copyright (C) 2016-2017 Free Software Foundation, Inc.
|
|
This file is part of the GNU C Library.
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with the GNU C Library; if not, see
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
/* memset is implemented as:
|
|
1. Use overlapping store to avoid branch.
|
|
2. If size is less than VEC, use integer register stores.
|
|
3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
|
|
4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
|
|
5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
|
|
4 VEC stores and store 4 * VEC at a time until done. */
|
|
|
|
#include <sysdep.h>
|
|
|
|
/* Names of the _chk entry points.  Unless the including file supplies
   its own naming macro, reuse the one used for the plain entry
   points.  */
#ifndef MEMSET_CHK_SYMBOL
# define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
#endif

#ifndef WMEMSET_CHK_SYMBOL
# define WMEMSET_CHK_SYMBOL(p,s)		WMEMSET_SYMBOL(p, s)
#endif

/* VZEROUPPER is placed in front of the returns below.  It expands to
   vzeroupper only when vectors are wider than 16 bytes and to nothing
   otherwise.  */
#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER			vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Used in front of the "ret" that ends the 4 * VEC store loop:
   vzeroupper for wide vectors, a bare "rep" prefix for 16-byte
   vectors.  */
#ifndef VZEROUPPER_SHORT_RETURN
# if VEC_SIZE > 16
#  define VZEROUPPER_SHORT_RETURN	vzeroupper
# else
#  define VZEROUPPER_SHORT_RETURN	rep
# endif
#endif

/* Instruction used to copy the low quadword of %xmm0 into a general
   purpose register: the VEX form for wide vectors, the legacy form
   for 16-byte vectors.  */
#ifndef MOVQ
# if VEC_SIZE > 16
#  define MOVQ				vmovq
# else
#  define MOVQ				movq
# endif
#endif

/* Size above which the unaligned_erms entry switches to REP STOSB.
   Setting up REP STOSB has a fixed cost, so it does not pay off on
   short data; the glibc memset micro benchmark puts the break-even
   point at roughly 2KB on processors with Enhanced REP STOSB.  The
   stored value is fixed, so the vector register size has little
   influence on this threshold.  */
#ifndef REP_STOSB_THRESHOLD
# define REP_STOSB_THRESHOLD		2048
#endif

/* The including file must say which section the code goes into.  */
#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
|
|
#if VEC_SIZE == 16 && IS_IN (libc)
/* __bzero: only assembled for the 16-byte vector build inside libc.
   In:    %rdi = destination, %rsi = length in bytes.
   Sets up the registers the shared memset body expects (%rax = return
   value, %rdx = length, %xmm0 = all-zero fill pattern) and continues
   at L(entry_from_bzero).  */
ENTRY (__bzero)
	movq	%rdi, %rax		/* Return value = destination.  */
	movq	%rsi, %rdx		/* Length moves to where memset keeps it.  */
	pxor	%xmm0, %xmm0		/* Fill pattern: all zero bytes.  */
	jmp	L(entry_from_bzero)
END (__bzero)
weak_alias (__bzero, bzero)
#endif
|
|
|
|
#if IS_IN (libc)
# if defined SHARED
/* __wmemset_chk: fortified entry.  Branches to __chk_fail when %rcx is
   below %rdx (unsigned).  %rcx is presumably the destination object
   size supplied by the fortify caller, in the same units as %rdx --
   confirm against the C prototype.  There is no "ret" after the
   check, so on success control continues into __wmemset below (this
   relies on END_CHK emitting no instructions; it is defined outside
   this file).  */
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
# endif

/* __wmemset: the count in %rdx is multiplied by 4 to obtain a byte
   length, the fill pattern is set up by
   WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (supplied by the including file;
   by its name it broadcasts %esi into VEC(0) and sets the return value
   from %rdi), and the rest is shared with memset.  */
ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
	shlq	$2, %rdx		/* Element count -> byte count.  */
	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
	jmp	L(entry_from_bzero)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif
|
|
|
|
#if defined SHARED && IS_IN (libc)
/* __memset_chk: fortified entry for the unaligned variant.  Branches
   to __chk_fail when %rcx is below the length in %rdx (unsigned);
   %rcx is presumably the destination object size -- confirm against
   the C prototype.  There is no "ret" here, so a passing check
   continues into the memset entry point that follows.  */
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif
|
|
|
|
/* memset, vector-store variant.
   In:    %rdi = destination, %esi = fill value, %rdx = length in bytes.
   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN is supplied by the including
   file; by its name it broadcasts the fill value into VEC(0) and sets
   the return value from %rdi.  __bzero and __wmemset join at
   L(entry_from_bzero) with those registers already prepared.  */
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
L(entry_from_bzero):
	/* Dispatch on the length: below one vector, above two vectors,
	   or the VEC_SIZE .. 2 * VEC_SIZE case handled right here.  */
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
	/* VEC_SIZE <= length <= 2 * VEC_SIZE: one store anchored at the
	   end and one at the start.  They overlap as needed, so a length
	   of exactly VEC_SIZE needs no extra branch.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret
|
|
#if defined USE_MULTIARCH && IS_IN (libc)
/* Multiarch build inside libc: close the plain unaligned entry here
   and emit the REP STOSB based variants.  */
END (MEMSET_SYMBOL (__memset, unaligned))

# if VEC_SIZE == 16
/* Only used to measure performance of REP STOSB.  */
ENTRY (__memset_erms)
# else
/* Provide a symbol to debugger.  */
ENTRY (MEMSET_SYMBOL (__memset, erms))
# endif
L(stosb):
	/* Issue vzeroupper before rep stosb.  */
	VZEROUPPER
	movq	%rdx, %rcx		/* REP count = length.  */
	movzbl	%sil, %eax		/* %al = fill byte for STOSB.  */
	movq	%rdi, %rdx		/* Keep the destination for the return value.  */
	rep stosb
	movq	%rdx, %rax		/* Return the original destination.  */
	ret
# if VEC_SIZE == 16
END (__memset_erms)
# else
END (MEMSET_SYMBOL (__memset, erms))
# endif

# if defined SHARED && IS_IN (libc)
/* Fortified entry for the unaligned_erms variant: branch to
   __chk_fail when %rcx is below the length in %rdx (unsigned),
   otherwise continue into the entry point that follows (there is no
   "ret" here).  */
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif

/* memset with vector stores for short lengths and REP STOSB above
   REP_STOSB_THRESHOLD.  Same register usage as the unaligned entry.  */
ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(stosb_more_2x_vec)
	/* VEC_SIZE <= length <= 2 * VEC_SIZE: two overlapping stores, no
	   extra branch when the length is exactly VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret

L(stosb_more_2x_vec):
	/* More than two vectors: use REP STOSB only above the
	   threshold, otherwise fall through to L(more_2x_vec).  */
	cmpq	$REP_STOSB_THRESHOLD, %rdx
	ja	L(stosb)
#endif
|
|
/* Length is above 2 * VEC_SIZE here.  %rdi = destination,
   %rdx = length, VEC(0) = fill pattern.  */
L(more_2x_vec):
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_start)
	/* Up to 4 * VEC_SIZE: two stores from the start and two anchored
	   at the end; they overlap in the middle as needed.  */
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

/* Length is above 4 * VEC_SIZE.  Four unaligned stores cover the
   first 4 * VEC_SIZE bytes and four more cover the last
   4 * VEC_SIZE bytes; the aligned loop then fills what lies
   between the two 4 * VEC_SIZE boundaries computed below.  */
L(loop_start):
	/* %rcx = (destination + 4 * VEC_SIZE) rounded down to a
	   4 * VEC_SIZE boundary: where the aligned loop starts.  */
	leaq	(VEC_SIZE * 4)(%rdi), %rcx
	VMOVU	%VEC(0), (%rdi)
	andq	$-(VEC_SIZE * 4), %rcx
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
	/* %rdx = end address rounded down to a 4 * VEC_SIZE boundary:
	   where the aligned loop stops.  */
	addq	%rdi, %rdx
	andq	$-(VEC_SIZE * 4), %rdx
	/* Nothing left between the two boundaries: already done.  */
	cmpq	%rdx, %rcx
	je	L(return)
L(loop):
	/* %rcx is 4 * VEC_SIZE aligned, so aligned stores are used.  */
	VMOVA	%VEC(0), (%rcx)
	VMOVA	%VEC(0), VEC_SIZE(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
	addq	$(VEC_SIZE * 4), %rcx
	cmpq	%rcx, %rdx
	jne	L(loop)
	VZEROUPPER_SHORT_RETURN
	ret
|
|
/* Length is below VEC_SIZE.  VEC_SIZE is at most 64 (checked below),
   so the whole length fits in %dl and byte compares are enough.  */
L(less_vec):
	/* Less than 1 VEC.  */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
#  error Unsupported VEC_SIZE!
# endif
# if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
# endif
# if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
# endif
	/* Below 16 bytes: continue with the low 8 bytes of the fill
	   pattern in %rcx.  */
	MOVQ	%xmm0, %rcx
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	/* Length is 0..3.  A single compare against 1 separates the
	   three cases: above -> 2..3, below -> 0, equal -> 1.  */
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movb	%cl, (%rdi)
1:
	VZEROUPPER
	ret
# if VEC_SIZE > 32
	/* From 32 to 63.  No branch when size == 32.  */
L(between_32_63):
	vmovdqu	%ymm0, -32(%rdi,%rdx)
	vmovdqu	%ymm0, (%rdi)
	VZEROUPPER
	ret
# endif
# if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	%xmm0, -16(%rdi,%rdx)
	vmovdqu	%xmm0, (%rdi)
	VZEROUPPER
	ret
# endif
	/* From 8 to 15.  No branch when size == 8.  */
L(between_8_15):
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rcx, (%rdi)
	VZEROUPPER
	ret
	/* From 4 to 7.  No branch when size == 4.  */
L(between_4_7):
	movl	%ecx, -4(%rdi,%rdx)
	movl	%ecx, (%rdi)
	VZEROUPPER
	ret
	/* From 2 to 3.  No branch when size == 2.  */
L(between_2_3):
	movw	%cx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	VZEROUPPER
	ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))
|