mirror of
https://sourceware.org/git/glibc.git
synced 2025-01-07 10:00:07 +00:00
5a82c74822
Also, change sources.redhat.com to sourceware.org. This patch was automatically generated by running the following shell script, which uses GNU sed, and which avoids modifying files imported from upstream: sed -ri ' s,(http|ftp)(://(.*\.)?(gnu|fsf|sourceware)\.org($|[^.]|\.[^a-z])),https\2,g s,(http|ftp)(://(.*\.)?)sources\.redhat\.com($|[^.]|\.[^a-z]),https\2sourceware.org\4,g ' \ $(find $(git ls-files) -prune -type f \ ! -name '*.po' \ ! -name 'ChangeLog*' \ ! -path COPYING ! -path COPYING.LIB \ ! -path manual/fdl-1.3.texi ! -path manual/lgpl-2.1.texi \ ! -path manual/texinfo.tex ! -path scripts/config.guess \ ! -path scripts/config.sub ! -path scripts/install-sh \ ! -path scripts/mkinstalldirs ! -path scripts/move-if-change \ ! -path INSTALL ! -path locale/programs/charmap-kw.h \ ! -path po/libc.pot ! -path sysdeps/gnu/errlist.c \ ! '(' -name configure \ -execdir test -f configure.ac -o -f configure.in ';' ')' \ ! '(' -name preconfigure \ -execdir test -f preconfigure.ac ';' ')' \ -print) and then by running 'make dist-prepare' to regenerate files built from the altered files, and then executing the following to cleanup: chmod a+x sysdeps/unix/sysv/linux/riscv/configure # Omit irrelevant whitespace and comment-only changes, # perhaps from a slightly-different Autoconf version. git checkout -f \ sysdeps/csky/configure \ sysdeps/hppa/configure \ sysdeps/riscv/configure \ sysdeps/unix/sysv/linux/csky/configure # Omit changes that caused a pre-commit check to fail like this: # remote: *** error: sysdeps/powerpc/powerpc64/ppc-mcount.S: trailing lines git checkout -f \ sysdeps/powerpc/powerpc64/ppc-mcount.S \ sysdeps/unix/sysv/linux/s390/s390-64/syscall.S # Omit change that caused a pre-commit check to fail like this: # remote: *** error: sysdeps/sparc/sparc64/multiarch/memcpy-ultra3.S: last line does not end in newline git checkout -f sysdeps/sparc/sparc64/multiarch/memcpy-ultra3.S
283 lines
7.0 KiB
ArmAsm
283 lines
7.0 KiB
ArmAsm
/* memset/bzero with unaligned store and rep stosb
|
|
Copyright (C) 2016-2019 Free Software Foundation, Inc.
|
|
This file is part of the GNU C Library.
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with the GNU C Library; if not, see
|
|
<https://www.gnu.org/licenses/>. */
|
|
|
|
/* memset is implemented as:
|
|
1. Use overlapping store to avoid branch.
|
|
2. If size is less than VEC_SIZE, use integer register stores (or narrower vector stores when VEC_SIZE > 16).
|
|
3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
|
|
4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
|
|
5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
|
|
4 VEC stores and store 4 * VEC at a time until done. */
|
|
|
|
#include <sysdep.h>
|
|
|
|
/* Default name builder for the memset _chk entry points: unless the
   including file supplies its own MEMSET_CHK_SYMBOL, the name is built
   with MEMSET_SYMBOL.  */
#ifndef MEMSET_CHK_SYMBOL
|
|
# define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s)
|
|
#endif
|
|
|
|
/* Default name builder for the wmemset _chk entry point: unless the
   including file supplies its own WMEMSET_CHK_SYMBOL, the name is built
   with WMEMSET_SYMBOL.  */
#ifndef WMEMSET_CHK_SYMBOL
|
|
# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
|
|
#endif
|
|
|
|
/* VZEROUPPER is placed before the returns and before rep stosb in this
   file.  Unless overridden by the including file, it expands to the
   vzeroupper instruction when vectors are wider than 16 bytes and to
   nothing otherwise.  */
#ifndef VZEROUPPER
|
|
# if VEC_SIZE > 16
|
|
# define VZEROUPPER vzeroupper
|
|
# else
|
|
# define VZEROUPPER
|
|
# endif
|
|
#endif
|
|
|
|
/* VZEROUPPER_SHORT_RETURN is emitted immediately before the ret that
   follows the conditional jump closing L(loop).  For vectors wider than
   16 bytes it is vzeroupper; otherwise it is a bare `rep' prefix, so the
   return assembles as `rep' followed by `ret'.
   NOTE(review): presumably the `rep ret' idiom for branch predictors
   that mishandle a ret directly after a conditional jump -- confirm.  */
#ifndef VZEROUPPER_SHORT_RETURN
|
|
# if VEC_SIZE > 16
|
|
# define VZEROUPPER_SHORT_RETURN vzeroupper
|
|
# else
|
|
# define VZEROUPPER_SHORT_RETURN rep
|
|
# endif
|
|
#endif
|
|
|
|
/* MOVQ is the instruction used in L(less_vec) to copy the low 64 bits
   of %xmm0 into %rcx.  Unless overridden, it is vmovq when VEC_SIZE is
   above 16 and movq otherwise.
   NOTE(review): presumably vmovq is chosen so the wider-vector variants
   stay on VEX-encoded instructions -- confirm.  */
#ifndef MOVQ
|
|
# if VEC_SIZE > 16
|
|
# define MOVQ vmovq
|
|
# else
|
|
# define MOVQ movq
|
|
# endif
|
|
#endif
|
|
|
|
/* Threshold to use Enhanced REP STOSB. Since there is overhead to set
|
|
up REP STOSB operation, REP STOSB isn't faster on short data. The
|
|
memset micro benchmark in glibc shows that 2KB is the approximate
|
|
value above which REP STOSB becomes faster on processors with
|
|
Enhanced REP STOSB. Since the stored value is fixed, larger register
|
|
size has minimal impact on threshold. */
|
|
/* Byte count compared against the length in L(stosb_more_2x_vec):
   lengths strictly above this value branch to L(stosb), everything else
   stays on the vector-store path.  May be predefined by the including
   file.  */
#ifndef REP_STOSB_THRESHOLD
|
|
# define REP_STOSB_THRESHOLD 2048
|
|
#endif
|
|
|
|
/* SECTION must be supplied by the including file; the build stops here
   if it is missing.  */
#ifndef SECTION
|
|
# error SECTION is not defined!
|
|
#endif
|
|
|
|
/* Everything below is placed in the section named by SECTION(.text),
   flagged allocatable and executable ("ax") with progbits contents.  */
.section SECTION(.text),"ax",@progbits
|
|
/* __bzero (weak alias bzero): built only for the VEC_SIZE == 16 variant
   inside libc.
   In:  %rdi = destination, %rsi = length.
   Rearranges the arguments into the layout the memset path uses
   (%rax = destination, %rdx = length, %xmm0 = fill pattern) and joins
   that path at L(entry_from_bzero).  */
#if VEC_SIZE == 16 && IS_IN (libc)
|
|
ENTRY (__bzero)
|
|
mov %RDI_LP, %RAX_LP /* Set return value. */
|
|
mov %RSI_LP, %RDX_LP /* Set n. */
|
|
/* The fill pattern is all-zero bytes.  */
pxor %xmm0, %xmm0
|
|
jmp L(entry_from_bzero)
|
|
END (__bzero)
|
|
weak_alias (__bzero, bzero)
|
|
#endif
|
|
|
|
/* wmemset entry points, built only inside libc.  */
#if IS_IN (libc)
|
|
# if defined SHARED
|
|
/* Checked variant (shared builds only).  Branches to __chk_fail when
   %rcx is below %rdx (unsigned compare).
   NOTE(review): presumably %rcx is the destination object size and %rdx
   the requested element count, both in the same units -- confirm
   against the _chk callers.
   There is no ret after the check: on success execution is expected to
   continue into the __wmemset entry that follows (depends on what
   ENTRY/END_CHK emit -- confirm).  */
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
|
|
cmp %RDX_LP, %RCX_LP
|
|
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
|
|
# endif
|
|
|
|
/* __wmemset: %rdi = destination, %esi = fill value, %rdx = element
   count.  Converts the count to bytes and joins the shared memset path
   at L(entry_from_bzero).  */
ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
|
|
/* Scale the element count by 4 so %rdx holds the length in bytes, which
   is the unit the shared code works in.  */
shl $2, %RDX_LP
|
|
/* Macro supplied by the including file.
   NOTE(review): presumably broadcasts the 32-bit value in %esi across
   %VEC(0) and copies %rdi to %rax as the return value -- confirm.  */
WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
|
jmp L(entry_from_bzero)
|
|
END (WMEMSET_SYMBOL (__wmemset, unaligned))
|
|
#endif
|
|
|
|
/* Checked memset entry for the `unaligned' variant (shared libc only).
   Branches to __chk_fail when %rcx is below %rdx (unsigned compare).
   NOTE(review): presumably %rcx is the destination object size and %rdx
   the requested length -- confirm against the _chk callers.
   There is no ret after the check: on success execution is expected to
   continue into the __memset entry defined next (depends on what
   ENTRY/END_CHK emit -- confirm).  */
#if defined SHARED && IS_IN (libc)
|
|
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
|
|
cmp %RDX_LP, %RCX_LP
|
|
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
|
|
#endif
|
|
|
|
/* __memset, `unaligned' variant (vector stores only, no rep stosb).
   In:  %rdi = destination, %esi = fill value, %rdx = length in bytes.
   Dispatches on the length: below VEC_SIZE goes to L(less_vec), above
   2 * VEC_SIZE goes to L(more_2x_vec), and the range in between is
   handled right here with two stores.  */
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
|
|
/* Macro supplied by the including file.
   NOTE(review): presumably broadcasts the fill byte from %esi across
   %VEC(0) and copies %rdi to %rax as the return value -- confirm.  */
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
|
# ifdef __ILP32__
|
|
/* Clear the upper 32 bits. */
|
|
mov %edx, %edx
|
|
# endif
|
|
/* Shared entry for __bzero and __wmemset, which jump here with
   %rdx = byte length and the fill pattern already in vector register 0.  */
L(entry_from_bzero):
|
|
cmpq $VEC_SIZE, %rdx
|
|
jb L(less_vec)
|
|
cmpq $(VEC_SIZE * 2), %rdx
|
|
ja L(more_2x_vec)
|
|
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
|
/* One store ends exactly at destination + length, the other starts at
   the destination; they overlap whenever length < 2 * VEC_SIZE.  */
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
|
|
VMOVU %VEC(0), (%rdi)
|
|
VZEROUPPER
|
|
ret
|
|
/* In multiarch libc builds the `unaligned' function ends here and the
   erms entry points follow; this conditional is closed after
   L(stosb_more_2x_vec).  */
#if defined USE_MULTIARCH && IS_IN (libc)
|
|
END (MEMSET_SYMBOL (__memset, unaligned))
|
|
|
|
/* Pure rep stosb memset.  The VEC_SIZE == 16 build exports it as
   __memset_erms (with a checked __memset_chk_erms in front); the other
   builds only emit a hidden MEMSET_SYMBOL (__memset, erms) symbol whose
   body is L(stosb).  */
# if VEC_SIZE == 16
|
|
/* Branches to __chk_fail when %rcx is below %rdx (unsigned compare).
   NOTE(review): presumably %rcx is the destination object size --
   confirm.  No ret follows; on success execution is expected to
   continue into __memset_erms below (depends on ENTRY/END -- confirm).  */
ENTRY (__memset_chk_erms)
|
|
cmp %RDX_LP, %RCX_LP
|
|
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
END (__memset_chk_erms)
|
|
|
|
/* Only used to measure performance of REP STOSB. */
|
|
ENTRY (__memset_erms)
|
|
/* Skip zero length. */
|
|
test %RDX_LP, %RDX_LP
|
|
jnz L(stosb)
|
|
/* Zero length: nothing to store, return the destination.  */
movq %rdi, %rax
|
|
ret
|
|
# else
|
|
/* Provide a hidden symbol to debugger. */
|
|
.hidden MEMSET_SYMBOL (__memset, erms)
|
|
ENTRY (MEMSET_SYMBOL (__memset, erms))
|
|
# endif
|
|
/* rep stosb path.  In: %rdi = destination, %sil = fill byte,
   %rdx = length.  Also the target of the branch taken from
   L(stosb_more_2x_vec) for lengths above REP_STOSB_THRESHOLD.  */
L(stosb):
|
|
/* Issue vzeroupper before rep stosb. */
|
|
VZEROUPPER
|
|
/* rep stosb takes its count in %rcx and the byte to store in %al.  */
mov %RDX_LP, %RCX_LP
|
|
movzbl %sil, %eax
|
|
/* rep stosb advances %rdi, so keep the original destination in %rdx
   (free now that the count lives in %rcx) for the return value.  */
mov %RDI_LP, %RDX_LP
|
|
rep stosb
|
|
mov %RDX_LP, %RAX_LP
|
|
ret
|
|
# if VEC_SIZE == 16
|
|
END (__memset_erms)
|
|
# else
|
|
END (MEMSET_SYMBOL (__memset, erms))
|
|
# endif
|
|
|
|
/* Checked memset entry for the `unaligned_erms' variant (shared libc
   only).  Branches to __chk_fail when %rcx is below %rdx (unsigned
   compare).
   NOTE(review): presumably %rcx is the destination object size and %rdx
   the requested length -- confirm against the _chk callers.
   There is no ret after the check: on success execution is expected to
   continue into the unaligned_erms __memset entry defined next (depends
   on what ENTRY/END_CHK emit -- confirm).  */
# if defined SHARED && IS_IN (libc)
|
|
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
|
|
cmp %RDX_LP, %RCX_LP
|
|
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
|
|
# endif
|
|
|
|
/* __memset, `unaligned_erms' variant: same dispatch as the `unaligned'
   entry, except that lengths above 2 * VEC_SIZE are first compared with
   REP_STOSB_THRESHOLD and the larger ones are handed to rep stosb.
   In:  %rdi = destination, %esi = fill value, %rdx = length in bytes.  */
ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
|
|
/* Macro supplied by the including file.
   NOTE(review): presumably broadcasts the fill byte from %esi across
   %VEC(0) and copies %rdi to %rax as the return value -- confirm.  */
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
|
# ifdef __ILP32__
|
|
/* Clear the upper 32 bits. */
|
|
mov %edx, %edx
|
|
# endif
|
|
cmp $VEC_SIZE, %RDX_LP
|
|
jb L(less_vec)
|
|
cmp $(VEC_SIZE * 2), %RDX_LP
|
|
ja L(stosb_more_2x_vec)
|
|
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
|
/* One store ends exactly at destination + length, the other starts at
   the destination; they overlap whenever length < 2 * VEC_SIZE.  */
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
|
|
VMOVU %VEC(0), (%rdi)
|
|
VZEROUPPER
|
|
ret
|
|
|
|
/* Length is above 2 * VEC_SIZE.  Above REP_STOSB_THRESHOLD use rep
   stosb; otherwise fall through into L(more_2x_vec), which follows the
   end of this conditional.  */
L(stosb_more_2x_vec):
|
|
cmpq $REP_STOSB_THRESHOLD, %rdx
|
|
ja L(stosb)
|
|
#endif
|
|
/* Length is above 2 * VEC_SIZE.  %rdi = destination, %rdx = length,
   vector register 0 = fill pattern.  Lengths above 4 * VEC_SIZE go to
   the aligned loop; the rest are covered by four stores.  */
L(more_2x_vec):
|
|
cmpq $(VEC_SIZE * 4), %rdx
|
|
ja L(loop_start)
|
|
/* Two vectors from the start and two ending exactly at destination +
   length; the middle ones overlap when length < 4 * VEC_SIZE.  */
VMOVU %VEC(0), (%rdi)
|
|
VMOVU %VEC(0), VEC_SIZE(%rdi)
|
|
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
|
|
VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
|
|
/* Common exit, also reached from L(loop_start) when no aligned block
   remains to be stored.  */
L(return):
|
|
VZEROUPPER
|
|
ret
|
|
|
|
/* Length is above 4 * VEC_SIZE.  Store 4 vectors at the head and 4 at
   the tail with unaligned stores, then fill the 4 * VEC_SIZE-aligned
   region in between, 4 vectors per iteration.
   %rcx = cursor (aligned start), %rdx = aligned end after the setup.
   The head/tail stores are interleaved with the address arithmetic.  */
L(loop_start):
|
|
/* %rcx = (destination + 4 * VEC_SIZE) rounded down to a multiple of
   4 * VEC_SIZE (the rounding is the andq two lines below): the first
   such boundary above the destination.  The head stores cover
   everything before it.  */
leaq (VEC_SIZE * 4)(%rdi), %rcx
|
|
VMOVU %VEC(0), (%rdi)
|
|
andq $-(VEC_SIZE * 4), %rcx
|
|
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
|
|
VMOVU %VEC(0), VEC_SIZE(%rdi)
|
|
VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
|
|
VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
|
|
VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
|
|
VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi)
|
|
VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
|
|
/* %rdx = (destination + length) rounded down to a multiple of
   4 * VEC_SIZE.  The tail stores above already cover everything from
   there to the end.  */
addq %rdi, %rdx
|
|
andq $-(VEC_SIZE * 4), %rdx
|
|
/* Aligned start equals aligned end: head and tail stores already wrote
   the whole buffer.  */
cmpq %rdx, %rcx
|
|
je L(return)
|
|
/* Fill [%rcx, %rdx) in 4 * VEC_SIZE steps.  %rcx is a multiple of
   4 * VEC_SIZE, so every address used below is VEC_SIZE-aligned.
   NOTE(review): VMOVA is defined by the including file, presumably as
   the aligned-store instruction -- confirm.  */
L(loop):
|
|
VMOVA %VEC(0), (%rcx)
|
|
VMOVA %VEC(0), VEC_SIZE(%rcx)
|
|
VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx)
|
|
VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx)
|
|
addq $(VEC_SIZE * 4), %rcx
|
|
cmpq %rcx, %rdx
|
|
jne L(loop)
|
|
/* Expands to vzeroupper, or to a `rep' prefix for the ret below,
   depending on VEC_SIZE.  */
VZEROUPPER_SHORT_RETURN
|
|
ret
|
|
/* Length is below VEC_SIZE (both entry points branch here with jb after
   comparing against VEC_SIZE).  Since VEC_SIZE is at most 64, the
   length fits in %dl and byte compares are enough.  Dispatch to a
   handler for each power-of-two size class.  */
L(less_vec):
|
|
/* Less than 1 VEC. */
|
|
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
|
|
# error Unsupported VEC_SIZE!
|
|
# endif
|
|
# if VEC_SIZE > 32
|
|
cmpb $32, %dl
|
|
jae L(between_32_63)
|
|
# endif
|
|
# if VEC_SIZE > 16
|
|
cmpb $16, %dl
|
|
jae L(between_16_31)
|
|
# endif
|
|
/* Below 16 bytes the stores use integer registers: copy the low 8
   bytes of the fill pattern from %xmm0 into %rcx.  */
MOVQ %xmm0, %rcx
|
|
cmpb $8, %dl
|
|
jae L(between_8_15)
|
|
cmpb $4, %dl
|
|
jae L(between_4_7)
|
|
/* One compare, three outcomes: above 1 -> 2 or 3 bytes; below 1 ->
   zero length, skip the store; equal -> store a single byte.  */
cmpb $1, %dl
|
|
ja L(between_2_3)
|
|
jb 1f
|
|
movb %cl, (%rdi)
|
|
1:
|
|
VZEROUPPER
|
|
ret
|
|
/* Sub-vector handlers that still use vector registers; each exists
   only when VEC_SIZE is wide enough for that size class to be "less
   than one VEC".  Each writes one store ending exactly at destination +
   length and one starting at the destination (%rdi = destination,
   %rdx = length); the two overlap unless the length is exactly twice
   the store width.  */
# if VEC_SIZE > 32
|
|
/* From 32 to 63. No branch when size == 32. */
|
|
L(between_32_63):
|
|
vmovdqu %ymm0, -32(%rdi,%rdx)
|
|
vmovdqu %ymm0, (%rdi)
|
|
VZEROUPPER
|
|
ret
|
|
# endif
|
|
# if VEC_SIZE > 16
|
|
/* From 16 to 31. No branch when size == 16. */
|
|
L(between_16_31):
|
|
vmovdqu %xmm0, -16(%rdi,%rdx)
|
|
vmovdqu %xmm0, (%rdi)
|
|
VZEROUPPER
|
|
ret
|
|
# endif
|
|
/* Integer-register handlers for lengths 2..15.  %rdi = destination,
   %rdx = length, %rcx = low 8 bytes of the fill pattern (loaded by the
   MOVQ in L(less_vec)).  Each writes one store ending exactly at
   destination + length and one starting at the destination, so the two
   overlap instead of branching on the exact length.  */
/* From 8 to 15. No branch when size == 8. */
|
|
L(between_8_15):
|
|
movq %rcx, -8(%rdi,%rdx)
|
|
movq %rcx, (%rdi)
|
|
VZEROUPPER
|
|
ret
|
|
L(between_4_7):
|
|
/* From 4 to 7. No branch when size == 4. */
|
|
movl %ecx, -4(%rdi,%rdx)
|
|
movl %ecx, (%rdi)
|
|
VZEROUPPER
|
|
ret
|
|
L(between_2_3):
|
|
/* From 2 to 3. No branch when size == 2. */
|
|
movw %cx, -2(%rdi,%rdx)
|
|
movw %cx, (%rdi)
|
|
VZEROUPPER
|
|
ret
|
|
/* Closes the unaligned_erms entry; the shared L(more_2x_vec),
   L(loop_start) and L(less_vec) code above sits inside it.  */
END (MEMSET_SYMBOL (__memset, unaligned_erms))
|