glibc/sysdeps/x86_64/multiarch/strcpy-avx2.S
Siddhesh Poyarekar 30891f35fa Remove "Contributed by" lines
We stopped adding "Contributed by" or similar lines in sources in 2012
in favour of git logs and keeping the Contributors section of the
glibc manual up to date.  Removing these lines makes the license
header a bit more consistent across files and also removes the
possibility of error in attribution when license blocks or files are
copied across since the contributed-by lines don't actually reflect
reality in those cases.

Move all "Contributed by" and similar lines (Written by, Test by,
etc.) into a new file CONTRIBUTED-BY to retain record of these
contributions.  These contributors are also mentioned in
manual/contrib.texi, so we just maintain this additional record as a
courtesy to the earlier developers.

The following scripts were used to filter a list of files to edit in
place and to clean up the CONTRIBUTED-BY file respectively.  These
were not added to the glibc sources because they're not expected to be
of any use in future given that this is a one time task:

https://gist.github.com/siddhesh/b5ecac94eabfd72ed2916d6d8157e7dc
https://gist.github.com/siddhesh/15ea1f5e435ace9774f485030695ee02

Reviewed-by: Carlos O'Donell <carlos@redhat.com>
2021-09-03 22:06:44 +05:30

1001 lines
20 KiB
ArmAsm

/* strcpy with AVX2
Copyright (C) 2011-2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* Build-time configuration for this AVX2 copy template.  Every knob
   below is wrapped in #ifndef, so a file that includes this one may
   pre-define it; the values here are the defaults.  */
#if IS_IN (libc)
/* sysdep.h and the default symbol name are only pulled in for the
   stand-alone build.  NOTE(review): with USE_AS_STRCAT the includer
   presumably provides both -- confirm against the strcat wrapper.  */
# ifndef USE_AS_STRCAT
# include <sysdep.h>
# ifndef STRCPY
# define STRCPY __strcpy_avx2
# endif
# endif
/* Number of bytes in a vector register */
# ifndef VEC_SIZE
# define VEC_SIZE 32
# endif
/* Instruction used to clear the upper vector state.  NOTE(review): it
   is not referenced by name in this file; presumably consumed by the
   *_RETURN macros from sysdep.h -- confirm.  */
# ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif
/* Section name builder: SECTION(.text) expands to .text.avx.  */
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
/* zero register */
/* Cleared once with vpxor at function start and used afterwards as the
   comparand for NUL detection and as the source of zero fill.  */
#define xmmZ xmm0
#define ymmZ ymm0
/* mask register */
/* Receives the byte-wise "== 0" compare results that are then turned
   into a bit mask with vpmovmskb.  */
#define ymmM ymm1
/* Function entry.  The ENTRY and the register set-up right below are
   emitted only for the stand-alone build; with USE_AS_STRCAT the code
   starts at the vpxor and expects rdi/rsi/rcx (and r8 for the length
   limited variant) to be prepared already.  NOTE(review): that
   contract lives in the including file -- confirm there.

   Register roles visible in this block:
     rdi  destination pointer
     rsi  source pointer
     rdx  on entry: length limit when USE_AS_STRNCPY; later the NUL mask
     rcx  copy of the source pointer, reduced to its misalignment
     r8   length limit (USE_AS_STRNCPY only)
     r10  scratch: bytes covered by the vectors checked so far
     rax  return value (original destination unless USE_AS_STPCPY)  */
# ifndef USE_AS_STRCAT
.section SECTION(.text),"ax",@progbits
ENTRY (STRCPY)
# ifdef USE_AS_STRNCPY
/* A zero length limit copies nothing.  */
mov %RDX_LP, %R8_LP
test %R8_LP, %R8_LP
jz L(ExitZero)
# endif
mov %rsi, %rcx
# ifndef USE_AS_STPCPY
mov %rdi, %rax /* save result */
# endif
# endif
vpxor %xmmZ, %xmmZ, %xmmZ
/* ecx = offset of the source inside its (VEC_SIZE * 4)-aligned block.
   When that offset is at most VEC_SIZE * 2, two unaligned VEC_SIZE
   loads starting at the source stay inside the same aligned block, so
   the simpler path can be taken.  */
and $((VEC_SIZE * 4) - 1), %ecx
cmp $(VEC_SIZE * 2), %ecx
jbe L(SourceStringAlignmentLessTwoVecSize)
/* Otherwise round the source down to a VEC_SIZE boundary and keep the
   misalignment (0 .. VEC_SIZE - 1) in ecx.  */
and $-VEC_SIZE, %rsi
and $(VEC_SIZE - 1), %ecx
/* NUL mask of the first aligned vector; the shift discards the bits
   that belong to bytes in front of the real start of the string.  */
vpcmpeqb (%rsi), %ymmZ, %ymmM
vpmovmskb %ymmM, %edx
shr %cl, %rdx
# ifdef USE_AS_STRNCPY
/* r10 = number of string bytes available in this first vector (one
   more for the plain strncpy flavour).  If the limit does not reach
   past it, finish in the length-limited tail code.  */
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
mov $VEC_SIZE, %r10
sub %rcx, %r10
cmp %r10, %r8
# else
mov $(VEC_SIZE + 1), %r10
sub %rcx, %r10
cmp %r10, %r8
# endif
jbe L(CopyVecSizeTailCase2OrCase3)
# endif
/* NUL inside the first vector: short copy.  */
test %edx, %edx
jnz L(CopyVecSizeTail)
/* Same check for the second aligned vector.  */
vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
vpmovmskb %ymm2, %edx
# ifdef USE_AS_STRNCPY
add $VEC_SIZE, %r10
cmp %r10, %r8
jbe L(CopyTwoVecSizeCase2OrCase3)
# endif
test %edx, %edx
jnz L(CopyTwoVecSize)
/* No NUL in either vector: copy the first VEC_SIZE bytes from the real
   (unaligned) source start and fall through into the unrolled code.  */
vmovdqu (%rsi, %rcx), %ymm2 /* copy VEC_SIZE bytes */
vmovdqu %ymm2, (%rdi)
/* If source address alignment != destination address alignment */
/* Unrolled section: aligned loads from the source, unaligned stores to
   the destination, one VEC_SIZE vector per step.

   On entry rsi is VEC_SIZE-aligned and rcx holds the original source
   misalignment.  rdi is moved back by that amount so that the same
   index (rcx, reused below as a running offset) addresses matching
   bytes in both buffers.

   Each step: load the next vector, build its NUL mask in edx, advance
   rcx, and only store the vector once it is known to contain no NUL.
   The vector registers rotate (ymm2, ymm3, ymm4, ymm2, ymm2, ymm3);
   for plain strncpy the branch target CopyVecSizeUnalignedVecN names
   the register that holds the vector with the NUL.  */
.p2align 4
L(UnalignVecSizeBoth):
sub %rcx, %rdi
# ifdef USE_AS_STRNCPY
/* Account for the bytes in front of the string start.  The sbb/or pair
   saturates r8 to all-ones if the addition carried.  */
add %rcx, %r8
sbb %rcx, %rcx
or %rcx, %r8
# endif
mov $VEC_SIZE, %rcx
/* Second aligned vector: copied here without a NUL test of its own.  */
vmovdqa (%rsi, %rcx), %ymm2
vmovdqu %ymm2, (%rdi, %rcx)
vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
vpcmpeqb %ymm2, %ymmZ, %ymmM
vpmovmskb %ymmM, %edx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
/* Limit exhausted within the vector just loaded?  */
sub $(VEC_SIZE * 3), %r8
jbe L(CopyVecSizeCase2OrCase3)
# endif
test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
jnz L(CopyVecSizeUnalignedVec2)
# else
jnz L(CopyVecSize)
# endif
vmovdqu %ymm2, (%rdi, %rcx)
vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
vpcmpeqb %ymm3, %ymmZ, %ymmM
vpmovmskb %ymmM, %edx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
sub $VEC_SIZE, %r8
jbe L(CopyVecSizeCase2OrCase3)
# endif
test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
jnz L(CopyVecSizeUnalignedVec3)
# else
jnz L(CopyVecSize)
# endif
vmovdqu %ymm3, (%rdi, %rcx)
vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
vpcmpeqb %ymm4, %ymmZ, %ymmM
vpmovmskb %ymmM, %edx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
sub $VEC_SIZE, %r8
jbe L(CopyVecSizeCase2OrCase3)
# endif
test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
jnz L(CopyVecSizeUnalignedVec4)
# else
jnz L(CopyVecSize)
# endif
vmovdqu %ymm4, (%rdi, %rcx)
vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
vpcmpeqb %ymm2, %ymmZ, %ymmM
vpmovmskb %ymmM, %edx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
sub $VEC_SIZE, %r8
jbe L(CopyVecSizeCase2OrCase3)
# endif
test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
jnz L(CopyVecSizeUnalignedVec2)
# else
jnz L(CopyVecSize)
# endif
vmovdqu %ymm2, (%rdi, %rcx)
vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
vpcmpeqb %ymm2, %ymmZ, %ymmM
vpmovmskb %ymmM, %edx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
sub $VEC_SIZE, %r8
jbe L(CopyVecSizeCase2OrCase3)
# endif
test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
jnz L(CopyVecSizeUnalignedVec2)
# else
jnz L(CopyVecSize)
# endif
/* Last unrolled step: the next load is issued before the store of the
   previous vector.  */
vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
vmovdqu %ymm2, (%rdi, %rcx)
vpcmpeqb %ymm3, %ymmZ, %ymmM
vpmovmskb %ymmM, %edx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
sub $VEC_SIZE, %r8
jbe L(CopyVecSizeCase2OrCase3)
# endif
test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
jnz L(CopyVecSizeUnalignedVec3)
# else
jnz L(CopyVecSize)
# endif
vmovdqu %ymm3, (%rdi, %rcx)
/* Set up the four-vector loop: round the next source address down to a
   (VEC_SIZE * 4) boundary, and move rdi (and, for the length-limited
   build, r8) by the same distance so all three stay in step.
   rdx = old rsi - new rsi.  */
mov %rsi, %rdx
lea VEC_SIZE(%rsi, %rcx), %rsi
and $-(VEC_SIZE * 4), %rsi
sub %rsi, %rdx
sub %rdx, %rdi
# ifdef USE_AS_STRNCPY
lea (VEC_SIZE * 8)(%r8, %rdx), %r8
# endif
/* Main loop: four aligned source vectors (ymm4..ymm7) per iteration.
   The byte-wise minimum of all four is zero in some lane exactly when
   one of the vectors contains a NUL, so one compare and one mask
   extraction cover VEC_SIZE * 4 bytes.

   The compare uses ymmM as its zero comparand.  ymmM is the result of
   the previous "no NUL found" compare and is not written inside the
   loop, so it is all-zero here.  NOTE(review): this relies on every
   path into the loop coming from a zero-mask compare -- keep that in
   mind when changing the code in front of it.  */
L(UnalignedFourVecSizeLoop):
vmovdqa (%rsi), %ymm4
vmovdqa VEC_SIZE(%rsi), %ymm5
vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
vpminub %ymm5, %ymm4, %ymm2
vpminub %ymm7, %ymm6, %ymm3
vpminub %ymm2, %ymm3, %ymm3
vpcmpeqb %ymmM, %ymm3, %ymm3
vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
/* Length limit falls inside these four vectors.  */
sub $(VEC_SIZE * 4), %r8
jbe L(UnalignedLeaveCase2OrCase3)
# endif
test %edx, %edx
jnz L(UnalignedFourVecSizeLeave)
/* Steady state: advance both pointers first, then interleave the
   stores of the previous four vectors (negative offsets from the new
   rdi) with the loads of the next four.  */
L(UnalignedFourVecSizeLoop_start):
add $(VEC_SIZE * 4), %rdi
add $(VEC_SIZE * 4), %rsi
vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
vmovdqa (%rsi), %ymm4
vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
vmovdqa VEC_SIZE(%rsi), %ymm5
vpminub %ymm5, %ymm4, %ymm2
vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
vmovdqu %ymm7, -VEC_SIZE(%rdi)
vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
vpminub %ymm7, %ymm6, %ymm3
vpminub %ymm2, %ymm3, %ymm3
vpcmpeqb %ymmM, %ymm3, %ymm3
vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
sub $(VEC_SIZE * 4), %r8
jbe L(UnalignedLeaveCase2OrCase3)
# endif
test %edx, %edx
jz L(UnalignedFourVecSizeLoop_start)
/* A NUL is somewhere in ymm4..ymm7 (not yet stored).  Test the vectors
   one by one, in order, to find the first that contains it.  */
L(UnalignedFourVecSizeLeave):
vpcmpeqb %ymm4, %ymmZ, %ymmM
vpmovmskb %ymmM, %edx
test %edx, %edx
jnz L(CopyVecSizeUnaligned_0)
vpcmpeqb %ymm5, %ymmZ, %ymmM
vpmovmskb %ymmM, %ecx
test %ecx, %ecx
jnz L(CopyVecSizeUnaligned_16)
vpcmpeqb %ymm6, %ymmZ, %ymmM
vpmovmskb %ymmM, %edx
test %edx, %edx
jnz L(CopyVecSizeUnaligned_32)
/* Not in the first three, so it is in the fourth: edx = index of the
   NUL inside ymm7.  The first three vectors are stored whole.  */
vpcmpeqb %ymm7, %ymmZ, %ymmM
vpmovmskb %ymmM, %ecx
bsf %ecx, %edx
vmovdqu %ymm4, (%rdi)
vmovdqu %ymm5, VEC_SIZE(%rdi)
vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
/* Plain strncpy/stpncpy: store the fourth vector whole as well, then
   point rdi just past the NUL, reduce r8 to the bytes still to be
   written and zero-fill the rest.  */
# ifdef USE_AS_STPCPY
lea (VEC_SIZE * 3)(%rdi, %rdx), %rax
# endif
vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
add $(VEC_SIZE - 1), %r8
sub %rdx, %r8
lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
jmp L(StrncpyFillTailWithZero)
# else
/* Other flavours: step to the fourth vector and let the sized exit
   code copy bytes 0 .. edx of it.  */
add $(VEC_SIZE * 3), %rsi
add $(VEC_SIZE * 3), %rdi
jmp L(CopyVecSizeExit)
# endif
/* If source address alignment == destination address alignment */
/* Reached from function start when the source offset inside its
   (VEC_SIZE * 4)-aligned block is at most VEC_SIZE * 2.  The first two
   vectors are loaded with unaligned loads straight from the source
   pointer: ymm3 = bytes [0, VEC_SIZE), ymm2 = the following VEC_SIZE
   bytes.  ecx still holds the offset computed at entry.  */
L(SourceStringAlignmentLessTwoVecSize):
vmovdqu (%rsi), %ymm3
vmovdqu VEC_SIZE(%rsi), %ymm2
vpcmpeqb %ymm3, %ymmZ, %ymmM
vpmovmskb %ymmM, %edx
# ifdef USE_AS_STRNCPY
/* Length limit within the first vector (one more byte is allowed for
   the plain strncpy flavour).  */
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
cmp $VEC_SIZE, %r8
# else
cmp $(VEC_SIZE + 1), %r8
# endif
jbe L(CopyVecSizeTail1Case2OrCase3)
# endif
test %edx, %edx
jnz L(CopyVecSizeTail1)
/* First vector is NUL-free: store it, then examine the second.  */
vmovdqu %ymm3, (%rdi)
vpcmpeqb %ymm2, %ymmZ, %ymmM
vpmovmskb %ymmM, %edx
# ifdef USE_AS_STRNCPY
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
cmp $(VEC_SIZE * 2), %r8
# else
cmp $((VEC_SIZE * 2) + 1), %r8
# endif
jbe L(CopyTwoVecSize1Case2OrCase3)
# endif
test %edx, %edx
jnz L(CopyTwoVecSize1)
/* Neither vector has a NUL.  Produce the state the unrolled code
   expects (rsi rounded down to VEC_SIZE, ecx = misalignment) and join
   it.  */
and $-VEC_SIZE, %rsi
and $(VEC_SIZE - 1), %ecx
jmp L(UnalignVecSizeBoth)
/*------End of main part with loops---------------------*/
/* Case1 */
/* Case1: a NUL was found and (for the length-limited build) the limit
   has not been reached.  The three entry labels fall through into one
   another and differ only in how much pointer fix-up is still needed:
     CopyVecSize       rdi and rsi both still need rcx added
     CopyVecSizeTail   only rsi needs rcx added
     CopyVecSizeTail1  pointers are final; edx is still a bit mask
     CopyVecSizeExit   edx already is the byte index of the NUL
   CopyVecSize is only assembled when no plain strncpy build needs the
   CopyVecSizeUnalignedVecN labels instead.  */
# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
.p2align 4
L(CopyVecSize):
add %rcx, %rdi
# endif
L(CopyVecSizeTail):
add %rcx, %rsi
L(CopyVecSizeTail1):
bsf %edx, %edx
/* Dispatch on the index of the NUL; edx + 1 bytes (terminator
   included) are copied from rsi to rdi by the selected ExitN code.  */
L(CopyVecSizeExit):
cmp $32, %edx
jae L(Exit32_63)
cmp $16, %edx
jae L(Exit16_31)
cmp $8, %edx
jae L(Exit8_15)
cmp $4, %edx
jae L(Exit4_7)
cmp $3, %edx
je L(Exit3)
cmp $1, %edx
ja L(Exit2)
je L(Exit1)
/* edx == 0: the string is empty, only the terminator is written.  */
movb $0, (%rdi)
# ifdef USE_AS_STPCPY
lea (%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
/* lea leaves the flags untouched, so jnz still tests the result of
   the sub: zero-fill only if bytes remain under the limit.  */
sub $1, %r8
lea 1(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
/* Common return point.  NOTE(review): ZERO_UPPER_VEC_REGISTERS_RETURN
   is defined outside this file (presumably sysdep.h) and is expected
   to clear the upper vector state and return -- confirm.  */
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
/* NUL in the second vector on the "less than two vectors" path.  The
   first vector has already been stored, so both pointers simply step
   over it (and the limit shrinks accordingly) before the common Case1
   code handles the rest.  */
.p2align 4
L(CopyTwoVecSize1):
add $VEC_SIZE, %rsi
add $VEC_SIZE, %rdi
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $VEC_SIZE, %r8
# endif
jmp L(CopyVecSizeTail1)
/* NUL in the second aligned vector on the entry path.  Nothing has
   been stored yet; rsi is restored to the real string start and edx
   becomes the NUL index relative to it:
   bit position + VEC_SIZE - misalignment.  */
.p2align 4
L(CopyTwoVecSize):
bsf %edx, %edx
add %rcx, %rsi
add $VEC_SIZE, %edx
sub %ecx, %edx
jmp L(CopyVecSizeExit)
/* Exits from the four-vector loop when the NUL is in the first, second
   or third vector of the group (ymm4, ymm5, ymm6).  Despite the _16
   and _32 suffixes, the offsets used are multiples of VEC_SIZE.

   Plain strncpy/stpncpy: the vector holding the NUL is stored whole,
   rdi is moved just past the NUL, r8 is reduced to the bytes still to
   be written, and the tail is zero-filled.
   Other flavours: complete earlier vectors are stored, the pointers
   step to the vector with the NUL, and CopyVecSizeExit copies bytes
   0 .. edx of it.  */
.p2align 4
L(CopyVecSizeUnaligned_0):
bsf %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
lea (%rdi, %rdx), %rax
# endif
vmovdqu %ymm4, (%rdi)
add $((VEC_SIZE * 4) - 1), %r8
sub %rdx, %r8
lea 1(%rdi, %rdx), %rdi
jmp L(StrncpyFillTailWithZero)
# else
jmp L(CopyVecSizeExit)
# endif
.p2align 4
L(CopyVecSizeUnaligned_16):
/* The mask for the second vector was produced in ecx.  */
bsf %ecx, %edx
vmovdqu %ymm4, (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
lea VEC_SIZE(%rdi, %rdx), %rax
# endif
vmovdqu %ymm5, VEC_SIZE(%rdi)
add $((VEC_SIZE * 3) - 1), %r8
sub %rdx, %r8
lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi
jmp L(StrncpyFillTailWithZero)
# else
add $VEC_SIZE, %rsi
add $VEC_SIZE, %rdi
jmp L(CopyVecSizeExit)
# endif
.p2align 4
L(CopyVecSizeUnaligned_32):
bsf %edx, %edx
vmovdqu %ymm4, (%rdi)
vmovdqu %ymm5, VEC_SIZE(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
lea (VEC_SIZE * 2)(%rdi, %rdx), %rax
# endif
vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
add $((VEC_SIZE * 2) - 1), %r8
sub %rdx, %r8
lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
jmp L(StrncpyFillTailWithZero)
# else
add $(VEC_SIZE * 2), %rsi
add $(VEC_SIZE * 2), %rdi
jmp L(CopyVecSizeExit)
# endif
/* Length-limited builds only from here on (this conditional is closed
   further down, after the Case2OrCase3 labels).  */
# ifdef USE_AS_STRNCPY
# ifndef USE_AS_STRCAT
/* Plain strncpy/stpncpy: store the whole vector that contains the NUL
   at rdi + rcx, then continue in CopyVecSizeVecExit.  One stub per
   vector register; the number in the label is the ymm register that
   gets stored.  */
.p2align 4
L(CopyVecSizeUnalignedVec6):
vmovdqu %ymm6, (%rdi, %rcx)
jmp L(CopyVecSizeVecExit)
.p2align 4
L(CopyVecSizeUnalignedVec5):
vmovdqu %ymm5, (%rdi, %rcx)
jmp L(CopyVecSizeVecExit)
.p2align 4
L(CopyVecSizeUnalignedVec4):
vmovdqu %ymm4, (%rdi, %rcx)
jmp L(CopyVecSizeVecExit)
.p2align 4
L(CopyVecSizeUnalignedVec3):
vmovdqu %ymm3, (%rdi, %rcx)
jmp L(CopyVecSizeVecExit)
# endif
/* Case2 */
/* Case2: the length limit falls in the vector being examined AND that
   vector contains a NUL.  Each variant fixes up the pointers the same
   way as its Case1 counterpart, turns the mask into the NUL index in
   edx and compares it with the remaining limit in r8d:
     index <  limit  -> the NUL comes first: copy through the NUL
     index >= limit  -> the limit comes first: copy exactly r8 bytes  */
.p2align 4
L(CopyVecSizeCase2):
/* Undo the VEC_SIZE that the caller subtracted from r8 before
   branching here.  */
add $VEC_SIZE, %r8
add %rcx, %rdi
add %rcx, %rsi
bsf %edx, %edx
cmp %r8d, %edx
jb L(CopyVecSizeExit)
jmp L(StrncpyExit)
.p2align 4
L(CopyTwoVecSizeCase2):
/* NUL is in the second aligned vector: index relative to the real
   string start is bit position + VEC_SIZE - misalignment.  */
add %rcx, %rsi
bsf %edx, %edx
add $VEC_SIZE, %edx
sub %ecx, %edx
cmp %r8d, %edx
jb L(CopyVecSizeExit)
jmp L(StrncpyExit)
L(CopyVecSizeTailCase2):
add %rcx, %rsi
bsf %edx, %edx
cmp %r8d, %edx
jb L(CopyVecSizeExit)
jmp L(StrncpyExit)
L(CopyVecSizeTail1Case2):
bsf %edx, %edx
cmp %r8d, %edx
jb L(CopyVecSizeExit)
jmp L(StrncpyExit)
/* Case2 or Case3, Case3 */
/* Reached whenever the length limit falls inside the vector(s) being
   examined.  rdx holds the NUL mask: non-zero means Case2 (a NUL is
   present too), zero means Case3 (no NUL: copy exactly the remaining
   r8 bytes through StrncpyExit).  Each label mirrors one of the Case1
   entry points and applies the same pointer fix-ups.  */
.p2align 4
L(CopyVecSizeCase2OrCase3):
test %rdx, %rdx
jnz L(CopyVecSizeCase2)
L(CopyVecSizeCase3):
/* Undo the caller's last VEC_SIZE subtraction and move both pointers
   to the first byte that has not been stored yet.  */
add $VEC_SIZE, %r8
add %rcx, %rdi
add %rcx, %rsi
jmp L(StrncpyExit)
.p2align 4
L(CopyTwoVecSizeCase2OrCase3):
test %rdx, %rdx
jnz L(CopyTwoVecSizeCase2)
add %rcx, %rsi
jmp L(StrncpyExit)
.p2align 4
L(CopyVecSizeTailCase2OrCase3):
test %rdx, %rdx
jnz L(CopyVecSizeTailCase2)
add %rcx, %rsi
jmp L(StrncpyExit)
.p2align 4
L(CopyTwoVecSize1Case2OrCase3):
/* The first vector was already stored by the caller: skip it, then
   share the single-vector code below.  */
add $VEC_SIZE, %rdi
add $VEC_SIZE, %rsi
sub $VEC_SIZE, %r8
L(CopyVecSizeTail1Case2OrCase3):
test %rdx, %rdx
jnz L(CopyVecSizeTail1Case2)
jmp L(StrncpyExit)
/* Closes the USE_AS_STRNCPY conditional opened in front of the
   CopyVecSizeUnalignedVecN stubs.  */
# endif
/*------------End of the labels for copying 1..VEC_SIZE bytes and 1..(VEC_SIZE*2) bytes----*/
/* Sized exits for a string whose NUL is at byte index edx (relative to
   rsi/rdi).  Each one copies bytes 0 .. edx, terminator included.
   From four bytes up the copy is done with two possibly overlapping
   moves: one anchored at the start and one ending exactly on the NUL.

   Common tail, depending on the build:
     USE_AS_STPCPY            rax = address of the NUL in the destination
     plain strncpy/stpncpy    r8 -= bytes written, rdi = first byte
                              after the NUL, zero-fill if r8 != 0.
                              lea does not modify the flags, so the jnz
                              after it still tests the preceding sub.
   NOTE(review): VZEROUPPER_RETURN is defined outside this file;
   presumably it clears the upper vector state and returns -- confirm.  */
.p2align 4
L(Exit1):
/* One character plus NUL: a single 16-bit move.  */
movzwl (%rsi), %edx
mov %dx, (%rdi)
# ifdef USE_AS_STPCPY
lea 1(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $2, %r8
lea 2(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
VZEROUPPER_RETURN
.p2align 4
L(Exit2):
/* Two characters via a 16-bit move; the terminator is stored
   explicitly instead of being read from the source.  */
movzwl (%rsi), %ecx
mov %cx, (%rdi)
movb $0, 2(%rdi)
# ifdef USE_AS_STPCPY
lea 2(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $3, %r8
lea 3(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
VZEROUPPER_RETURN
.p2align 4
L(Exit3):
/* Three characters plus NUL: a single 32-bit move.  */
mov (%rsi), %edx
mov %edx, (%rdi)
# ifdef USE_AS_STPCPY
lea 3(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $4, %r8
lea 4(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
VZEROUPPER_RETURN
.p2align 4
L(Exit4_7):
/* Two 4-byte moves: bytes [0, 3] and [edx - 3, edx].  */
mov (%rsi), %ecx
mov %ecx, (%rdi)
mov -3(%rsi, %rdx), %ecx
mov %ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub %rdx, %r8
sub $1, %r8
lea 1(%rdi, %rdx), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
VZEROUPPER_RETURN
.p2align 4
L(Exit8_15):
/* Two 8-byte moves: bytes [0, 7] and [edx - 7, edx].  Both loads are
   done before either store.  */
mov (%rsi), %rcx
mov -7(%rsi, %rdx), %r9
mov %rcx, (%rdi)
mov %r9, -7(%rdi, %rdx)
# ifdef USE_AS_STPCPY
lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub %rdx, %r8
sub $1, %r8
lea 1(%rdi, %rdx), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
VZEROUPPER_RETURN
.p2align 4
L(Exit16_31):
/* Two 16-byte moves: bytes [0, 15] and [edx - 15, edx].  */
vmovdqu (%rsi), %xmm2
vmovdqu -15(%rsi, %rdx), %xmm3
vmovdqu %xmm2, (%rdi)
vmovdqu %xmm3, -15(%rdi, %rdx)
# ifdef USE_AS_STPCPY
lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub %rdx, %r8
sub $1, %r8
lea 1(%rdi, %rdx), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
VZEROUPPER_RETURN
.p2align 4
L(Exit32_63):
/* Two 32-byte moves: bytes [0, 31] and [edx - 31, edx].  */
vmovdqu (%rsi), %ymm2
vmovdqu -31(%rsi, %rdx), %ymm3
vmovdqu %ymm2, (%rdi)
vmovdqu %ymm3, -31(%rdi, %rdx)
# ifdef USE_AS_STPCPY
lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub %rdx, %r8
sub $1, %r8
lea 1(%rdi, %rdx), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
VZEROUPPER_RETURN
/* Length-limited builds only (this conditional stays open until just
   before END).  */
# ifdef USE_AS_STRNCPY
/* Sized exits for the case where the length limit is reached before
   any NUL: exactly r8 bytes are copied from rsi to rdi and no
   terminator is taken from the source.  From three bytes up the copy
   uses two possibly overlapping moves, one anchored at the start and
   one ending at byte r8 - 1.

   Common tail, depending on the build:
     USE_AS_STPCPY   rax = rdi + number of bytes copied
     USE_AS_STRCAT   a NUL is stored right after the copied bytes  */
.p2align 4
L(StrncpyExit1):
movzbl (%rsi), %edx
mov %dl, (%rdi)
# ifdef USE_AS_STPCPY
lea 1(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
movb $0, 1(%rdi)
# endif
VZEROUPPER_RETURN
.p2align 4
L(StrncpyExit2):
movzwl (%rsi), %edx
mov %dx, (%rdi)
# ifdef USE_AS_STPCPY
lea 2(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
movb $0, 2(%rdi)
# endif
VZEROUPPER_RETURN
.p2align 4
L(StrncpyExit3_4):
/* Two 2-byte moves: bytes [0, 1] and [r8 - 2, r8 - 1].  */
movzwl (%rsi), %ecx
movzwl -2(%rsi, %r8), %edx
mov %cx, (%rdi)
mov %dx, -2(%rdi, %r8)
# ifdef USE_AS_STPCPY
lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
movb $0, (%rdi, %r8)
# endif
VZEROUPPER_RETURN
.p2align 4
L(StrncpyExit5_8):
/* Two 4-byte moves: bytes [0, 3] and [r8 - 4, r8 - 1].  */
mov (%rsi), %ecx
mov -4(%rsi, %r8), %edx
mov %ecx, (%rdi)
mov %edx, -4(%rdi, %r8)
# ifdef USE_AS_STPCPY
lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
movb $0, (%rdi, %r8)
# endif
VZEROUPPER_RETURN
.p2align 4
L(StrncpyExit9_16):
/* Two 8-byte moves: bytes [0, 7] and [r8 - 8, r8 - 1].  */
mov (%rsi), %rcx
mov -8(%rsi, %r8), %rdx
mov %rcx, (%rdi)
mov %rdx, -8(%rdi, %r8)
# ifdef USE_AS_STPCPY
lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
movb $0, (%rdi, %r8)
# endif
VZEROUPPER_RETURN
.p2align 4
L(StrncpyExit17_32):
/* Two 16-byte moves: bytes [0, 15] and [r8 - 16, r8 - 1].  */
vmovdqu (%rsi), %xmm2
vmovdqu -16(%rsi, %r8), %xmm3
vmovdqu %xmm2, (%rdi)
vmovdqu %xmm3, -16(%rdi, %r8)
# ifdef USE_AS_STPCPY
lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
movb $0, (%rdi, %r8)
# endif
VZEROUPPER_RETURN
.p2align 4
L(StrncpyExit33_64):
/* 0/32, 31/16 */
/* Two VEC_SIZE-byte moves: the first VEC_SIZE bytes and the last
   VEC_SIZE bytes ending at byte r8 - 1.  */
vmovdqu (%rsi), %ymm2
vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
vmovdqu %ymm2, (%rdi)
vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
# ifdef USE_AS_STPCPY
lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
movb $0, (%rdi, %r8)
# endif
VZEROUPPER_RETURN
.p2align 4
L(StrncpyExit65):
/* 0/32, 32/32, 64/1 */
/* Exactly 65 bytes: two 32-byte vectors plus one trailing byte.  The
   offsets here are literal and assume 32-byte vectors.  */
vmovdqu (%rsi), %ymm2
vmovdqu 32(%rsi), %ymm3
mov 64(%rsi), %cl
vmovdqu %ymm2, (%rdi)
vmovdqu %ymm3, 32(%rdi)
mov %cl, 64(%rdi)
# ifdef USE_AS_STPCPY
lea 65(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
movb $0, 65(%rdi)
# endif
VZEROUPPER_RETURN
/* Zero-fill support for the plain strncpy/stpncpy build (this
   conditional is closed after L(Fill)).  */
# ifndef USE_AS_STRCAT
/* Final zero fill of r8 bytes at rdi, selected by the dispatcher at
   L(Fill).  rdx is zero here: it is cleared at
   L(StrncpyFillTailWithZero), through which every path to L(Fill)
   passes.  From three bytes up the fill uses two possibly overlapping
   stores, one at the start and one ending at byte r8 - 1.  */
.p2align 4
L(Fill1):
mov %dl, (%rdi)
VZEROUPPER_RETURN
.p2align 4
L(Fill2):
mov %dx, (%rdi)
VZEROUPPER_RETURN
.p2align 4
L(Fill3_4):
mov %dx, (%rdi)
mov %dx, -2(%rdi, %r8)
VZEROUPPER_RETURN
.p2align 4
L(Fill5_8):
mov %edx, (%rdi)
mov %edx, -4(%rdi, %r8)
VZEROUPPER_RETURN
.p2align 4
L(Fill9_16):
mov %rdx, (%rdi)
mov %rdx, -8(%rdi, %r8)
VZEROUPPER_RETURN
.p2align 4
L(Fill17_32):
/* Uses the low half of the zero vector register.  */
vmovdqu %xmmZ, (%rdi)
vmovdqu %xmmZ, -16(%rdi, %r8)
VZEROUPPER_RETURN
/* Plain strncpy/stpncpy: the vector in ymm2 contains the NUL.  Store
   it whole at rdi + rcx and fall through (across the alignment
   padding) into the common exit below.  */
.p2align 4
L(CopyVecSizeUnalignedVec2):
vmovdqu %ymm2, (%rdi, %rcx)
/* Common exit after a whole vector containing the NUL has been stored.
   edx (mask) becomes the NUL index inside that vector; rdi is moved
   to the first byte after the NUL; r8 becomes
   r8 + VEC_SIZE - (index + 1), which the fill code below treats as
   the number of bytes still to be zeroed.  Execution falls through
   into L(StrncpyFillTailWithZero).  */
.p2align 4
L(CopyVecSizeVecExit):
bsf %edx, %edx
add $(VEC_SIZE - 1), %r8
add %rcx, %rdi
# ifdef USE_AS_STPCPY
/* Return value: address of the NUL in the destination.  */
lea (%rdi, %rdx), %rax
# endif
sub %rdx, %r8
lea 1(%rdi, %rdx), %rdi
/* Zero-fill the rest of the destination after the string has been
   copied.  On entry rdi points to the first byte to clear and r8 is
   the number of bytes to clear.  rdx is cleared first so the small
   FillN stores can use it as a zero source; rsi is free by now and is
   reused as scratch.  */
.p2align 4
L(StrncpyFillTailWithZero):
xor %edx, %edx
/* At most one vector to clear: use the sized stores.  */
sub $VEC_SIZE, %r8
jbe L(StrncpyFillExit)
/* One unaligned vector store, then round rdi down to a VEC_SIZE
   boundary so the remaining stores can be aligned; r8 is credited
   with the bytes that will be covered twice.  */
vmovdqu %ymmZ, (%rdi)
add $VEC_SIZE, %rdi
mov %rdi, %rsi
and $(VEC_SIZE - 1), %esi
sub %rsi, %rdi
add %rsi, %r8
sub $(VEC_SIZE * 4), %r8
jb L(StrncpyFillLessFourVecSize)
/* Four aligned vector stores per iteration while at least
   VEC_SIZE * 4 bytes remain.  */
L(StrncpyFillLoopVmovdqa):
vmovdqa %ymmZ, (%rdi)
vmovdqa %ymmZ, VEC_SIZE(%rdi)
vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
add $(VEC_SIZE * 4), %rdi
sub $(VEC_SIZE * 4), %r8
jae L(StrncpyFillLoopVmovdqa)
/* Fewer than four vectors left.  r8 is biased by -(VEC_SIZE * 4)
   here; the adds and subs below test it against two, three and one
   vector in turn.  */
L(StrncpyFillLessFourVecSize):
add $(VEC_SIZE * 2), %r8
jl L(StrncpyFillLessTwoVecSize)
vmovdqa %ymmZ, (%rdi)
vmovdqa %ymmZ, VEC_SIZE(%rdi)
add $(VEC_SIZE * 2), %rdi
sub $VEC_SIZE, %r8
jl L(StrncpyFillExit)
vmovdqa %ymmZ, (%rdi)
add $VEC_SIZE, %rdi
jmp L(Fill)
.p2align 4
L(StrncpyFillLessTwoVecSize):
add $VEC_SIZE, %r8
jl L(StrncpyFillExit)
vmovdqa %ymmZ, (%rdi)
add $VEC_SIZE, %rdi
jmp L(Fill)
/* Remove the remaining -VEC_SIZE bias so that r8 is the true number
   of bytes left, then dispatch.  */
.p2align 4
L(StrncpyFillExit):
add $VEC_SIZE, %r8
/* r8d = bytes still to clear at rdi; pick the matching sized store.
   Zero bytes left: nothing to do.  */
L(Fill):
cmp $17, %r8d
jae L(Fill17_32)
cmp $9, %r8d
jae L(Fill9_16)
cmp $5, %r8d
jae L(Fill5_8)
cmp $3, %r8d
jae L(Fill3_4)
cmp $1, %r8d
ja L(Fill2)
je L(Fill1)
VZEROUPPER_RETURN
/* end of ifndef USE_AS_STRCAT */
# endif
/* The length limit falls inside the four vectors currently held in
   ymm4..ymm7 (none of them stored yet).  rdx is the combined NUL mask
   of the group: non-zero means a NUL is present as well (Case2),
   zero means only the limit matters (Case3).  */
.p2align 4
L(UnalignedLeaveCase2OrCase3):
test %rdx, %rdx
jnz L(UnalignedFourVecSizeLeaveCase2)
/* Case3: store as many whole vectors as the limit allows, then let
   CopyVecSizeCase3 copy the partial remainder.  rcx is precomputed as
   the byte offset of the whole vectors inside the group;
   CopyVecSizeCase3 adds it to both pointers.  */
L(UnalignedFourVecSizeLeaveCase3):
lea (VEC_SIZE * 4)(%r8), %rcx
and $-VEC_SIZE, %rcx
add $(VEC_SIZE * 3), %r8
jl L(CopyVecSizeCase3)
vmovdqu %ymm4, (%rdi)
sub $VEC_SIZE, %r8
jb L(CopyVecSizeCase3)
vmovdqu %ymm5, VEC_SIZE(%rdi)
sub $VEC_SIZE, %r8
jb L(CopyVecSizeCase3)
vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
sub $VEC_SIZE, %r8
jb L(CopyVecSizeCase3)
/* The limit covers all four vectors exactly.  */
vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
# ifdef USE_AS_STPCPY
lea (VEC_SIZE * 4)(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
/* The concatenation flavour terminates the result itself.  */
movb $0, (VEC_SIZE * 4)(%rdi)
# endif
VZEROUPPER_RETURN
/* Case2 for the four-vector group: both the length limit and a NUL
   fall inside ymm4..ymm7.  Walk the vectors in order, with rcx as the
   byte offset of the vector under test.  For each vector, first check
   whether the limit ends in it (-> CopyVecSizeCase2OrCase3), then
   whether it holds the NUL; otherwise store it and move on.  */
.p2align 4
L(UnalignedFourVecSizeLeaveCase2):
xor %ecx, %ecx
vpcmpeqb %ymm4, %ymmZ, %ymmM
vpmovmskb %ymmM, %edx
/* Signed test here: r8 is zero or negative at this point, after the
   subtraction done in the loop.  */
add $(VEC_SIZE * 3), %r8
jle L(CopyVecSizeCase2OrCase3)
test %edx, %edx
# ifndef USE_AS_STRCAT
jnz L(CopyVecSizeUnalignedVec4)
# else
jnz L(CopyVecSize)
# endif
vpcmpeqb %ymm5, %ymmZ, %ymmM
vpmovmskb %ymmM, %edx
vmovdqu %ymm4, (%rdi)
add $VEC_SIZE, %rcx
sub $VEC_SIZE, %r8
jbe L(CopyVecSizeCase2OrCase3)
test %edx, %edx
# ifndef USE_AS_STRCAT
jnz L(CopyVecSizeUnalignedVec5)
# else
jnz L(CopyVecSize)
# endif
vpcmpeqb %ymm6, %ymmZ, %ymmM
vpmovmskb %ymmM, %edx
vmovdqu %ymm5, VEC_SIZE(%rdi)
add $VEC_SIZE, %rcx
sub $VEC_SIZE, %r8
jbe L(CopyVecSizeCase2OrCase3)
test %edx, %edx
# ifndef USE_AS_STRCAT
jnz L(CopyVecSizeUnalignedVec6)
# else
jnz L(CopyVecSize)
# endif
/* Both the NUL and the limit are in the fourth vector.  Step the
   pointers to it and decide which comes first: NUL index below the
   limit -> copy through the NUL; otherwise fall through into
   L(StrncpyExit) and copy exactly r8 bytes.  */
vpcmpeqb %ymm7, %ymmZ, %ymmM
vpmovmskb %ymmM, %edx
vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
lea VEC_SIZE(%rdi, %rcx), %rdi
lea VEC_SIZE(%rsi, %rcx), %rsi
bsf %edx, %edx
cmp %r8d, %edx
jb L(CopyVecSizeExit)
/* Dispatcher for "limit reached before any NUL": r8d is the exact
   number of bytes to copy from rsi to rdi.  65 is tested for equality
   before the 33+ range so that it gets its own three-move copy.  */
L(StrncpyExit):
cmp $65, %r8d
je L(StrncpyExit65)
cmp $33, %r8d
jae L(StrncpyExit33_64)
cmp $17, %r8d
jae L(StrncpyExit17_32)
cmp $9, %r8d
jae L(StrncpyExit9_16)
cmp $5, %r8d
jae L(StrncpyExit5_8)
cmp $3, %r8d
jae L(StrncpyExit3_4)
cmp $1, %r8d
ja L(StrncpyExit2)
je L(StrncpyExit1)
/* Nothing left to copy.  */
# ifdef USE_AS_STPCPY
mov %rdi, %rax
# endif
# ifdef USE_AS_STRCAT
movb $0, (%rdi)
# endif
VZEROUPPER_RETURN
/* Length limit of zero at function entry: nothing is written and the
   destination pointer is returned.  */
.p2align 4
L(ExitZero):
# ifndef USE_AS_STRCAT
mov %rdi, %rax
# endif
VZEROUPPER_RETURN
/* Closes the USE_AS_STRNCPY conditional opened in front of
   L(StrncpyExit1).  */
# endif
/* The END must name the same symbol as the ENTRY that opened the
   function; for USE_AS_STRCAT that ENTRY is outside this file.  */
# ifndef USE_AS_STRCAT
END (STRCPY)
# else
END (STRCAT)
# endif
/* Closes #if IS_IN (libc).  */
#endif