glibc/sysdeps/x86_64/multiarch/strncpy-avx2.S

740 lines
16 KiB
ArmAsm

/* strncpy with AVX2
Copyright (C) 2022-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <isa-level.h>
#if ISA_SHOULD_BUILD (3)
# include <sysdep.h>
# ifndef VEC_SIZE
# include "x86-avx-vecs.h"
# endif
# ifndef STRNCPY
# define STRNCPY __strncpy_avx2
# endif
/* Character-width configuration.  USE_AS_WCSCPY selects 4-byte
   characters (dword compare / dword unsigned min); otherwise
   characters are single bytes.  CHAR_SIZE is the character size in
   bytes and is used throughout as an address displacement.  */
# ifdef USE_AS_WCSCPY
# define VPCMPEQ vpcmpeqd
# define VPMIN vpminud
# define CHAR_SIZE 4
# else
# define VPCMPEQ vpcmpeqb
# define VPMIN vpminub
# define CHAR_SIZE 1
# endif
# include "strncpy-or-cat-overflow-def.h"
/* Used by the entry code to detect a first VEC load that would run
   past the end of the page containing src.  */
# define PAGE_SIZE 4096
/* VMM(7) is reserved as an all-zero constant: it is cleared once at
   function entry and then serves both as the compare operand for
   finding null characters and as the store source for zero-filling.
   VZERO_128 names its 128-bit view.  */
# define VZERO VMM(7)
# define VZERO_128 VMM_128(7)
.section SECTION(.text), "ax", @progbits
/* STRNCPY: copy at most n characters from src to dst and zero-fill
   the rest of the n-character destination.
   In:   %rdi = dst, %rsi = src, %rdx = n (in characters).
   Out:  %rax = dst for the plain builds; the USE_AS_STPCPY builds
         compute a pointer into dst on each return path.
   After the length filter below %rdx holds (n - 1) * CHAR_SIZE, i.e.
   the byte offset of the last character slot of the buffer.  */
ENTRY(STRNCPY)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
# endif
/* Filter zero length strings and very long strings. Zero
length strings just return, very long strings are handled by
just running rep stos{b|l} to zero set (which will almost
certainly segfault), if that succeeds then just calling
OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy). */
# ifdef USE_AS_WCSCPY
decq %rdx
movq %rdx, %rax
/* 56 is end of max supported address space. */
shr $56, %rax
/* Taken when any of the top 8 bits of n - 1 is set; that includes
   n == 0, for which n - 1 is all ones.  */
jnz L(zero_len)
/* Convert n - 1 from characters to bytes.  */
salq $2, %rdx
# else
decq %rdx
/* `dec` can macrofuse with `jl`. If the flag needs to become
`jb` replace `dec` with `sub`. */
/* Signed test on the result of n - 1: taken for n == 0 and for
   n >= 2^63.  */
jl L(zero_len)
# endif
/* Establish the all-zero constant used for the rest of the function. */
vpxor %VZERO_128, %VZERO_128, %VZERO_128
/* If src lies in the last VEC_SIZE bytes of its page (page offset >
   PAGE_SIZE - VEC_SIZE) an unaligned VEC load from src would touch the
   next page; handle that case separately.  */
movl %esi, %eax
andl $(PAGE_SIZE - 1), %eax
cmpl $(PAGE_SIZE - VEC_SIZE), %eax
ja L(page_cross)
/* Common start.  Reached by fall-through when the first VEC load stays
   inside the source page, or from L(page_cross) when no null character
   was found before the end of that page.  */
L(page_cross_continue):
VMOVU (%rsi), %VMM(0)
/* %ecx = bitmask of the bytes of the first VEC that belong to a null
   character (bit i corresponds to byte i).  */
VPCMPEQ %VMM(0), %VZERO, %VMM(6)
vpmovmskb %VMM(6), %ecx
/* Without STPCPY the return value is simply dst; set it ahead of
   time. */
# ifndef USE_AS_STPCPY
movq %rdi, %rax
# elif defined USE_AS_WCSCPY
/* Clear dependency as nearly all return code for wcpncpy uses
`setc %al`. */
xorl %eax, %eax
# endif
cmpq $(VEC_SIZE - CHAR_SIZE), %rdx
/* `jbe` because rdx is now length - CHAR_SIZE: taken when the whole
   buffer is at most one VEC.  L(less_1x_vec) reuses these flags.  */
jbe L(less_1x_vec)
/* This may overset but that's fine because we still need to zero
fill. */
VMOVU %VMM(0), (%rdi)
testl %ecx, %ecx
jnz L(zfill)
/* Align.  %rdx becomes the address of the last source character slot,
   %rdi becomes dst - src, and %rsi is rounded up to the next VEC_SIZE
   boundary.  The code that follows turns %rdi and %rdx back into
   values relative to the new %rsi.  */
addq %rsi, %rdx
subq %rsi, %rdi
orq $(VEC_SIZE - 1), %rsi
incq %rsi
/* Per-VEC copy path starting at a VEC_SIZE-aligned %rsi.
   L(last_4x_vec) expects %rdi = dst - src and %rdx = address of the
   last source character slot; L(loop_last_4x_vec) expects %rdi to be
   the real destination address already.  */
L(last_4x_vec):
/* %rdi = destination address that corresponds to %rsi.  */
addq %rsi, %rdi
L(loop_last_4x_vec):
/* %rdx = byte offset, relative to %rsi, of the last character slot. */
subq %rsi, %rdx
VMOVA 0(%rsi), %VMM(1)
VPCMPEQ %VMM(1), %VZERO, %VMM(6)
vpmovmskb %VMM(6), %ecx
cmpq $(VEC_SIZE * 2), %rdx
jae L(more_2x_vec)
/* The last slot is within the first two VECs.  If it lies inside the
   first VEC only that VEC's mask is needed.  */
cmpl $(VEC_SIZE), %edx
jb L(ret_vec_x1_len)
testl %ecx, %ecx
jnz L(ret_vec_x1)
/* No null in the first VEC: store it, test the second VEC straight
   from memory and shift its mask up by VEC_SIZE bits so that the bit
   index found below is a byte offset from %rsi.  */
VPCMPEQ VEC_SIZE(%rsi), %VZERO, %VMM(6)
VMOVU %VMM(1), (%rdi)
vpmovmskb %VMM(6), %ecx
shlq $VEC_SIZE, %rcx
L(ret_vec_x1_len):
/* %rcx = offset of the first null; tzcnt yields the operand width (64)
   when the mask is empty.  */
tzcntq %rcx, %rcx
/* Last slot at or before the null: nothing to zero-fill.  CF is set
   when the last slot is strictly before the null; the STPCPY return
   code below consumes it.  */
cmpl %ecx, %edx
jbe L(ret_vec_x1_len_no_zfill)
/* Fall through (expectation) is copy len < buffer len. */
/* Zero the VEC_SIZE bytes that end with the last character slot.  */
VMOVU %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
L(ret_vec_x1_len_no_zfill_mov):
/* From here on copy only up to the null.  */
movl %ecx, %edx
# ifdef USE_AS_STPCPY
/* clear flags (CF), so the return code below adds no extra
   character. */
xorl %ecx, %ecx
# endif
L(ret_vec_x1_len_no_zfill):
/* Copy the VEC_SIZE bytes that end with the character at offset %rdx.
   The window may start before %rsi / %rdi.  */
VMOVU ((0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
VMOVU %VMM(1), ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
# ifdef USE_AS_STPCPY
/* Return %rdi + %rdx, advanced by one character when CF is set.  */
# ifdef USE_AS_WCSCPY
setc %al
addq %rdx, %rdi
leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
movl %edx, %eax
adcq %rdi, %rax
# endif
# endif
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4,, 6
/* A null was found in the first VEC (mask in %ecx) while the last
   character slot lies in the second VEC (VEC_SIZE <= %rdx <
   2 * VEC_SIZE).  VMM(1) has not been stored yet.  */
L(ret_vec_x1):
/* %ecx = offset of the null.  */
bsfl %ecx, %ecx
/* Zero the VEC_SIZE bytes that end with the last character slot.  */
VMOVU %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
/* %edx = distance in bytes from the null to the last slot.  */
subl %ecx, %edx
/* Check if we need to reload/store.  When the null is less than a VEC
   before the last slot, the zero store above already covers everything
   after the null, so finish with one VEC copy that ends at the null. */
cmpl $VEC_SIZE, %edx
jb L(ret_vec_x1_len_no_zfill_mov)
/* Otherwise safe to just store directly. */
VMOVU %VMM(1), (%rdi)
/* Zero one VEC starting at the null; together with the tail store
   above this clears everything after the string.  */
VMOVU %VZERO, (%rdi, %rcx)
# ifdef USE_AS_STPCPY
/* Return the address of the null in dst.  */
leaq (%rdi, %rcx), %rax
# endif
VZEROUPPER_RETURN
.p2align 4,, 12
/* The last character slot is at least 2 * VEC_SIZE bytes past %rsi.
   VMM(1) and its null mask (%ecx) are already loaded.  */
L(more_2x_vec):
VMOVU %VMM(1), (%rdi)
testl %ecx, %ecx
/* Must fill at least 2x VEC. */
jnz L(zfill_vec1)
VMOVA VEC_SIZE(%rsi), %VMM(2)
VMOVU %VMM(2), VEC_SIZE(%rdi)
VPCMPEQ %VMM(2), %VZERO, %VMM(6)
vpmovmskb %VMM(6), %ecx
testl %ecx, %ecx
/* Must fill at least 1x VEC. */
jnz L(zfill_vec2)
/* Third VEC: loaded and tested, but stored only once we know which
   path finishes the copy.  */
VMOVA (VEC_SIZE * 2)(%rsi), %VMM(3)
VPCMPEQ %VMM(3), %VZERO, %VMM(6)
vpmovmskb %VMM(6), %ecx
/* Check if len is more 4x VEC. -CHAR_SIZE because rdx is len -
CHAR_SIZE. */
cmpq $(VEC_SIZE * 4 - CHAR_SIZE), %rdx
ja L(more_4x_vec)
/* %edx = offset of the last slot relative to the fourth VEC; a borrow
   means the last slot is still inside the third VEC.  */
subl $(VEC_SIZE * 3), %edx
jb L(ret_vec_x3_len)
testl %ecx, %ecx
jnz L(ret_vec_x3)
/* No null in the third VEC: store it and test the fourth VEC straight
   from memory.  tzcnt yields 32 when the mask is empty.  */
VPCMPEQ (VEC_SIZE * 3)(%rsi), %VZERO, %VMM(6)
VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi)
vpmovmskb %VMM(6), %ecx
tzcntl %ecx, %ecx
/* Last slot at or before the null: nothing to zero-fill.  CF (last
   slot strictly before the null) feeds the STPCPY return code.  */
cmpl %ecx, %edx
jbe L(ret_vec_x4_len_no_zfill)
/* Fall through (expectation) is copy len < buffer len. */
/* Zero the VEC_SIZE bytes ending with the last slot, then copy only up
   to the null.  CF is clear here because %edx > %ecx.  */
VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
movl %ecx, %edx
L(ret_vec_x4_len_no_zfill):
/* Copy the VEC_SIZE bytes ending with the character at offset %rdx of
   the fourth VEC.  */
VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
VMOVU %VMM(1), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
# ifdef USE_AS_STPCPY
/* Return %rdi + VEC_SIZE * 3 + %rdx, advanced by one character when
   CF is set.  */
# ifdef USE_AS_WCSCPY
setc %al
addq %rdx, %rdi
leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
# else
/* NOTE(review): %edx is used as the base register here whereas the
   matching x3 path uses %rdx; only %eax is written, so the low 32
   bits of the result are the same either way.  */
leal (VEC_SIZE * 3 + 0)(%edx), %eax
adcq %rdi, %rax
# endif
# endif
VZEROUPPER_RETURN
/* The last character slot lies inside the third VEC.  On entry %edx
   holds (offset of last slot) - 3 * VEC_SIZE and %ecx the null mask of
   the third VEC, which has not been stored yet.  */
L(ret_vec_x3_len):
/* %edx = offset of the last slot relative to the third VEC.  */
addl $(VEC_SIZE * 1), %edx
/* %ecx = offset of the null in the third VEC, 32 when there is none. */
tzcntl %ecx, %ecx
/* Last slot at or before the null: nothing to zero-fill.  CF (last
   slot strictly before the null) feeds the STPCPY return code.  */
cmpl %ecx, %edx
jbe L(ret_vec_x3_len_no_zfill)
/* Fall through (expectation) is copy len < buffer len. */
/* Zero the VEC_SIZE bytes that end with the last character slot.  */
VMOVU %VZERO, ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
L(ret_vec_x3_len_no_zfill_mov):
/* From here on copy only up to the null.  */
movl %ecx, %edx
# ifdef USE_AS_STPCPY
/* clear flags (CF), so the return code below adds no extra
   character. */
xorl %ecx, %ecx
# endif
.p2align 4,, 4
L(ret_vec_x3_len_no_zfill):
/* Copy the VEC_SIZE bytes ending with the character at offset %rdx of
   the third VEC.  */
VMOVU ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
VMOVU %VMM(1), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
# ifdef USE_AS_STPCPY
/* Return %rdi + VEC_SIZE * 2 + %rdx, advanced by one character when
   CF is set.  */
# ifdef USE_AS_WCSCPY
setc %al
addq %rdx, %rdi
leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
# else
leal (VEC_SIZE * 2 + 0)(%rdx), %eax
adcq %rdi, %rax
# endif
# endif
VZEROUPPER_RETURN
.p2align 4,, 8
/* A null was found in the third VEC (mask in %ecx) while the last
   character slot lies in the fourth VEC; %edx is the offset of that
   slot relative to the fourth VEC.  VMM(3) has not been stored yet. */
L(ret_vec_x3):
/* %ecx = offset of the null inside the third VEC.  */
bsfl %ecx, %ecx
/* Zero the VEC_SIZE bytes that end with the last character slot.  */
VMOVU %VZERO, (VEC_SIZE * 3 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx)
/* If the last-slot offset is smaller than the null offset, the zero
   store above already covers everything after the null; finish with
   one VEC copy that ends at the null.  */
subl %ecx, %edx
jl L(ret_vec_x3_len_no_zfill_mov)
/* Otherwise store the third VEC as is and zero one VEC starting at the
   null.  */
VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi)
VMOVU %VZERO, (VEC_SIZE * 2)(%rdi, %rcx)
# ifdef USE_AS_STPCPY
/* Return the address of the null in dst.  */
leaq (VEC_SIZE * 2)(%rdi, %rcx), %rax
# endif
VZEROUPPER_RETURN
.p2align 4,, 8
/* More than 4 * VEC_SIZE bytes remain from %rsi.  VMM(3) (third VEC)
   and its null mask (%ecx) are already loaded.  */
L(more_4x_vec):
VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi)
testl %ecx, %ecx
jnz L(zfill_vec3)
VMOVA (VEC_SIZE * 3)(%rsi), %VMM(4)
VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi)
VPCMPEQ %VMM(4), %VZERO, %VMM(6)
vpmovmskb %VMM(6), %ecx
testl %ecx, %ecx
jnz L(zfill_vec4)
/* Four VECs copied without finding a null.  Keep the relative length
   in %rcx, turn %rdx back into the address of the last source slot and
   %rdi into dst - src, and advance %rsi by four VECs.  */
movq %rdx, %rcx
addq %rsi, %rdx
subq %rsi, %rdi
subq $-(VEC_SIZE * 4), %rsi
/* Recheck length before aligning.  At most four more VECs remain:
   L(last_4x_vec) takes %rdi / %rdx in exactly this form.  */
cmpq $(VEC_SIZE * 8 - CHAR_SIZE), %rcx
jbe L(last_4x_vec)
/* Round %rsi down to a 4 * VEC_SIZE boundary for the main loop.  */
andq $(VEC_SIZE * -4), %rsi
/* Do first half of loop ahead of time so loop can just start by
storing. */
VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
/* The element-wise minimum of the four VECs has a zero element exactly
   where at least one of them has one, so a single compare detects a
   null anywhere in the four VECs; %r8d receives that combined mask. */
VPMIN %VMM(0), %VMM(1), %VMM(4)
VPMIN %VMM(2), %VMM(3), %VMM(6)
VPMIN %VMM(4), %VMM(6), %VMM(6)
VPCMPEQ %VMM(6), %VZERO, %VMM(6)
vpmovmskb %VMM(6), %r8d
/* %rdi = destination address that corresponds to %rsi.  */
addq %rsi, %rdi
testl %r8d, %r8d
jnz L(loop_4x_done)
/* Use r9 as end register.  The loop stops once %rsi reaches it, i.e.
   once at most four VECs remain.  */
leaq -(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9
.p2align 4,, 11
/* Main loop: four VECs per iteration.  On entry VMM(0)-VMM(3) hold the
   next four source VECs, already checked to contain no null; %rdx is
   the address of the last source slot and %r9 the loop end bound.  */
L(loop_4x_vec):
VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
/* Advance %rsi by four VECs (written as a subtraction of a negative
   immediate).  */
subq $(VEC_SIZE * -4), %rsi
VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
subq $(VEC_SIZE * -4), %rdi
/* At most four VECs left: finish on the per-VEC path, which expects
   the real destination address in %rdi.  */
cmpq %rsi, %r9
jbe L(loop_last_4x_vec)
VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
/* Combined null test over the four VECs via element-wise minimum;
   %r8d = combined mask.  */
VPMIN %VMM(0), %VMM(1), %VMM(4)
VPMIN %VMM(2), %VMM(3), %VMM(6)
VPMIN %VMM(4), %VMM(6), %VMM(6)
VPCMPEQ %VMM(6), %VZERO, %VMM(6)
vpmovmskb %VMM(6), %r8d
testl %r8d, %r8d
jz L(loop_4x_vec)
/* One of VMM(0)-VMM(3) contains a null (combined mask in %r8d).  Store
   the VECs one at a time and branch to the zero-fill entry that
   matches the first VEC holding a null.  */
L(loop_4x_done):
/* %rdx = byte offset, relative to %rsi, of the last character slot. */
subq %rsi, %rdx
VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
VPCMPEQ %VMM(0), %VZERO, %VMM(6)
vpmovmskb %VMM(6), %ecx
testl %ecx, %ecx
jnz L(zfill_vec1)
VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
VPCMPEQ %VMM(1), %VZERO, %VMM(6)
vpmovmskb %VMM(6), %ecx
testl %ecx, %ecx
jnz L(zfill_vec2)
VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
VPCMPEQ %VMM(2), %VZERO, %VMM(6)
vpmovmskb %VMM(6), %ecx
testl %ecx, %ecx
jnz L(zfill_vec3)
VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
/* The first three VECs hold no null, so the combined mask is exactly
   the mask of the fourth VEC.  Falls through into L(zfill_vec4).  */
movl %r8d, %ecx
// Zfill more....
.p2align 4,, 4
/* Zero-fill entry points for a null found in the fourth VEC
   (L(zfill_vec4)), the second VEC (L(zfill_vec2)) or the VEC at %rdi
   itself (L(zfill)).  %ecx is the null mask of that VEC, %rdx the
   offset of the last character slot relative to %rdi.  */
L(zfill_vec4):
/* Rebase by two VECs; the shift below accounts for one more.  */
addq $(VEC_SIZE * 2), %rdi
subq $(VEC_SIZE * 2), %rdx
L(zfill_vec2):
/* Move the mask up by VEC_SIZE bits so the bit index found below
   includes the one-VEC displacement.  */
shlq $VEC_SIZE, %rcx
L(zfill):
/* %rcx = offset of the null; afterwards %rdi points at the null in dst
   and %rdx is the distance from the null to the last slot.  */
bsfq %rcx, %rcx
subq %rcx, %rdx
addq %rcx, %rdi
# ifdef USE_AS_STPCPY
/* Return the address of the null in dst.  */
movq %rdi, %rax
# endif
/* Also entered from L(page_cross) with %rdi pointing at a zero
   character it has just written and %rax already set.  */
L(zfill_from_page_cross):
cmpq $VEC_SIZE, %rdx
jb L(zfill_less_vec_vzeroupper)
L(zfill_more_1x_vec):
/* One VEC of zeros right after the null and one ending with the last
   character slot; the two may overlap.  */
VMOVU %VZERO, CHAR_SIZE(%rdi)
VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
cmpq $(VEC_SIZE * 2), %rdx
jae L(zfill_more_2x_vec)
L(zfill_done0):
VZEROUPPER_RETURN
.p2align 4,, 8
/* Zero-fill entry points for a null found in the third VEC
   (L(zfill_vec3)) or the first VEC (L(zfill_vec1)).  %ecx is the null
   mask of that VEC, %rdx the offset of the last character slot
   relative to %rdi.  */
L(zfill_vec3):
addq $(VEC_SIZE * 2), %rdi
subq $(VEC_SIZE * 2), %rdx
.p2align 4,, 2
L(zfill_vec1):
/* %rdi = address of the null in dst, %rdx = distance from the null to
   the last slot.  */
bsfl %ecx, %ecx
addq %rcx, %rdi
subq %rcx, %rdx
# ifdef USE_AS_STPCPY
movq %rdi, %rax
# endif
/* zfill from vec1/vec3 must have to set at least 2x VECS. */
/* First VEC after the null and last VEC of the buffer.  */
VMOVU %VZERO, CHAR_SIZE(%rdi)
VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
cmpq $(VEC_SIZE * 2), %rdx
jb L(zfill_done0)
L(zfill_more_2x_vec):
/* Second-to-last VEC of the buffer and second VEC after the null.  */
VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx)
VMOVU %VZERO, (VEC_SIZE + CHAR_SIZE)(%rdi)
/* Done when the four stores so far cover the whole range.  */
subq $(VEC_SIZE * 4 - CHAR_SIZE), %rdx
jbe L(zfill_done)
/* %rdx = address 4 * VEC_SIZE before the end of the buffer.  Zero the
   third and fourth VEC after the null and the third- and fourth-to-
   last VECs of the buffer.  */
addq %rdi, %rdx
VMOVU %VZERO, (VEC_SIZE * 2 + CHAR_SIZE)(%rdi)
VMOVU %VZERO, (VEC_SIZE * 3 + CHAR_SIZE)(%rdi)
VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx)
/* Advance %rdi past the four head VECs; done if that already reaches
   the start of the tail stores.  */
subq $-(VEC_SIZE * 4 + CHAR_SIZE), %rdi
cmpq %rdi, %rdx
jbe L(zfill_done)
/* Align rdi down to VEC_SIZE and fill the gap with aligned stores,
   four VECs per iteration, until %rdi reaches %rdx.  */
andq $-(VEC_SIZE), %rdi
.p2align 4,, 12
L(zfill_loop_4x_vec):
VMOVA %VZERO, (VEC_SIZE * 0)(%rdi)
VMOVA %VZERO, (VEC_SIZE * 1)(%rdi)
VMOVA %VZERO, (VEC_SIZE * 2)(%rdi)
VMOVA %VZERO, (VEC_SIZE * 3)(%rdi)
subq $-(VEC_SIZE * 4), %rdi
cmpq %rdi, %rdx
ja L(zfill_loop_4x_vec)
L(zfill_done):
VZEROUPPER_RETURN
.p2align 4,, 8
/* The buffer is exactly one VEC (%rdx == VEC_SIZE - CHAR_SIZE).
   VMM(0) holds the first source VEC and %ecx its null mask.  */
L(copy_1x):
VMOVU %VMM(0), (%rdi)
testl %ecx, %ecx
jz L(ret_32_32)
L(zfill_less_vec):
bsfl %ecx, %ecx
/* Entered with %rcx already holding the offset of the null.  */
L(zfill_less_vec_no_bsf):
/* %rdi = address of the null in dst, %rdx = distance from the null to
   the last character slot.  */
subq %rcx, %rdx
addq %rcx, %rdi
# ifdef USE_AS_STPCPY
movq %rdi, %rax
# endif
/* Zero-fill of less than one VEC starting at the zero character that
   %rdi points to.  */
L(zfill_less_vec_vzeroupper):
COND_VZEROUPPER
/* We are taking advantage of the fact that to be here we must
be writing null-term as (%rdi, %rcx) we have a byte of lee-
way for overwriting. */
cmpl $16, %edx
jb L(zfill_less_16)
/* Two 16-byte zero stores: one starting at the null, one ending with
   the last character slot; they may overlap.  */
VMOVU %VZERO_128, (%rdi)
VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx)
ret
# ifdef USE_AS_STPCPY
/* Exactly one VEC copied and no null found: return the address just
   past the last character slot.  */
L(ret_32_32):
leaq CHAR_SIZE(%rdi, %rdx), %rax
VZEROUPPER_RETURN
# endif
.p2align 4,, 4
/* Buffer smaller than one VEC with %rdx >= 16.  %ecx holds the offset
   of the first null in the first VEC (32 when there is none).
   NOTE(review): %xmm0 is presumably the low 128 bits of VMM(0), which
   holds the first source VEC - confirm against the VMM definition.  */
L(copy_16_31):
/* Overfill to avoid branches. */
/* Copy the first 16 bytes and the 16 bytes ending with the last
   character slot; the two may overlap.  */
vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
vmovdqu %xmm0, (%rdi)
vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx)
/* Null before the last slot: zero-fill from the null onwards.  CF
   (last slot strictly before the null) feeds the return code below. */
cmpl %ecx, %edx
ja L(zfill_less_vec_no_bsf)
# ifndef USE_AS_STPCPY
L(ret_32_32):
# else
/* Return %rdi + %rdx, advanced by one character when CF is set.  */
# ifdef USE_AS_WCSCPY
setc %al
addq %rdx, %rdi
leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
movl %edx, %eax
adcq %rdi, %rax
# endif
# endif
VZEROUPPER_RETURN
.p2align 4,, 4
/* Buffer with 8 <= %rdx < 16.  %ecx holds the offset of the first null
   in the first VEC (32 when there is none).  */
L(copy_8_15):
/* Overfill to avoid branches. */
/* Copy the first 8 bytes and the 8 bytes ending with the last
   character slot.  %rsi is reused as scratch for the tail data.  */
movq -(8 - CHAR_SIZE)(%rsi, %rdx), %rsi
vmovq %xmm0, (%rdi)
movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx)
cmpl %ecx, %edx
jbe L(ret_8_15)
/* Null before the last slot: %rdi = address of the null in dst, %rdx =
   distance from the null to the last slot.  */
subq %rcx, %rdx
addq %rcx, %rdi
# ifdef USE_AS_STPCPY
movq %rdi, %rax
# endif
.p2align 4,, 8
/* Zero-fill of less than 16 bytes starting at the zero character that
   %rdi points to.  Leaves %ecx == 0 for L(zfill_less_8).  */
L(zfill_less_16):
xorl %ecx, %ecx
cmpl $8, %edx
jb L(zfill_less_8)
/* Two 8-byte zero stores: one starting at the null, one ending with
   the last character slot.  */
movq %rcx, (%rdi)
movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx)
# ifndef USE_AS_STPCPY
L(ret_8_15):
# endif
ret
.p2align 4,, 8
/* The whole buffer is at most one VEC (%rdx <= VEC_SIZE - CHAR_SIZE).
   VMM(0) holds the first source VEC and %ecx its null mask.  */
L(less_1x_vec):
/* Reuse flag from `cmpq $(VEC_SIZE - CHAR_SIZE), %rdx`. The idea is
many buffer sizes are aligned conventionally. */
je L(copy_1x)
/* %ecx = offset of the first null, 32 when the mask is empty.  */
tzcntl %ecx, %ecx
cmpl $16, %edx
jae L(copy_16_31)
/* The remaining small-size paths return with a plain `ret`.  */
COND_VZEROUPPER
cmpl $8, %edx
jae L(copy_8_15)
/* Buffers with %rdx < 8.  %ecx holds the offset of the first null in
   the first VEC (32 when there is none).  */
# ifdef USE_AS_WCSCPY
/* Wide characters: the buffer is one or two characters.  A null at
   offset 0 means every slot becomes zero.  */
testl %ecx, %ecx
jz L(zfill_less_8_set_ret)
/* Copy the first and the last character (the same slot when %rdx is
   0).  %esi is reused as scratch.  */
movl (%rsi, %rdx), %esi
vmovd %xmm0, (%rdi)
movl %esi, (%rdi, %rdx)
# ifdef USE_AS_STPCPY
/* CF = last slot strictly before the null.  */
cmpl %ecx, %edx
L(ret_8_15):
/* Return %rdi + %rdx, advanced by one character when CF is set.  */
setc %al
addq %rdx, %rdi
leaq (%rdi, %rax, CHAR_SIZE), %rax
# endif
ret
L(zfill_less_8_set_ret):
xorl %ecx, %ecx
# ifdef USE_AS_STPCPY
movq %rdi, %rax
# endif
/* Zero the first and the last character slot; expects %ecx == 0.  */
L(zfill_less_8):
movl %ecx, (%rdi)
movl %ecx, (%rdi, %rdx)
ret
# else
/* Byte characters.  */
cmpl $3, %edx
jb L(copy_0_3)
/* Overfill to avoid branches. */
/* 3 <= %rdx < 8: copy the first 4 bytes and the 4 bytes ending with
   the last slot.  %esi is reused as scratch.  */
movl -3(%rsi, %rdx), %esi
vmovd %xmm0, (%rdi)
movl %esi, -3(%rdi, %rdx)
cmpl %ecx, %edx
jbe L(ret_4_7)
/* Null before the last slot: %rdi = address of the null in dst, %rdx =
   distance from the null to the last slot.  */
subq %rcx, %rdx
addq %rcx, %rdi
# ifdef USE_AS_STPCPY
movq %rdi, %rax
# endif
xorl %ecx, %ecx
.p2align 4,, 8
/* Zero-fill of less than 8 bytes starting at the zero character that
   %rdi points to; expects %ecx == 0.  */
L(zfill_less_8):
cmpl $3, %edx
jb L(zfill_less_3)
/* Two 4-byte zero stores: one starting at the null, one ending with
   the last slot.  */
movl %ecx, (%rdi)
movl %ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
/* %rax was set by whoever located the null.  */
ret
# endif
L(ret_4_7):
# ifdef USE_AS_STPCPY
L(ret_8_15):
/* Return %rdi + %rdx + CF, with CF left by the preceding
   `cmpl %ecx, %edx`.  */
movl %edx, %eax
adcq %rdi, %rax
# endif
ret
.p2align 4,, 4
/* Zero-fill with %rdx in 0..2; expects %ecx == 0.  */
L(zfill_less_3):
testl %edx, %edx
jz L(zfill_1)
movw %cx, (%rdi)
L(zfill_1):
movb %cl, (%rdi, %rdx)
ret
.p2align 4,, 8
/* Buffer with %rdx in 0..2 (one to three bytes).  */
L(copy_0_3):
/* %r8d = first four bytes of the first source VEC.  */
vmovd %xmm0, %r8d
testl %edx, %edx
jz L(copy_1)
/* Store the first two bytes.  */
movw %r8w, (%rdi)
/* Null before the last slot: zero the last two bytes instead.  */
cmpl %ecx, %edx
ja L(zfill_from_1)
/* Fetch the byte for the last slot.  */
movzbl (%rsi, %rdx), %r8d
# ifdef USE_AS_STPCPY
/* Return %rdi + %rdx + CF, with CF from the compare above.  */
movl %edx, %eax
adcq %rdi, %rax
movb %r8b, (%rdi, %rdx)
ret
# endif
/* Store %r8b into the last slot (the only slot when %rdx is 0).  */
L(copy_1):
# ifdef USE_AS_STPCPY
movl %edx, %eax
cmpl %ecx, %edx
adcq %rdi, %rax
# endif
/* NOTE(review): this USE_AS_WCSCPY arm sits inside the `# else`
   (non-WCSCPY) region opened above, so it appears never to be
   assembled - confirm before relying on it.  */
# ifdef USE_AS_WCSCPY
vmovd %xmm0, (%rdi)
# else
movb %r8b, (%rdi, %rdx)
# endif
ret
# endif
.p2align 4,, 2
/* Target of the entry length filter.  It is reached both for n == 0
   and for lengths the filter considers too large, with %rdx holding
   n - 1 (still in characters for the wide-character build, since the
   branch is taken before the conversion to bytes).  */
L(zero_len):
/* Undo the entry `decq`: %rdx is n again and ZF is set only when n was
   exactly zero.  Any other value is an oversized length and must take
   the best-effort path instead of silently copying nothing.  */
incq %rdx
jnz L(best_effort_strncpy)
/* n == 0: nothing to copy or fill, return dst.  */
movq %rdi, %rax
ret
# ifndef USE_AS_WCSCPY
.p2align 4,, 8
/* Reached from L(copy_0_3) when the null (offset %ecx) lies before the
   last slot (offset %rdx, 1 or 2): the first two bytes of dst have
   already been stored.  */
L(zfill_from_1):
# ifdef USE_AS_STPCPY
/* Return the address of the null in dst.  */
leaq (%rdi, %rcx), %rax
# endif
/* Zero the last two bytes of the buffer.  */
movw $0, -1(%rdi, %rdx)
ret
# endif
.p2align 4,, 4
.p2align 6,, 8
/* src lies in the last VEC_SIZE bytes of its page, so an unaligned
   VEC load from src would touch the following page.  Inspect the
   aligned VEC containing src instead.  */
L(page_cross):
movq %rsi, %rax
andq $(VEC_SIZE * -1), %rax
VPCMPEQ (%rax), %VZERO, %VMM(6)
vpmovmskb %VMM(6), %ecx
/* Drop the mask bits of the bytes that precede src (shrx uses only the
   low bits of %esi as the count), so that bit i corresponds to
   src[i].  */
shrxl %esi, %ecx, %ecx
/* %eax = number of bytes from src up to the next VEC_SIZE boundary. */
subl %esi, %eax
andl $(VEC_SIZE - 1), %eax
/* The last character slot lies before that boundary: handle the whole
   buffer with rep movs / rep stos.  */
cmpq %rax, %rdx
jb L(page_cross_small)
/* Optimizing more aggressively for space as this is very cold
code. This saves 2x cache lines. */
/* Shift the mask up by CHAR_SIZE bits so that the bit index found
   below is the byte count up to and including the null.  If the mask
   is zero there is no null before the boundary: resume on the normal
   path.  Otherwise fall through.  */
shl $CHAR_SIZE, %ecx
jz L(page_cross_continue)
/* %rcx = number of bytes to copy, null included.  */
bsf %ecx, %ecx
/* Remove the copied bytes from the remaining length.  */
subq %rcx, %rdx
# ifdef USE_AS_STPCPY
/* Return the address of the null in dst.  */
leaq -CHAR_SIZE(%rdi, %rcx), %rax
# else
movq %rdi, %rax
# endif
/* Copy %rcx bytes; %rdi and %rsi end up just past the null.  */
rep movsb
/* Write a zero character at the new %rdi so that the shared zero-fill
   code, which expects %rdi to point at a zero character, can take
   over from here.  */
# ifdef USE_AS_WCSCPY
movl $0, (%rdi)
# else
movb $0, (%rdi)
# endif
jmp L(zfill_from_page_cross)
/* Page-cross case in which the last character slot lies before the
   next VEC_SIZE boundary.  %ecx is the null mask relative to src and
   %rdx the offset of the last character slot.
   NOTE(review): this path ends in a plain `ret` although VMM(6) was
   written in L(page_cross); confirm whether a VZEROUPPER is intended
   here.  */
L(page_cross_small):
/* %ecx = offset of the first null, 32 when there is none.  */
tzcntl %ecx, %ecx
/* %al = 0 is the fill value for `rep stosb`; it also clears %eax for
   the `setc %al` return code.  */
xorl %eax, %eax
/* Last slot at or before the null: copy only.  CF = last slot strictly
   before the null.  */
cmpl %ecx, %edx
jbe L(page_cross_copy_only)
/* Do a zfill of the tail before copying. */
/* Save dst and the null offset, then zero the bytes that follow the
   null up to and including the last character slot.  */
movq %rdi, %r9
movl %ecx, %r8d
subl %ecx, %edx
leaq CHAR_SIZE(%rdi, %rcx), %rdi
movl %edx, %ecx
rep stosb
/* Restore dst; the copy now ends at the null.  CF is clear here: the
   `subl` above did not borrow and nothing since has changed it.  */
movq %r9, %rdi
movl %r8d, %edx
L(page_cross_copy_only):
/* %ecx = bytes to copy, i.e. up to and including offset %rdx.  `lea`
   leaves the flags untouched for the return code below.  */
leal CHAR_SIZE(%rdx), %ecx
# ifdef USE_AS_STPCPY
/* Return %rdi + %rdx, advanced by one character when CF is set.  */
# ifdef USE_AS_WCSCPY
setc %al
addq %rdi, %rdx
leaq (%rdx, %rax, CHAR_SIZE), %rax
# else
movl %edx, %eax
adcq %rdi, %rax
# endif
# else
movq %rdi, %rax
# endif
rep movsb
ret
/* Handling for lengths too large to be a real buffer.  Expects %rdx to
   hold the element count and %rdi / %rsi to hold dst / src.  Zero-sets
   %rdx elements starting at dst with `rep stos`, then restores dst and
   tail-jumps to OVERFLOW_STRCPY, which receives the original %rdi and
   %rsi.  */
L(best_effort_strncpy):
/* %rcx = element count for rep stos, %eax = fill value 0, %r8 = saved
   dst (rep stos advances %rdi).  */
movq %rdx, %rcx
xorl %eax, %eax
movq %rdi, %r8
/* The length is >= 2^63. We very much so expect to segfault at
rep stos. If that doesn't happen then just strcpy to finish.
*/
# ifdef USE_AS_WCSCPY
rep stosl
# else
rep stosb
# endif
movq %r8, %rdi
jmp OVERFLOW_STRCPY
END(STRNCPY)
#endif