glibc/sysdeps/x86_64/multiarch/strncpy-evex.S
H.J. Lu e5672763c4 x86-64 strncpy: Properly handle the length parameter [BZ# 29839]
On x32, the size_t parameter may be passed in the lower 32 bits of a
64-bit register with non-zero upper 32 bits.  String/memory functions
written in assembly must either use only the lower 32 bits of a
64-bit register as the length, or clear the upper 32 bits before
using the full 64-bit register as the length.

This patch fixes strncpy for x32.  Tested on x86-64 and x32.  On x86-64,
libc.so is the same with and without the fix.
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
2022-12-02 08:18:41 -08:00
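The fix is the standard x32 zero-extension idiom: a 32-bit
register-to-register move clears bits 63:32, so the length can then be
used safely as a full 64-bit value.  The same guard appears in the
function prologue below:

# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
# endif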


/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
Copyright (C) 2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <isa-level.h>
#if ISA_SHOULD_BUILD (4)
/* Use evex-masked stores for small sizes. Turned off at the
moment. */
# define USE_EVEX_MASKED_STORE 0
# include <sysdep.h>
# ifndef VEC_SIZE
# include "x86-evex256-vecs.h"
# endif
# ifndef STRNCPY
# define STRNCPY __strncpy_evex
# endif
# ifdef USE_AS_WCSCPY
# define VMOVU_MASK vmovdqu32
# define VPCMPEQ vpcmpeqd
# define VPMIN vpminud
# define VPTESTN vptestnmd
# define VPTEST vptestmd
# define CHAR_SIZE 4
# define REP_MOVS rep movsd
# define REP_STOS rep stosl
# define USE_WIDE_CHAR
# else
# define VMOVU_MASK vmovdqu8
# define VPCMPEQ vpcmpeqb
# define VPMIN vpminub
# define VPTESTN vptestnmb
# define VPTEST vptestmb
# define CHAR_SIZE 1
# define REP_MOVS rep movsb
# define REP_STOS rep stosb
# endif
# include "strncpy-or-cat-overflow-def.h"
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
# include "reg-macros.h"
# define VZERO VMM(7)
# define VZERO_256 VMM_256(7)
# define VZERO_128 VMM_128(7)
# if VEC_SIZE == 64
# define VZERO_HALF VZERO_256
# else
# define VZERO_HALF VZERO_128
# endif
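/* Overview: copy at most N CHARs from src to dst.  If a null CHAR is
found before N CHARs have been copied, the remainder of the buffer is
zero-filled (the L(zfill_*) paths below).  The STPCPY variants adjust
the return value accordingly. */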
.section SECTION(.text), "ax", @progbits
ENTRY(STRNCPY)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
# endif
/* Filter out zero-length strings and very long strings.  Zero-length
strings just return; very long strings are handled by running
rep stos{b|l} to zero-fill the destination (which will almost
certainly segfault), and if that succeeds then just calling
OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy). */
# ifdef USE_AS_WCSCPY
decq %rdx
movq %rdx, %rax
/* Bits 56 and above are past the end of the supported address space
(this also catches a zero length, which wrapped to -1 above). */
shr $56, %rax
jnz L(zero_len)
# else
decq %rdx
/* If the flag needs to become `jb` replace `dec` with `sub`.
*/
jl L(zero_len)
# endif
vpxorq %VZERO_128, %VZERO_128, %VZERO_128
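/* Check whether an unaligned VEC_SIZE load from src would cross a
page boundary; if so, take the page-cross path below. */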
movl %esi, %eax
andl $(PAGE_SIZE - 1), %eax
cmpl $(PAGE_SIZE - VEC_SIZE), %eax
ja L(page_cross)
L(page_cross_continue):
VMOVU (%rsi), %VMM(0)
VPTESTN %VMM(0), %VMM(0), %k0
KMOV %k0, %VRCX
/* If no STPCPY just save end ahead of time. */
# ifndef USE_AS_STPCPY
movq %rdi, %rax
# endif
cmpq $(CHAR_PER_VEC), %rdx
/* If USE_EVEX_MASKED_STORE is enabled then we just handle length
<= CHAR_PER_VEC with masked instructions (which have
potential for dramatically bad perf if dst splits a page and
is not in the TLB). */
# if USE_EVEX_MASKED_STORE
/* `jae` because length rdx is now length - 1. */
jae L(more_1x_vec)
/* If there were multiple zero-CHAR matches in the first VEC,
VRCX will be overset, but that's fine since any oversets were
at zero positions anyway. */
# ifdef USE_AS_STPCPY
tzcnt %VRCX, %VRAX
cmpl %eax, %edx
cmovb %edx, %eax
# ifdef USE_AS_WCSCPY
adcl $0, %eax
leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
adcq %rdi, %rax
# endif
# endif
dec %VRCX
/* Zero out all non-zero CHAR's after the first zero match. */
KMOV %VRCX, %k1
/* Use VZERO as the destination so this can be reused for
L(zfill_less_vec) (which, if jumped to by subsequent logic,
will have zeroed out VZERO). */
VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
L(zfill_less_vec):
/* Get mask for what we need to set. */
incl %edx
mov $-1, %VRCX
bzhi %VRDX, %VRCX, %VRCX
KMOV %VRCX, %k1
VMOVU_MASK %VZERO, (%rdi){%k1}
ret
.p2align 4,, 4
L(zero_len):
cmpq $-1, %rdx
jne L(best_effort_strncpy)
movq %rdi, %rax
ret
.p2align 4,, 8
L(more_1x_vec):
# else
/* `jb` because length rdx is now length - 1. */
jb L(less_1x_vec)
# endif
/* This may overset, but that's fine because we still need to zero
fill. */
VMOVU %VMM(0), (%rdi)
/* Length must be >= CHAR_PER_VEC so match here means we must
zero-fill. */
test %VRCX, %VRCX
jnz L(zfill)
/* We are going to align rsi here so we will need to be able to re-
adjust rdi/rdx afterwards.  NB: We filtered out huge lengths
so rsi + rdx * CHAR_SIZE cannot overflow. */
leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
subq %rsi, %rdi
andq $-(VEC_SIZE), %rsi
L(loop_last_4x_vec):
addq %rsi, %rdi
subq %rsi, %rdx
# ifdef USE_AS_WCSCPY
shrq $2, %rdx
# endif
VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1)
VPTESTN %VMM(1), %VMM(1), %k0
KMOV %k0, %VRCX
/* -1 because of the `dec %rdx` earlier. */
cmpq $(CHAR_PER_VEC * 2 - 1), %rdx
ja L(more_2x_vec)
L(last_2x_vec):
/* This will need to be computed no matter what.  We do it
ahead of time for CHAR_PER_VEC == 64 because we can't adjust
the value of `tzcnt` with a shift. */
# if CHAR_PER_VEC == 64
tzcntq %rcx, %rcx
# endif
cmpl $(CHAR_PER_VEC), %edx
jb L(ret_vec_x1_len)
/* Separate logic for CHAR_PER_VEC == 64 because we already did
`tzcnt` on VRCX. */
# if CHAR_PER_VEC == 64
/* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`. */
cmpb $CHAR_PER_VEC, %cl
jnz L(ret_vec_x1_no_bsf)
# else
test %VRCX, %VRCX
jnz L(ret_vec_x1)
# endif
VPCMPEQ (VEC_SIZE * 2)(%rsi), %VZERO, %k0
VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
KMOV %k0, %VRCX
# if CHAR_PER_VEC < 64
/* This essentially adds CHAR_PER_VEC to the computed result. */
shlq $CHAR_PER_VEC, %rcx
# else
tzcntq %rcx, %rcx
addl $CHAR_PER_VEC, %ecx
# endif
.p2align 4,, 4
L(ret_vec_x1_len):
/* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
already been done. */
# if CHAR_PER_VEC < 64
tzcntq %rcx, %rcx
# endif
cmpl %ecx, %edx
jbe L(ret_vec_x1_len_no_zfill)
/* The expected fall-through case is copy len < buffer len. */
VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
L(ret_vec_x1_len_no_zfill_mov):
movl %ecx, %edx
# ifdef USE_AS_STPCPY
/* clear flags. */
xorl %ecx, %ecx
# endif
L(ret_vec_x1_len_no_zfill):
VMOVU ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
VMOVU %VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
adcq $0, %rdx
leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
# else
leal (VEC_SIZE)(%rdx), %eax
adcq %rdi, %rax
# endif
# endif
ret
.p2align 4,, 10
L(ret_vec_x1):
bsf %VRCX, %VRCX
L(ret_vec_x1_no_bsf):
VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
subl %ecx, %edx
cmpl $CHAR_PER_VEC, %edx
jb L(ret_vec_x1_len_no_zfill_mov)
/* The expected fall-through case is copy len < buffer len. */
VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
VMOVU %VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
leaq (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
ret
.p2align 4,, 8
L(last_4x_vec):
/* Separate logic for CHAR_PER_VEC == 64 because the `andl
$(CHAR_PER_VEC * 4 - 1), %edx` can be done with less code size
by just using `movzbl`. */
# if CHAR_PER_VEC == 64
movzbl %dl, %edx
# else
andl $(CHAR_PER_VEC * 4 - 1), %edx
# endif
VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1)
VPTESTN %VMM(1), %VMM(1), %k0
KMOV %k0, %VRCX
subq $-(VEC_SIZE * 4), %rsi
subq $-(VEC_SIZE * 4), %rdi
cmpl $(CHAR_PER_VEC * 2 - 1), %edx
jbe L(last_2x_vec)
.p2align 4,, 8
L(more_2x_vec):
VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
test %VRCX, %VRCX
/* Must fill at least 2x VEC. */
jnz L(zfill_vec1)
VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi)
VPTESTN %VMM(2), %VMM(2), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
/* Must fill at least 1x VEC. */
jnz L(zfill_vec2)
VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3)
VPTESTN %VMM(3), %VMM(3), %k0
KMOV %k0, %VRCX
/* Check if len is more than 4x VEC.  -1 because rdx is len - 1. */
cmpq $(CHAR_PER_VEC * 4 - 1), %rdx
ja L(more_4x_vec)
subl $(CHAR_PER_VEC * 3), %edx
jb L(ret_vec_x3_len)
test %VRCX, %VRCX
jnz L(ret_vec_x3)
VPCMPEQ (VEC_SIZE * 4)(%rsi), %VZERO, %k0
VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
KMOV %k0, %VRCX
tzcnt %VRCX, %VRCX
cmpl %ecx, %edx
jbe L(ret_vec_x4_len_no_zfill)
/* The expected fall-through case is copy len < buffer len. */
VMOVU %VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
movl %ecx, %edx
L(ret_vec_x4_len_no_zfill):
VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
adcq $0, %rdx
leaq (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
# else
leal (VEC_SIZE * 4 + 0)(%rdx), %eax
adcq %rdi, %rax
# endif
# endif
ret
L(ret_vec_x3_len):
addl $(CHAR_PER_VEC * 1), %edx
tzcnt %VRCX, %VRCX
cmpl %ecx, %edx
jbe L(ret_vec_x3_len_no_zfill)
/* The expected fall-through case is copy len < buffer len. */
VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
L(ret_vec_x3_len_no_zfill_mov):
movl %ecx, %edx
# ifdef USE_AS_STPCPY
/* clear flags. */
xorl %ecx, %ecx
# endif
.p2align 4,, 4
L(ret_vec_x3_len_no_zfill):
VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
VMOVU %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
adcq $0, %rdx
leaq (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
# else
leal (VEC_SIZE * 3 + 0)(%rdx), %eax
adcq %rdi, %rax
# endif
# endif
ret
.p2align 4,, 8
L(ret_vec_x3):
bsf %VRCX, %VRCX
VMOVU %VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
subl %ecx, %edx
jl L(ret_vec_x3_len_no_zfill_mov)
VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
VMOVU %VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
ret
.p2align 4,, 8
L(more_4x_vec):
VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
test %VRCX, %VRCX
jnz L(zfill_vec3)
VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi)
VPTESTN %VMM(4), %VMM(4), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(zfill_vec4)
/* Recheck length before aligning. */
cmpq $(CHAR_PER_VEC * 8 - 1), %rdx
jbe L(last_4x_vec)
/* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi. */
# ifdef USE_AS_WCSCPY
leaq (%rsi, %rdx, CHAR_SIZE), %rdx
# else
addq %rsi, %rdx
# endif
subq %rsi, %rdi
subq $-(VEC_SIZE * 5), %rsi
andq $(VEC_SIZE * -4), %rsi
/* Load first half of the loop before entry. */
VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
VPMIN %VMM(0), %VMM(1), %VMM(4)
VPMIN %VMM(2), %VMM(3), %VMM(6)
VPTESTN %VMM(4), %VMM(4), %k2
VPTESTN %VMM(6), %VMM(6), %k4
/* Offset rsi by VEC_SIZE so that we can jump to
L(loop_last_4x_vec). */
addq $-(VEC_SIZE), %rsi
KORTEST %k2, %k4
jnz L(loop_4x_done)
/* Store loop end in r9. */
leaq -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
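/* Main loop: copy 4x VEC per iteration.  VPMIN folds the four source
vectors into two so that two VPTESTN + KORTEST detect a zero CHAR in
any of them. */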
.p2align 4,, 11
L(loop_4x_vec):
VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
subq $(VEC_SIZE * -4), %rsi
cmpq %rsi, %r9
jbe L(loop_last_4x_vec)
VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
VPMIN %VMM(0), %VMM(1), %VMM(4)
VPMIN %VMM(2), %VMM(3), %VMM(6)
VPTESTN %VMM(4), %VMM(4), %k2
VPTESTN %VMM(6), %VMM(6), %k4
KORTEST %k2, %k4
jz L(loop_4x_vec)
L(loop_4x_done):
/* Restore rdx (length). */
subq %rsi, %rdx
# ifdef USE_AS_WCSCPY
shrq $2, %rdx
# endif
VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
/* Restore rdi (dst). */
addq %rsi, %rdi
VPTESTN %VMM(0), %VMM(0), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(zfill_vec1)
VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
KMOV %k2, %VRCX
test %VRCX, %VRCX
jnz L(zfill_vec2)
VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
VPTESTN %VMM(2), %VMM(2), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(zfill_vec3)
VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
KMOV %k4, %VRCX
/* A zero CHAR must be in VEC(3) (k4 covers VEC(2)/VEC(3) and VEC(2)
was already checked above), so fall through to the zero-fill code. */
.p2align 4,, 4
L(zfill_vec4):
subq $(VEC_SIZE * -2), %rdi
addq $(CHAR_PER_VEC * -2), %rdx
L(zfill_vec2):
subq $(VEC_SIZE * -2), %rdi
addq $(CHAR_PER_VEC * -1), %rdx
L(zfill):
/* VRCX must be non-zero. */
bsf %VRCX, %VRCX
/* Adjust length / dst for zfill. */
subq %rcx, %rdx
# ifdef USE_AS_WCSCPY
leaq (%rdi, %rcx, CHAR_SIZE), %rdi
# else
addq %rcx, %rdi
# endif
# ifdef USE_AS_STPCPY
movq %rdi, %rax
# endif
L(zfill_from_page_cross):
/* From here on out it's just memset(rdi, 0, rdx). */
cmpq $CHAR_PER_VEC, %rdx
jb L(zfill_less_vec)
L(zfill_more_1x_vec):
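/* Zero-fill with a VEC store at the start and a potentially
overlapping VEC store aligned to the end of the fill region; larger
fills continue at L(zfill_more_2x_vec). */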
VMOVU %VZERO, (%rdi)
VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
cmpq $(CHAR_PER_VEC * 2 - 1), %rdx
ja L(zfill_more_2x_vec)
L(zfill_done0):
ret
/* Coming from vec1/vec2 we must be able to zfill at least 2x
VEC. */
.p2align 4,, 8
L(zfill_vec3):
subq $(VEC_SIZE * -2), %rdi
addq $(CHAR_PER_VEC * -2), %rdx
.p2align 4,, 2
L(zfill_vec1):
bsfq %rcx, %rcx
/* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
*/
leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
subq %rcx, %rdx
# ifdef USE_AS_STPCPY
movq %rdi, %rax
# endif
VMOVU %VZERO, (%rdi)
VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
cmpq $(CHAR_PER_VEC * 2), %rdx
jb L(zfill_done0)
L(zfill_more_2x_vec):
VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
VMOVU %VZERO, (VEC_SIZE)(%rdi)
subq $(CHAR_PER_VEC * 4 - 1), %rdx
jbe L(zfill_done)
# ifdef USE_AS_WCSCPY
leaq (%rdi, %rdx, CHAR_SIZE), %rdx
# else
addq %rdi, %rdx
# endif
VMOVU %VZERO, (VEC_SIZE * 2)(%rdi)
VMOVU %VZERO, (VEC_SIZE * 3)(%rdi)
VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx)
subq $-(VEC_SIZE * 4), %rdi
cmpq %rdi, %rdx
jbe L(zfill_done)
/* Align rdi, then zero-fill in a 4x VEC loop. */
andq $-(VEC_SIZE), %rdi
.p2align 4,, 12
L(zfill_loop_4x_vec):
VMOVA %VZERO, (VEC_SIZE * 0)(%rdi)
VMOVA %VZERO, (VEC_SIZE * 1)(%rdi)
VMOVA %VZERO, (VEC_SIZE * 2)(%rdi)
VMOVA %VZERO, (VEC_SIZE * 3)(%rdi)
subq $-(VEC_SIZE * 4), %rdi
cmpq %rdi, %rdx
ja L(zfill_loop_4x_vec)
L(zfill_done):
ret
/* Less than 1x VEC case if we are not using evex masked stores. */
# if !USE_EVEX_MASKED_STORE
.p2align 4,, 8
L(copy_1x):
/* Special case for copy 1x. It can be handled quickly and many
buffer sizes have convenient alignment. */
VMOVU %VMM(0), (%rdi)
/* If no zeros then we are done. */
testl %ecx, %ecx
jz L(ret_1x_1x)
/* Need to zfill.  Note we know that length <= CHAR_PER_VEC so we
only handle the small case here. */
bsf %VRCX, %VRCX
L(zfill_less_vec_no_bsf):
/* Adjust length / dst then just zfill less_vec. */
subq %rcx, %rdx
# ifdef USE_AS_WCSCPY
leaq (%rdi, %rcx, CHAR_SIZE), %rdi
# else
addq %rcx, %rdi
# endif
# ifdef USE_AS_STPCPY
movq %rdi, %rax
# endif
L(zfill_less_vec):
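/* Zero-fill a small region with two potentially overlapping half-VEC
stores; regions smaller than half a VEC branch to L(zfill_less_half). */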
cmpl $((VEC_SIZE / 2) / CHAR_SIZE), %edx
jb L(zfill_less_half)
VMOVU %VZERO_HALF, (%rdi)
VMOVU %VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
ret
# ifdef USE_AS_STPCPY
L(ret_1x_1x):
leaq CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
ret
# endif
# if VEC_SIZE == 64
.p2align 4,, 4
L(copy_32_63):
/* Overfill to avoid branches. */
VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
VMOVU %VMM_256(0), (%rdi)
VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
/* We are taking advantage of the fact that to be here we must
be writing the null terminator at (%rdi, %rcx), so we have a
byte of leeway for overwriting. */
cmpl %ecx, %edx
ja L(zfill_less_vec_no_bsf)
# ifndef USE_AS_STPCPY
L(ret_1x_1x):
# else
# ifdef USE_AS_WCSCPY
adcq $0, %rdx
leaq (%rdi, %rdx, CHAR_SIZE), %rax
# else
movl %edx, %eax
adcq %rdi, %rax
# endif
# endif
ret
# endif
.p2align 4,, 4
L(copy_16_31):
/* Overfill to avoid branches. */
vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
VMOVU %VMM_128(0), (%rdi)
vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
cmpl %ecx, %edx
/* Separate logic depending on VEC_SIZE.  If VEC_SIZE == 64 then
we have a larger copy block for 32-63, so this just falls
through to zfill 16-31.  If VEC_SIZE == 32 then we check for
a full zfill of less than 1x VEC. */
# if VEC_SIZE == 64
jbe L(ret_16_31)
subl %ecx, %edx
# ifdef USE_AS_WCSCPY
leaq (%rdi, %rcx, CHAR_SIZE), %rdi
# else
addq %rcx, %rdi
# endif
# ifdef USE_AS_STPCPY
movq %rdi, %rax
# endif
L(zfill_less_half):
L(zfill_less_32):
cmpl $(16 / CHAR_SIZE), %edx
jb L(zfill_less_16)
VMOVU %VZERO_128, (%rdi)
VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
ret
# endif
L(ret_16_31):
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
adcq $0, %rdx
leaq (%rdi, %rdx, CHAR_SIZE), %rax
# else
movl %edx, %eax
adcq %rdi, %rax
# endif
# endif
ret
# else
/* VEC_SIZE == 32 begins. */
ja L(zfill_less_vec_no_bsf)
# ifndef USE_AS_STPCPY
L(ret_1x_1x):
# else
# ifdef USE_AS_WCSCPY
adcq $0, %rdx
leaq (%rdi, %rdx, CHAR_SIZE), %rax
# else
movl %edx, %eax
adcq %rdi, %rax
# endif
# endif
ret
# endif
.p2align 4,, 4
L(copy_8_15):
/* Overfill to avoid branches. */
movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
vmovq %VMM_128(0), (%rdi)
movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
cmpl %ecx, %edx
jbe L(ret_8_15)
subl %ecx, %edx
# ifdef USE_AS_WCSCPY
leaq (%rdi, %rcx, CHAR_SIZE), %rdi
# else
addq %rcx, %rdi
# endif
# ifdef USE_AS_STPCPY
movq %rdi, %rax
# endif
.p2align 4,, 8
# if VEC_SIZE == 32
L(zfill_less_half):
# endif
L(zfill_less_16):
xorl %ecx, %ecx
cmpl $(8 / CHAR_SIZE), %edx
jb L(zfill_less_8)
movq %rcx, (%rdi)
movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
# ifndef USE_AS_STPCPY
L(ret_8_15):
# endif
ret
.p2align 4,, 8
L(less_1x_vec):
je L(copy_1x)
/* We will need the `tzcnt` result for all other copy sizes. */
tzcnt %VRCX, %VRCX
# if VEC_SIZE == 64
cmpl $(32 / CHAR_SIZE), %edx
jae L(copy_32_63)
# endif
cmpl $(16 / CHAR_SIZE), %edx
jae L(copy_16_31)
cmpl $(8 / CHAR_SIZE), %edx
jae L(copy_8_15)
# ifdef USE_AS_WCSCPY
testl %ecx, %ecx
jz L(zfill_less_8_set_ret)
movl (%rsi, %rdx, CHAR_SIZE), %esi
vmovd %VMM_128(0), (%rdi)
movl %esi, (%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
cmpl %ecx, %edx
L(ret_8_15):
adcq $0, %rdx
leaq (%rdi, %rdx, CHAR_SIZE), %rax
# endif
ret
L(zfill_less_8_set_ret):
xorl %ecx, %ecx
# ifdef USE_AS_STPCPY
movq %rdi, %rax
# endif
L(zfill_less_8):
movl %ecx, (%rdi)
movl %ecx, (%rdi, %rdx, CHAR_SIZE)
ret
# else
cmpl $3, %edx
jb L(copy_0_3)
/* Overfill to avoid branches. */
movl -3(%rsi, %rdx), %esi
vmovd %VMM_128(0), (%rdi)
movl %esi, -3(%rdi, %rdx)
cmpl %ecx, %edx
jbe L(ret_4_7)
subq %rcx, %rdx
addq %rcx, %rdi
# ifdef USE_AS_STPCPY
movq %rdi, %rax
# endif
xorl %ecx, %ecx
.p2align 4,, 8
L(zfill_less_8):
cmpl $3, %edx
jb L(zfill_less_3)
movl %ecx, (%rdi)
movl %ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
ret
# endif
L(ret_4_7):
# ifdef USE_AS_STPCPY
L(ret_8_15):
movl %edx, %eax
adcq %rdi, %rax
# endif
ret
.p2align 4,, 4
L(zfill_less_3):
testl %edx, %edx
jz L(zfill_1)
movw %cx, (%rdi)
L(zfill_1):
movb %cl, (%rdi, %rdx)
ret
.p2align 4,, 8
L(copy_0_3):
vmovd %VMM_128(0), %r8d
testl %edx, %edx
jz L(copy_1)
movw %r8w, (%rdi)
cmpl %ecx, %edx
ja L(zfill_from_1)
movzbl (%rsi, %rdx), %r8d
# ifdef USE_AS_STPCPY
movl %edx, %eax
adcq %rdi, %rax
movb %r8b, (%rdi, %rdx)
ret
# endif
L(copy_1):
# ifdef USE_AS_STPCPY
movl %edx, %eax
cmpl %ecx, %edx
adcq %rdi, %rax
# endif
# ifdef USE_AS_WCSCPY
vmovd %VMM_128(0), (%rdi)
# else
movb %r8b, (%rdi, %rdx)
# endif
ret
# endif
# ifndef USE_AS_WCSCPY
.p2align 4,, 8
L(zfill_from_1):
# ifdef USE_AS_STPCPY
leaq (%rdi, %rcx), %rax
# endif
movw $0, -1(%rdi, %rdx)
ret
# endif
.p2align 4,, 4
L(zero_len):
incq %rdx
jne L(best_effort_strncpy)
movq %rdi, %rax
ret
# endif
.p2align 4,, 4
.p2align 6,, 8
L(page_cross):
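/* The next unaligned load would cross a page.  Do an aligned load of
the VEC containing the start of src and shift the zero-CHAR mask so
that bit 0 corresponds to the first CHAR of src. */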
movq %rsi, %rax
andq $(VEC_SIZE * -1), %rax
VPCMPEQ (%rax), %VZERO, %k0
KMOV %k0, %VRCX
# ifdef USE_AS_WCSCPY
movl %esi, %r8d
shrl $2, %r8d
andl $(CHAR_PER_VEC - 1), %r8d
shrx %VR8, %VRCX, %VRCX
# else
shrx %VRSI, %VRCX, %VRCX
# endif
/* Compute the number of bytes we checked. */
subl %esi, %eax
andl $(VEC_SIZE - 1), %eax
# ifdef USE_AS_WCSCPY
shrl $2, %eax
# endif
/* If rdx < rax then the copy finishes within the region we
already checked before the page boundary. */
cmpq %rax, %rdx
jb L(page_cross_small)
/* If rcx is zero (no zero CHAR in the checked region) then
continue on the main path. */
test %VRCX, %VRCX
jz L(page_cross_continue)
/* We found a zero CHAR, so we need to copy then zfill (we know
we didn't cover all of the length here). */
bsf %VRCX, %VRCX
L(movsb_and_zfill):
incl %ecx
subq %rcx, %rdx
# ifdef USE_AS_STPCPY
leaq -CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
# else
movq %rdi, %rax
# endif
REP_MOVS
# ifdef USE_AS_WCSCPY
movl $0, (%rdi)
# else
movb $0, (%rdi)
# endif
jmp L(zfill_from_page_cross)
L(page_cross_small):
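/* The copy ends within the bytes already checked.  If the null CHAR
comes before the end of the buffer, zero-fill the tail with REP_STOS
first, then copy the string with REP_MOVS. */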
tzcnt %VRCX, %VRCX
cmpl %ecx, %edx
jbe L(page_cross_copy_only)
/* Do a zfill of the tail before copying. */
movq %rdi, %r9
xorl %eax, %eax
movl %ecx, %r8d
subl %ecx, %edx
leaq CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
movl %edx, %ecx
REP_STOS
movq %r9, %rdi
movl %r8d, %edx
L(page_cross_copy_only):
leal 1(%rdx), %ecx
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
adcl $0, %edx
leaq (%rdi, %rdx, CHAR_SIZE), %rax
# else
movl %edx, %eax
adcq %rdi, %rax
# endif
# else
movq %rdi, %rax
# endif
REP_MOVS
ret
L(best_effort_strncpy):
movq %rdx, %rcx
xorl %eax, %eax
movq %rdi, %r8
/* The length is >= 2^63.  We very much expect to segfault at
rep stos.  If that doesn't happen then just strcpy to finish.
*/
REP_STOS
movq %r8, %rdi
jmp OVERFLOW_STRCPY
END(STRNCPY)
#endif