glibc/sysdeps/x86_64/multiarch/strncat-evex.S

/* {wcs|str}ncat with 256/512-bit EVEX.
Copyright (C) 2022-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <isa-level.h>
#if ISA_SHOULD_BUILD (4)
/* Use evex-masked stores for small sizes. Turned off at the
moment. */
# define USE_EVEX_MASKED_STORE 0
# include <sysdep.h>
# ifndef VEC_SIZE
# include "x86-evex256-vecs.h"
# endif
# ifndef STRNCAT
# define STRNCAT __strncat_evex
# endif
# ifdef USE_AS_WCSCPY
# define MOVCHAR movl
# define VMOVU_MASK vmovdqu32
# define VPMIN vpminud
# define VPTESTN vptestnmd
# define VPTEST vptestmd
# define VPCMPEQ vpcmpeqd
# define CHAR_SIZE 4
# define REP_MOVS rep movsd
# define VMASK_REG VR10
# define FIND_FIRST_ONE(src, dst) movl $CHAR_PER_VEC, %dst; bsf %src, %dst
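/* For wcsncat the zero-CHAR mask has only CHAR_PER_VEC (not 32/64)
meaningful bits, so tzcnt of an empty mask would not yield
CHAR_PER_VEC. Instead preload the destination with CHAR_PER_VEC
and use bsf, relying on bsf leaving its destination unchanged when
the source is zero. */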
# define USE_WIDE_CHAR
# else
# define MOVCHAR movb
# define VMOVU_MASK vmovdqu8
# define VPMIN vpminub
# define VPTESTN vptestnmb
# define VPTEST vptestmb
# define VPCMPEQ vpcmpeqb
# define CHAR_SIZE 1
# define REP_MOVS rep movsb
# define VMASK_REG VRCX
# define FIND_FIRST_ONE(src, dst) tzcnt %src, %dst
# endif
# include "strncpy-or-cat-overflow-def.h"
# include "reg-macros.h"
# define VZERO VMM(7)
# define VZERO_128 VMM_128(7)
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section SECTION(.text), "ax", @progbits
ENTRY(STRNCAT)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
# endif
movq %rdi, %rax
/* NB: It's safe to filter out zero-length strings WITHOUT writing
a null terminator. The destination MUST already be a
null-terminated string, so there is nothing left to do. */
# ifdef USE_AS_WCSCPY
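/* rdx - 1 has a non-zero high byte both when the length is zero
(it wraps around) and when the length is huge (> 2^56), so a
single shift filters out the nothing-to-do case as well as lengths
for which rsi + rdx * CHAR_SIZE could overflow. L(zero_len)
re-tests rdx to tell the two cases apart. */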
leaq -1(%rdx), %rcx
shrq $56, %rcx
jnz L(zero_len)
# else
test %rdx, %rdx
jle L(zero_len)
# endif
# include "strcat-strlen-evex.h.S"
movl %esi, %ecx
andl $(PAGE_SIZE - 1), %ecx
cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
ja L(page_cross)
L(page_cross_continue):
VMOVU (%rsi), %VMM(0)
VPTESTN %VMM(0), %VMM(0), %k0
/* If USE_EVEX_MASKED_STORE is enabled then we just handle lengths
<= CHAR_PER_VEC with masked instructions (which have the potential
for dramatically bad performance if dst splits a page and is not
in the TLB). */
# if USE_EVEX_MASKED_STORE
KMOV %k0, %VRCX
FIND_FIRST_ONE (VRCX, VR8)
cmpq %r8, %rdx
jbe L(less_1x_vec)
test %VRCX, %VRCX
jz L(more_1x_vec)
blsmsk %VRCX, %VRCX
KMOV %VRCX, %k1
VMOVU_MASK %VMM(0), (%rdi){%k1}
ret
L(less_1x_vec):
mov $-1, %VRCX
bzhi %VRDX, %VRCX, %VRCX
KMOV %VRCX, %k1
MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
VMOVU_MASK %VMM(0), (%rdi){%k1}
ret
# else
KMOV %k0, %VMASK_REG
/* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
%VMASK_REG, %VRCX` for wcsncat. */
FIND_FIRST_ONE (VMASK_REG, VRCX)
cmpq %rcx, %rdx
jbe L(less_1x_vec)
/* If there were no zero-CHARs (the mask in VMASK_REG was zero
before FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. */
cmpl $CHAR_PER_VEC, %ecx
je L(more_1x_vec)
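/* There is a zero-CHAR before the length limit: shrink the copy
length to its index so the bounded copy below stops there and
writes the null terminator itself. */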
movl %ecx, %edx
L(less_1x_vec):
# if VEC_SIZE == 64
cmpl $(32 / CHAR_SIZE), %edx
jae L(copy_32_63)
# endif
cmpl $(16 / CHAR_SIZE), %edx
jae L(copy_16_31)
cmpl $(8 / CHAR_SIZE), %edx
jae L(copy_8_15)
# ifdef USE_AS_WCSCPY
vmovd %VMM_128(0), (%rdi)
MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
ret
# else
cmpl $4, %edx
jae L(copy_4_7)
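/* 1-3 bytes remain. Copy the first byte and, for lengths 2-3,
bytes 1-2 as a word; together with the null store below this
writes exactly rdx + 1 bytes of dst. */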
movzbl (%rsi), %ecx
cmpl $1, %edx
jbe L(set_null_term)
movzwl 1(%rsi), %esi
movw %si, 1(%rdi)
.p2align 4,, 1
L(set_null_term):
movb %cl, (%rdi)
MOVCHAR $0, (%rdi, %rdx)
ret
# endif
# if VEC_SIZE == 64
.p2align 4,, 6
L(copy_32_63):
VMOVU -(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
VMOVU %VMM_256(0), (%rdi)
VMOVU %VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
ret
# endif
.p2align 4,, 6
L(copy_16_31):
/* Use xmm1 explicitly here as it won't require a `vzeroupper`
and will save code size. */
vmovdqu -(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
VMOVU %VMM_128(0), (%rdi)
vmovdqu %xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
ret
.p2align 4,, 2
L(copy_8_15):
movq -(8)(%rsi, %rdx, CHAR_SIZE), %rcx
vmovq %VMM_128(0), (%rdi)
movq %rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
ret
# ifndef USE_AS_WCSCPY
.p2align 4,, 12
L(copy_4_7):
movl -(4)(%rsi, %rdx, CHAR_SIZE), %ecx
vmovd %VMM_128(0), (%rdi)
movl %ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
ret
# endif
# endif
.p2align 4,, 4
L(zero_len):
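/* ZF is set here iff the length was exactly zero: for strncat the
flags of the `test %rdx, %rdx` at entry are still live, while
wcsncat must re-test because the shift above clobbered them.
Impossibly large lengths are sent to the overflow handler. */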
# ifdef USE_AS_WCSCPY
test %rdx, %rdx
# endif
jne OVERFLOW_STRCAT
ret
.p2align 4,, 8
L(more_1x_vec):
VMOVU %VMM(0), (%rdi)
/* We are going to align rsi here, so we need to be able to
re-adjust rdi/rdx afterwards. NB: We filtered out huge lengths, so
rsi + rdx * CHAR_SIZE cannot overflow. */
leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
subq %rsi, %rdi
andq $-(VEC_SIZE), %rsi
L(loop_last_4x_vec):
addq %rsi, %rdi
subq %rsi, %rdx
# ifdef USE_AS_WCSCPY
shrq $2, %rdx
# endif
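/* Invariant for this label: rdx is the number of CHARs that may
still be stored starting at (VEC_SIZE * 1)(%rdi) (equivalently,
read from (VEC_SIZE * 1)(%rsi)) before the length limit is hit.
This holds both on the fall-through from above and when the main
loop jumps back here. */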
/* Will need this regardless. */
VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1)
VPTESTN %VMM(1), %VMM(1), %k0
KMOV %k0, %VMASK_REG
cmpq $(CHAR_PER_VEC * 2), %rdx
ja L(more_2x_vec)
L(last_2x_vec):
FIND_FIRST_ONE (VMASK_REG, VRCX)
cmpl %ecx, %edx
jbe L(ret_vec_x1_len)
/* If there were no zero-CHARs (the mask in VMASK_REG was zero
before FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. */
cmpl $CHAR_PER_VEC, %ecx
jne L(ret_vec_x1)
VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
VPTESTN %VMM(2), %VMM(2), %k0
KMOV %k0, %VRCX
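/* Adjust rdx to count the CHARs remaining from
(VEC_SIZE * 2)(%rsi). bzhi then drops zero-CHAR bits at or beyond
that limit: if none are left the copy is purely length-bounded,
otherwise rdx is replaced with the index of the zero-CHAR. Either
way the code below copies the vector ending just before CHAR rdx
and stores the null terminator explicitly. */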
addl $-CHAR_PER_VEC, %edx
bzhi %VRDX, %VRCX, %VR8
jz L(ret_vec_x2_len)
L(ret_vec_x2):
bsf %VRCX, %VRDX
L(ret_vec_x2_len):
VMOVU (VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
MOVCHAR $0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
VMOVU %VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
ret
.p2align 4,, 4
L(ret_vec_x1_len):
movl %edx, %ecx
L(ret_vec_x1):
VMOVU (VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
MOVCHAR $0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
VMOVU %VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
VZEROUPPER_RETURN
.p2align 4,, 8
L(last_4x_vec):
addl $-(CHAR_PER_VEC * 4), %edx
VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1)
VPTESTN %VMM(1), %VMM(1), %k0
KMOV %k0, %VMASK_REG
subq $-(VEC_SIZE * 4), %rsi
subq $-(VEC_SIZE * 4), %rdi
cmpl $(CHAR_PER_VEC * 2), %edx
jbe L(last_2x_vec)
.p2align 4,, 8
L(more_2x_vec):
# ifdef USE_AS_WCSCPY
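/* bsf leaves its destination unchanged when the source is zero,
so clearing ecx first presumably just breaks the resulting output
dependency on whatever rcx held before. */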
xorl %ecx, %ecx
# endif
bsf %VMASK_REG, %VRCX
jnz L(ret_vec_x1)
VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
VPTESTN %VMM(2), %VMM(2), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x2)
VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3)
VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi)
VPTESTN %VMM(3), %VMM(3), %k0
KMOV %k0, %VMASK_REG
cmpq $(CHAR_PER_VEC * 4), %rdx
ja L(more_4x_vec)
/* Adjust length before going to L(ret_vec_x3_len) or
L(ret_vec_x3). */
addl $(CHAR_PER_VEC * -2), %edx
FIND_FIRST_ONE (VMASK_REG, VRCX)
cmpl %ecx, %edx
jbe L(ret_vec_x3_len)
/* If there were no zero-CHARs (the mask in VMASK_REG was zero
before FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. */
cmpl $CHAR_PER_VEC, %ecx
jne L(ret_vec_x3)
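/* No zero-CHAR in the third vector and more than CHAR_PER_VEC
CHARs remain: handle the fourth vector the same way the second was
handled at L(ret_vec_x2) above. */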
VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
VPTESTN %VMM(4), %VMM(4), %k0
KMOV %k0, %VRCX
addl $-CHAR_PER_VEC, %edx
bzhi %VRDX, %VRCX, %VR8
jz L(ret_vec_x4_len)
L(ret_vec_x4):
bsf %VRCX, %VRDX
L(ret_vec_x4_len):
VMOVU (VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
MOVCHAR $0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
VMOVU %VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
ret
.p2align 4,, 4
L(ret_vec_x3_len):
movl %edx, %ecx
L(ret_vec_x3):
VMOVU (VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
MOVCHAR $0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
VMOVU %VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
ret
.p2align 4,, 8
L(more_4x_vec):
# ifdef USE_AS_WCSCPY
xorl %ecx, %ecx
# endif
bsf %VMASK_REG, %VRCX
jnz L(ret_vec_x3)
VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
VPTESTN %VMM(4), %VMM(4), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x4)
VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi)
/* Check if we are near the end before aligning. */
cmpq $(CHAR_PER_VEC * 8), %rdx
jbe L(last_4x_vec)
/* Add rsi to rdx (length) before aligning rsi. NB: Since we
filtered out huge lengths this cannot overflow. */
# ifdef USE_AS_WCSCPY
leaq (%rsi, %rdx, CHAR_SIZE), %rdx
# else
addq %rsi, %rdx
# endif
/* Subtract rsi from rdi before aligning (adding the aligned rsi
back then yields the rdi that corresponds to it). */
subq %rsi, %rdi
subq $-(VEC_SIZE * 5), %rsi
andq $(VEC_SIZE * -4), %rsi
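/* rsi is now the largest (VEC_SIZE * 4)-aligned address that does
not go past the end of the data already copied, so the aligned
loads below resume where the stores above left off (possibly
re-reading a few already-copied CHARs). */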
/* Load first half of the loop before entry. */
VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
VPMIN %VMM(0), %VMM(1), %VMM(4)
VPMIN %VMM(2), %VMM(3), %VMM(6)
VPTESTN %VMM(4), %VMM(4), %k2
VPTESTN %VMM(6), %VMM(6), %k4
/* Offset rsi by VEC_SIZE so that we can jump to
L(loop_last_4x_vec). */
addq $-(VEC_SIZE), %rsi
KORTEST %k2, %k4
jnz L(loop_4x_done)
/* Store loop end in r9. */
leaq -(VEC_SIZE * 5)(%rdx), %r9
.p2align 4,, 11
L(loop_4x_vec):
VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
subq $(VEC_SIZE * -4), %rsi
cmpq %rsi, %r9
jbe L(loop_last_4x_vec)
VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
VPMIN %VMM(0), %VMM(1), %VMM(4)
VPMIN %VMM(2), %VMM(3), %VMM(6)
VPTESTN %VMM(4), %VMM(4), %k2
VPTESTN %VMM(6), %VMM(6), %k4
KORTEST %k2, %k4
jz L(loop_4x_vec)
L(loop_4x_done):
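/* A zero-CHAR was found somewhere in VMM(0-3). k2 covers
VMM(0)/VMM(1) and k4 covers VMM(2)/VMM(3) (they test the VPMIN of
each pair), so re-test VMM(0) and VMM(2) individually to find the
vector that actually contains it, storing the preceding full
vectors along the way. */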
VPTESTN %VMM(0), %VMM(0), %k0
KMOV %k0, %VRCX
/* Restore rdi (dst). */
addq %rsi, %rdi
/* L(ret_vec_x1) expects rcx to hold the position of the
zero-CHAR, so test with bsf: it sets ZF for the branch and leaves
the index in rcx. */
bsf %VRCX, %VRCX
jnz L(ret_vec_x1)
VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi)
KMOV %k2, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x2)
VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
VPTESTN %VMM(2), %VMM(2), %k0
KMOV %k0, %VRCX
bsf %VRCX, %VRCX
jnz L(ret_vec_x3)
VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
KMOV %k4, %VRCX
bsf %VRCX, %VRCX
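/* The zero-CHAR is in VMM(3). The load/store below are offset so
that the copied vector ends exactly at the zero-CHAR, so the null
terminator is transferred as part of the data and no separate
store is needed. */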
VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
VMOVU %VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
ret
.p2align 4,, 4
L(page_cross):
movq %rsi, %r8
andq $(VEC_SIZE * -1), %r8
VPCMPEQ (%r8), %VZERO, %k0
# ifdef USE_AS_WCSCPY
KMOV %k0, %VR9
shrl $2, %ecx
andl $(CHAR_PER_VEC - 1), %ecx
shrx %VRCX, %VR9, %VRCX
# else
KMOV %k0, %VRCX
shrx %VRSI, %VRCX, %VRCX
# endif
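/* rcx now holds the zero-CHAR mask of the VEC_SIZE-aligned vector
containing rsi, shifted right by rsi's misalignment so that bit 0
corresponds to the CHAR at rsi (shrx masks its count to the
operand width, so rsi/ecx supply the in-vector offset directly). */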
subl %esi, %r8d
andl $(VEC_SIZE - 1), %r8d
# ifdef USE_AS_WCSCPY
shrl $2, %r8d
# endif
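/* r8 is the number of CHARs from rsi to the end of the page
(which, PAGE_SIZE being a multiple of VEC_SIZE, is also the next
VEC_SIZE boundary). If the length limit does not reach past it,
the whole job can be finished by the small path below. */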
cmpq %r8, %rdx
jbe L(page_cross_small)
/* Optimizing more for space as this is very cold code. This
saves 2x cache lines. */
/* The shift adds one to the later bsf result so that the copy
includes the zero-CHAR itself. If the mask is zero there is no
zero-CHAR before the page boundary, so the unaligned read at
L(page_cross_continue) is safe and we take the normal path
instead. NB: the shift can never zero out a non-zero rcx: in the
page-cross case rsi cannot be VEC_SIZE-aligned and rcx has already
been shifted right by that misalignment, so its top bit is
clear. */
shl %VRCX
jz L(page_cross_continue)
bsf %VRCX, %VRCX
REP_MOVS
ret
L(page_cross_small):
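/* Everything we may copy lies before the page boundary. tzcnt
sets ZF only when its result is zero, i.e. when the very first
CHAR at rsi is the null terminator; in that case nothing is copied
and dst is simply terminated. Otherwise copy min(zero-CHAR index,
length) CHARs and write the terminator. */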
tzcnt %VRCX, %VRCX
jz L(page_cross_setz)
cmpl %edx, %ecx
cmova %edx, %ecx
# ifdef USE_AS_WCSCPY
rep movsd
# else
rep movsb
# endif
L(page_cross_setz):
MOVCHAR $0, (%rdi)
ret
END(STRNCAT)
#endif