/* File: glibc/sysdeps/x86_64/multiarch/strcpy-evex.S
(x86-64 assembly, GNU as AT&T syntax, run through the C preprocessor). */

/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
Copyright (C) 2021-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <isa-level.h>
#if ISA_SHOULD_BUILD (4)
/* Build-time configuration and per-variant macro selection for the
EVEX {wcs|wcp|str|stp}cpy family implemented below. */
/* Use evex-masked stores for small sizes. Turned off at the
moment. */
# define USE_EVEX_MASKED_STORE 0
/* Use movsb in page cross case to save code size. */
# define USE_MOVSB_IN_PAGE_CROSS 1
# include <sysdep.h>
/* Fall back to the 256-bit vector configuration unless the including
file already chose a VEC_SIZE. NOTE(review): x86-evex256-vecs.h is
presumably what provides VEC_SIZE, VMM(), VMOVU/VMOVA and SECTION()
-- confirm against that header. */
# ifndef VEC_SIZE
# include "x86-evex256-vecs.h"
# endif
/* Default symbol name; a wrapper may define STRCPY before including
this file. */
# ifndef STRCPY
# define STRCPY __strcpy_evex
# endif
/* Select element-width specific instructions: dword elements
(CHAR_SIZE 4) for the wide-character variant, byte elements
(CHAR_SIZE 1) otherwise. */
# ifdef USE_AS_WCSCPY
# define VMOVU_MASK vmovdqu32
# define VPMIN vpminud
# define VPTESTN vptestnmd
# define VPTEST vptestmd
# define VPCMPEQ vpcmpeqd
# define CHAR_SIZE 4
# define REP_MOVS rep movsd
# define USE_WIDE_CHAR
# else
# define VMOVU_MASK vmovdqu8
# define VPMIN vpminub
# define VPTESTN vptestnmb
# define VPTEST vptestmb
# define VPCMPEQ vpcmpeqb
# define CHAR_SIZE 1
# define REP_MOVS rep movsb
# endif
/* NOTE(review): reg-macros.h presumably supplies the VRCX/VRDX/VGPR
register-width helpers and the KMOV/KXOR/KORTEST aliases used below
-- confirm against that header. */
# include "reg-macros.h"
/* END_REG is the address expression, used as `(%END_REG)`, of the
null terminator in the destination. For stpcpy rax is loaded with
that address before END_REG is used; for the other variants the
address is formed as rdi + rdx * CHAR_SIZE, with rdx holding the
index of the terminator. */
# ifdef USE_AS_STPCPY
# define END_REG rax
# else
# define END_REG rdi, %rdx, CHAR_SIZE
# endif
/* Scratch register that receives the source pointer's offset within
its page. The strcat variant uses edx/rdx because it stores the
return value in rax at function entry, before this register is
written. */
# ifdef USE_AS_STRCAT
# define PAGE_ALIGN_REG edx
# define PAGE_ALIGN_REG_64 rdx
# else
# define PAGE_ALIGN_REG eax
# define PAGE_ALIGN_REG_64 rax
# endif
/* Vector register used as the all-zero comparand in L(page_cross). */
# define VZERO VMM(7)
# define VZERO_128 VMM_128(7)
# define PAGE_SIZE 4096
/* Number of characters held by one vector. */
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
/* STRCPY entry point. The source string is read through rsi and the
destination is written through rdi. The strcpy/wcscpy and strcat
variants return the original destination pointer in rax; the stpcpy
variant returns the address of the terminator written to the
destination. */
.section SECTION(.text), "ax", @progbits
ENTRY(STRCPY)
# ifdef USE_AS_STRCAT
/* Save dst as the return value before the included code runs.
NOTE(review): strcat-strlen-evex.h.S presumably advances rdi to the
end of the existing destination string -- confirm in that file. */
movq %rdi, %rax
# include "strcat-strlen-evex.h.S"
# endif
/* If src's offset within its page is greater than
PAGE_SIZE - VEC_SIZE, a full-vector load starting at src would
extend into the next page, so take the careful page-cross path. */
movl %esi, %PAGE_ALIGN_REG
andl $(PAGE_SIZE - 1), %PAGE_ALIGN_REG
cmpl $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
ja L(page_cross)
/* Short-string path. Loads the first VEC_SIZE bytes of src in one
unaligned load and, if they contain the terminator, finishes the
copy with at most two (possibly overlapping) stores. Jumps to
L(more_1x_vec) when the first vector holds no terminator. */
L(page_cross_continue):
VMOVU (%rsi), %VMM(0)
# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
/* strcpy/wcscpy return the destination pointer. */
movq %rdi, %rax
# endif
/* Two short string implementations. One with traditional
branching approach and one with masked instructions (which
have potential for dramatically bad perf if dst splits a
page and is not in the TLB). */
# if USE_EVEX_MASKED_STORE
/* k0 gets one bit set for every non-zero character of VMM(0). */
VPTEST %VMM(0), %VMM(0), %k0
KMOV %k0, %VRCX
/* Adjust the mask so the result is zero exactly when every character
was non-zero, i.e. there is no terminator in this vector. */
# ifdef USE_AS_WCSCPY
subl $((1 << CHAR_PER_VEC)- 1), %VRCX
# else
inc %VRCX
# endif
jz L(more_1x_vec)
/* k1 = k0 ^ adjusted mask. It is used as the write mask so that only
the characters up to and including the terminator are stored. */
KMOV %VRCX, %k1
KXOR %k0, %k1, %k1
VMOVU_MASK %VMM(0), (%rdi){%k1}
# ifdef USE_AS_STPCPY
/* The lowest set bit of the adjusted mask is the terminator index;
return its address in dst. */
bsf %VRCX, %VRCX
leaq (%rdi, %rcx, CHAR_SIZE), %rax
# endif
ret
# else
/* k0 gets one bit set for every zero character of VMM(0). */
VPTESTN %VMM(0), %VMM(0), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jz L(more_1x_vec)
/* rdx = index of the first zero character (the terminator). */
xorl %edx, %edx
bsf %VRCX, %VRDX
# ifdef USE_AS_STPCPY
/* stpcpy returns the address of the terminator in dst; END_REG also
refers to rax in this variant. */
leaq (%rdi, %rdx, CHAR_SIZE), %rax
# endif
/* Use mask bits in rcx to detect which copy we need. If the low
mask is zero then there must be a bit set in the upper half.
I.e if rcx != 0 and ecx == 0, then match must be upper 32
bits so we use L(copy_32_63). The wide-character tests cover the
same byte ranges with one mask bit per 4-byte character. */
# if VEC_SIZE == 64
# ifdef USE_AS_WCSCPY
testb %cl, %cl
# else
testl %ecx, %ecx
# endif
jz L(copy_32_63)
# endif
/* No terminator within the first 16 bytes? */
# ifdef USE_AS_WCSCPY
testb $0xf, %cl
# else
testw %cx, %cx
# endif
jz L(copy_16_31)
/* No terminator within the first 8 bytes? */
# ifdef USE_AS_WCSCPY
testb $0x3, %cl
# else
testb %cl, %cl
# endif
jz L(copy_8_15)
# ifdef USE_AS_WCSCPY
/* Terminator is character 0 or 1: store the first character, then
the terminator itself. */
vmovd %VMM_128(0), (%rdi)
/* No need to copy, we know it's zero. */
movl $0, (%END_REG)
ret
# else
/* Terminator index is at least 3 when the low three mask bits are
clear, so at least 4 bytes (including the terminator) get copied. */
testb $0x7, %cl
jz L(copy_4_7)
/* Terminator index is 0, 1 or 2. For index 0 only the terminator is
written. */
test %edx, %edx
jz L(set_null_term)
/* Store the first two bytes; the terminator is written below.
NB: make this `vmovw` if support for AVX512-FP16 is added.
*/
vmovd %VMM_128(0), %esi
movw %si, (%rdi)
.p2align 4,, 1
L(set_null_term):
/* No need to copy, we know it's zero. */
movb $0, (%END_REG)
ret
# endif
# if VEC_SIZE == 64
/* Copy the first 32 bytes from VMM(0) plus the 32 source bytes that
end with the terminator; the two stores may overlap. */
.p2align 4,, 6
L(copy_32_63):
VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
VMOVU %VMM_256(0), (%rdi)
VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
ret
# endif
/* Same head/tail scheme with 16-byte accesses. */
.p2align 4,, 6
L(copy_16_31):
/* Use xmm1 explicitly here as it won't require a `vzeroupper`
and will save code size. */
vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
VMOVU %VMM_128(0), (%rdi)
vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
ret
/* Same head/tail scheme with 8-byte stores. */
.p2align 4,, 8
L(copy_8_15):
# ifdef USE_AS_WCSCPY
/* Load only the character preceding the terminator. The 32-bit load
leaves the upper half of rcx zero, so the 8-byte store below writes
that character followed by the terminator. */
movl -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
# else
/* Load the 8 source bytes that end with the terminator. */
movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
# endif
vmovq %VMM_128(0), (%rdi)
movq %rcx, -(8 - CHAR_SIZE)(%END_REG)
ret
# endif
# ifndef USE_AS_WCSCPY
/* Byte strings only: copy the first 4 bytes from VMM(0) plus the 4
source bytes that end with the terminator. */
.p2align 4,, 12
L(copy_4_7):
movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
vmovd %VMM_128(0), (%rdi)
movl %ecx, -(4 - CHAR_SIZE)(%END_REG)
ret
# endif
/* No terminator in the first vector. Copy up to four further vectors
using aligned loads from src, checking each one for a terminator,
then set up and enter the 4x-unrolled loop. On a hit, control goes to
the matching L(ret_vec_xN) with the zero-character mask in rcx (rdx
for L(ret_vec_x3)). */
.p2align 4,, 8
L(more_1x_vec):
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
/* Store the first vector now, before rdi is modified below.
NOTE(review): presumably needed because rax does not hold the current
dst in these variants -- confirm. */
VMOVU %VMM(0), (%rdi)
# endif
/* Round src down to a VEC_SIZE boundary and move dst back by the
same amount, so equal offsets from rsi and rdi still refer to
corresponding source and destination bytes. */
subq %rsi, %rdi
andq $-(VEC_SIZE), %rsi
addq %rsi, %rdi
VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1)
/* Ideally we store after moves to minimize impact of potential
false-dependencies. */
# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
/* rax still holds the original dst here. */
VMOVU %VMM(0), (%rax)
# endif
VPTESTN %VMM(1), %VMM(1), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x1)
/* Each step loads the next vector before storing the previous one,
which is known to contain no terminator. */
VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
VMOVU %VMM(1), VEC_SIZE(%rdi)
VPTESTN %VMM(2), %VMM(2), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x2)
VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3)
VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi)
/* L(ret_vec_x3) expects its mask in rdx rather than rcx. */
VPTESTN %VMM(3), %VMM(3), %k0
KMOV %k0, %VRDX
test %VRDX, %VRDX
jnz L(ret_vec_x3)
VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
VPTESTN %VMM(4), %VMM(4), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x4)
VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi)
/* Align for 4x loop. rdi becomes (dst - src) so that the loop can
address the destination as (%rdi, %rsi). */
subq %rsi, %rdi
/* + VEC_SIZE * 5 because we never added the original VEC_SIZE
we covered before aligning. */
subq $-(VEC_SIZE * 5), %rsi
andq $-(VEC_SIZE * 4), %rsi
/* Load first half of the loop before entry. */
VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
/* The unsigned minimum of two vectors has a zero element wherever
either input has one, so k2 flags zeros in VMM(0)/VMM(1) and k4 flags
zeros in VMM(2)/VMM(3). */
VPMIN %VMM(0), %VMM(1), %VMM(4)
VPMIN %VMM(2), %VMM(3), %VMM(6)
VPTESTN %VMM(4), %VMM(4), %k2
VPTESTN %VMM(6), %VMM(6), %k4
/* Any set bit means a terminator is within these four vectors, so
skip the loop body and finish the copy. */
KORTEST %k2, %k4
jnz L(loop_4x_done)
/* Main loop: four vectors per iteration. On entry to each iteration
VMM(0)-VMM(3) hold four source vectors already known to contain no
terminator, rsi is the aligned source position they were loaded from
and rdi holds (dst - src). */
.p2align 4,, 11
L(loop_4x_vec):
VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi)
VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
/* Advance src by four vectors (subtracting the negated amount). */
subq $(VEC_SIZE * -4), %rsi
VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
/* k2 / k4 flag zero characters in the first / second pair of
vectors (via the element-wise unsigned minimum). */
VPMIN %VMM(0), %VMM(1), %VMM(4)
VPMIN %VMM(2), %VMM(3), %VMM(6)
VPTESTN %VMM(4), %VMM(4), %k2
VPTESTN %VMM(6), %VMM(6), %k4
KORTEST %k2, %k4
jz L(loop_4x_vec)
/* A terminator is somewhere in VMM(0)-VMM(3), none of which has been
stored yet. Find the first vector containing it, storing the full
vectors that precede it. */
L(loop_4x_done):
VPTESTN %VMM(0), %VMM(0), %k0
KMOV %k0, %VRCX
/* Turn rdi from (dst - src) back into an absolute destination
pointer matching rsi. */
addq %rsi, %rdi
test %VRCX, %VRCX
jnz L(ret_vec_x0_end)
VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
/* VMM(0) has no zero character, so k2 now describes VMM(1) alone. */
KMOV %k2, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x1)
VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
VPTESTN %VMM(2), %VMM(2), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x2)
VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
/* VMM(2) has no zero character, so k4 describes VMM(3) alone and
must be non-zero. Fall through into L(ret_vec_x3), which is placed
here to save code size. We get a meaningful benefit doing this for
stpcpy. */
KMOV %k4, %VRDX
/* Terminator is in the vector at offset VEC_SIZE * 3, with its
zero-character mask in rdx. Copy the VEC_SIZE bytes that end with
the terminator; this overlaps data that has already been stored. */
L(ret_vec_x3):
bsf %VRDX, %VRDX
VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
VMOVU %VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
/* Return the address of the terminator in dst. */
leaq (VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax
# endif
/* NOTE(review): L(return_end) is not referenced in the visible part
of this file. */
L(return_end):
ret
/* Tail handlers. Each one receives the zero-character mask of the
vector that holds the terminator in rcx, with rsi/rdi as the matching
source/destination base, and copies the VEC_SIZE bytes that end with
the terminator. That final store overlaps bytes stored earlier. */
/* Terminator is in the vector at offset 0 of the current loop
position (reached from L(loop_4x_done) only). */
.p2align 4,, 6
L(ret_vec_x0_end):
bsf %VRCX, %VRCX
# ifdef USE_AS_STPCPY
/* Return the address of the terminator in dst. */
leaq (%rdi, %rcx, CHAR_SIZE), %rax
# endif
/* rcx = number of characters up to and including the terminator, so
the access below starts VEC_SIZE bytes before the end of the
terminator and reaches back before rsi/rdi into data copied
earlier. */
inc %VRCX
VMOVU (-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
VMOVU %VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
ret
/* Terminator is in the vector at offset VEC_SIZE. */
.p2align 4,, 8
L(ret_vec_x1):
bsf %VRCX, %VRCX
VMOVU (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
VMOVU %VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
# endif
ret
/* Terminator is in the vector at offset VEC_SIZE * 2. */
.p2align 4,, 4
L(ret_vec_x2):
bsf %VRCX, %VRCX
VMOVU ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
VMOVU %VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
ret
/* ret_vec_x3 reuses return code after the loop. */
/* Terminator is in the vector at offset VEC_SIZE * 4 (reached from
L(more_1x_vec) only). */
.p2align 4,, 6
L(ret_vec_x4):
bsf %VRCX, %VRCX
VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
ret
/* Page-cross path: a VEC_SIZE load starting at src would run into
the next page. Check the aligned vector that contains src for a
terminator first. If there is none at or after src, the string
continues into the next page and the normal path is resumed at
L(page_cross_continue); otherwise the short string is copied here. */
.p2align 4,, 4
L(page_cross):
# ifndef USE_AS_STRCAT
/* Zero the comparand. NOTE(review): the strcat variant skips this,
presumably because strcat-strlen-evex.h.S already left VZERO zeroed
-- confirm in that file. */
vpxorq %VZERO_128, %VZERO_128, %VZERO_128
# endif
/* rcx = src rounded down to a VEC_SIZE boundary; k0 gets one bit per
zero character of that aligned vector. */
movq %rsi, %rcx
andq $(VEC_SIZE * -1), %rcx
VPCMPEQ (%rcx), %VZERO, %k0
KMOV %k0, %VRCX
# ifdef USE_AS_WCSCPY
/* Convert the page offset into src's character offset inside the
aligned vector. */
andl $(VEC_SIZE - 1), %PAGE_ALIGN_REG
shrl $2, %PAGE_ALIGN_REG
# endif
/* Drop the mask bits of characters that precede src.
NOTE(review): the byte variant does not reduce the page offset to a
within-vector offset first; presumably it relies on shrx using only
the low bits of the shift count -- confirm. */
shrx %VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX
# if USE_MOVSB_IN_PAGE_CROSS
/* Optimizing more aggressively for space as this is very cold
code. This saves 2x cache lines. */
/* This adds one to the later result which will get correct
copy bounds. NB: this can never zero-out a non-zero RCX as
to be in the page cross case rsi cannot be aligned and we
already right-shift rcx by the misalignment. */
shl %VRCX
jz L(page_cross_continue)
# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
/* strcpy/wcscpy return the destination pointer. */
movq %rdi, %rax
# endif
/* After the shift above, the bit index equals the number of
characters to copy including the terminator; use it as the count for
the string move. */
bsf %VRCX, %VRCX
REP_MOVS
# ifdef USE_AS_STPCPY
/* The string move left rdi one character past the terminator. */
leaq -CHAR_SIZE(%rdi), %rax
# endif
ret
# else
/* Check if we found zero-char before end of page. */
test %VRCX, %VRCX
jz L(page_cross_continue)
/* Traditional copy case, essentially same as used in non-page-
cross case but since we can't reuse VMM(0) we need twice as
many loads from rsi. */
# ifndef USE_AS_STRCAT
xorl %edx, %edx
# endif
/* Dependency on rdi must already have been satisfied. */
/* rdx = index of the terminator relative to src. */
bsf %VRCX, %VRDX
# ifdef USE_AS_STPCPY
/* stpcpy returns the address of the terminator in dst. */
leaq (%rdi, %rdx, CHAR_SIZE), %rax
# elif !defined USE_AS_STRCAT
movq %rdi, %rax
# endif
/* Dispatch on the terminator position using the low mask bits, as in
the non-page-cross short-string path. */
# if VEC_SIZE == 64
# ifdef USE_AS_WCSCPY
testb %cl, %cl
# else
test %ecx, %ecx
# endif
jz L(page_cross_copy_32_63)
# endif
# ifdef USE_AS_WCSCPY
testb $0xf, %cl
# else
testw %cx, %cx
# endif
jz L(page_cross_copy_16_31)
# ifdef USE_AS_WCSCPY
testb $0x3, %cl
# else
testb %cl, %cl
# endif
jz L(page_cross_copy_8_15)
# ifdef USE_AS_WCSCPY
/* Terminator is character 0 or 1: copy the first character, then
write the terminator. */
movl (%rsi), %esi
movl %esi, (%rdi)
movl $0, (%END_REG)
ret
# else
testb $0x7, %cl
jz L(page_cross_copy_4_7)
/* Terminator index is 0, 1 or 2. For index 0 only the terminator is
written. */
test %edx, %edx
jz L(page_cross_set_null_term)
movzwl (%rsi), %ecx
movw %cx, (%rdi)
L(page_cross_set_null_term):
movb $0, (%END_REG)
ret
/* The copy helpers below load the head of the string and the bytes
that end with the terminator, then store both; the two stores may
overlap. */
.p2align 4,, 4
L(page_cross_copy_4_7):
movl (%rsi), %ecx
movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi
movl %ecx, (%rdi)
movl %esi, -(4 - CHAR_SIZE)(%END_REG)
ret
# endif
# if VEC_SIZE == 64
/* Head/tail copy with 32-byte accesses. */
.p2align 4,, 4
L(page_cross_copy_32_63):
VMOVU (%rsi), %VMM_256(0)
VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
VMOVU %VMM_256(0), (%rdi)
VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
ret
# endif
/* Head/tail copy with 16-byte accesses. */
.p2align 4,, 4
L(page_cross_copy_16_31):
vmovdqu (%rsi), %xmm0
vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
vmovdqu %xmm0, (%rdi)
vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
ret
/* Head/tail copy with 8-byte accesses. */
.p2align 4,, 4
L(page_cross_copy_8_15):
movq (%rsi), %rcx
movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
movq %rcx, (%rdi)
movq %rsi, -(8 - CHAR_SIZE)(%END_REG)
ret
# endif
END(STRCPY)
#endif