glibc/sysdeps/x86_64/multiarch/memchr-evex.S

710 lines
17 KiB
ArmAsm

/* memchr/wmemchr optimized with 256-bit EVEX instructions.
Copyright (C) 2021-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <isa-level.h>
#include <sysdep.h>
#if ISA_SHOULD_BUILD (4)
# ifndef VEC_SIZE
# include "x86-evex256-vecs.h"
# endif
# ifndef MEMCHR
# define MEMCHR __memchr_evex
# endif
# ifdef USE_AS_WMEMCHR
# define PC_SHIFT_GPR rcx
# define VPTESTN vptestnmd
# define VPBROADCAST vpbroadcastd
# define VPMINU vpminud
# define VPCMP vpcmpd
# define VPCMPEQ vpcmpeqd
# define CHAR_SIZE 4
# define USE_WIDE_CHAR
# else
# define PC_SHIFT_GPR rdi
# define VPTESTN vptestnmb
# define VPBROADCAST vpbroadcastb
# define VPMINU vpminub
# define VPCMP vpcmpb
# define VPCMPEQ vpcmpeqb
# define CHAR_SIZE 1
# endif
# include "reg-macros.h"
/* If not in an RTM and VEC_SIZE != 64 (the VEC_SIZE = 64
doesn't have VEX encoding), use VEX encoding in loop so we
can use vpcmpeqb + vptern which is more efficient than the
EVEX alternative. */
# if defined USE_IN_RTM || VEC_SIZE == 64
# undef COND_VZEROUPPER
# undef VZEROUPPER_RETURN
# undef VZEROUPPER
# define COND_VZEROUPPER
# define VZEROUPPER_RETURN ret
# define VZEROUPPER
# define USE_TERN_IN_LOOP 0
# else
# define USE_TERN_IN_LOOP 1
# undef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif
# if USE_TERN_IN_LOOP
/* Resulting bitmask for vpmovmskb has 4-bits set for each wchar
so we don't want to multiply resulting index. */
# define TERN_CHAR_MULT 1
# ifdef USE_AS_WMEMCHR
# define TEST_END() inc %VRCX
# else
# define TEST_END() add %rdx, %rcx
# endif
# else
# define TERN_CHAR_MULT CHAR_SIZE
# define TEST_END() KORTEST %k2, %k3
# endif
# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
# ifndef USE_AS_WMEMCHR
# define GPR_X0_IS_RET 1
# else
# define GPR_X0_IS_RET 0
# endif
# define GPR_X0 rax
# else
# define GPR_X0_IS_RET 0
# define GPR_X0 rdx
# endif
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
# if CHAR_PER_VEC == 64
# define LAST_VEC_OFFSET (VEC_SIZE * 3)
# else
# define LAST_VEC_OFFSET (VEC_SIZE * 2)
# endif
# if CHAR_PER_VEC >= 32
# define MASK_GPR(...) VGPR(__VA_ARGS__)
# elif CHAR_PER_VEC == 16
# define MASK_GPR(reg) VGPR_SZ(reg, 16)
# else
# define MASK_GPR(reg) VGPR_SZ(reg, 8)
# endif
# define VMATCH VMM(0)
# define VMATCH_LO VMM_lo(0)
# define PAGE_SIZE 4096
.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN (MEMCHR, 6)
/* Check for zero length. */
test %RDX_LP, %RDX_LP
jz L(zero_0)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
# endif
VPBROADCAST %esi, %VMATCH
/* Check if we may cross page boundary with one vector load. */
movl %edi, %eax
andl $(PAGE_SIZE - 1), %eax
cmpl $(PAGE_SIZE - VEC_SIZE), %eax
ja L(page_cross)
VPCMPEQ (%rdi), %VMATCH, %k0
KMOV %k0, %VRAX
# ifndef USE_AS_WMEMCHR
/* If rcx is zero then tzcnt -> CHAR_PER_VEC. NB: there is a
already a dependency between rcx and rsi so no worries about
false-dep here. */
tzcnt %VRAX, %VRSI
/* If rdx <= rsi then either 1) rcx was non-zero (there was a
match) but it was out of bounds or 2) rcx was zero and rdx
was <= VEC_SIZE so we are done scanning. */
cmpq %rsi, %rdx
/* NB: Use branch to return zero/non-zero. Common usage will
branch on result of function (if return is null/non-null).
This branch can be used to predict the ensuing one so there
is no reason to extend the data-dependency with cmovcc. */
jbe L(zero_0)
/* If rcx is zero then len must be > RDX, otherwise since we
already tested len vs lzcnt(rcx) (in rsi) we are good to
return this match. */
test %VRAX, %VRAX
jz L(more_1x_vec)
leaq (%rdi, %rsi), %rax
# else
/* We can't use the `tzcnt` trick for wmemchr because CHAR_SIZE
> 1 so if rcx is tzcnt != CHAR_PER_VEC. */
cmpq $CHAR_PER_VEC, %rdx
ja L(more_1x_vec)
tzcnt %VRAX, %VRAX
cmpl %eax, %edx
jbe L(zero_0)
L(first_vec_x0_ret):
leaq (%rdi, %rax, CHAR_SIZE), %rax
# endif
ret
/* Only fits in first cache line for VEC_SIZE == 32. */
# if VEC_SIZE == 32
.p2align 4,, 2
L(zero_0):
xorl %eax, %eax
ret
# endif
.p2align 4,, 9
L(more_1x_vec):
# ifdef USE_AS_WMEMCHR
/* If wmemchr still need to test if there was a match in first
VEC. Use bsf to test here so we can reuse
L(first_vec_x0_ret). */
bsf %VRAX, %VRAX
jnz L(first_vec_x0_ret)
# endif
L(page_cross_continue):
# ifdef USE_AS_WMEMCHR
/* We can't use end of the buffer to re-calculate length for
wmemchr as len * CHAR_SIZE may overflow. */
leaq -(VEC_SIZE + CHAR_SIZE)(%rdi), %rax
andq $(VEC_SIZE * -1), %rdi
subq %rdi, %rax
sarq $2, %rax
addq %rdx, %rax
# else
leaq -(VEC_SIZE + 1)(%rdx, %rdi), %rax
andq $(VEC_SIZE * -1), %rdi
subq %rdi, %rax
# endif
/* rax contains remaining length - 1. -1 so we can get imm8
encoding in a few additional places saving code size. */
/* Needed regardless of remaining length. */
VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0
KMOV %k0, %VRDX
/* We cannot fold the above `sub %rdi, %rax` with the `cmp
$(CHAR_PER_VEC * 2), %rax` because its possible for a very
large length to overflow and cause the subtract to carry
despite length being above CHAR_PER_VEC * 2. */
cmpq $(CHAR_PER_VEC * 2 - 1), %rax
ja L(more_2x_vec)
L(last_2x_vec):
test %VRDX, %VRDX
jnz L(first_vec_x1_check)
/* Check the end of data. NB: use 8-bit operations to save code
size. We no longer need the full-width of eax and will
perform a write-only operation over eax so there will be no
partial-register stalls. */
subb $(CHAR_PER_VEC * 1 - 1), %al
jle L(zero_0)
VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
KMOV %k0, %VRCX
# ifdef USE_AS_WMEMCHR
/* For wmemchr against we can't take advantage of tzcnt(0) ==
VEC_SIZE as CHAR_PER_VEC != VEC_SIZE. */
test %VRCX, %VRCX
jz L(zero_0)
# endif
tzcnt %VRCX, %VRCX
cmp %cl, %al
/* Same CFG for VEC_SIZE == 64 and VEC_SIZE == 32. We give
fallthrough to L(zero_0) for VEC_SIZE == 64 here as there is
not enough space before the next cache line to fit the `lea`
for return. */
# if VEC_SIZE == 64
ja L(first_vec_x2_ret)
L(zero_0):
xorl %eax, %eax
ret
# else
jbe L(zero_0)
leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
ret
# endif
.p2align 4,, 5
L(first_vec_x1_check):
bsf %VRDX, %VRDX
cmpb %dl, %al
jb L(zero_4)
leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
ret
/* Fits at the end of the cache line here for VEC_SIZE == 32.
*/
# if VEC_SIZE == 32
L(zero_4):
xorl %eax, %eax
ret
# endif
.p2align 4,, 4
L(first_vec_x2):
bsf %VRCX, %VRCX
L(first_vec_x2_ret):
leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
ret
/* Fits at the end of the cache line here for VEC_SIZE == 64.
*/
# if VEC_SIZE == 64
L(zero_4):
xorl %eax, %eax
ret
# endif
.p2align 4,, 4
L(first_vec_x1):
bsf %VRDX, %VRDX
leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
ret
.p2align 4,, 5
L(more_2x_vec):
/* Length > VEC_SIZE * 2 so check first 2x VEC before rechecking
length. */
/* Already computed matches for first VEC in rdx. */
test %VRDX, %VRDX
jnz L(first_vec_x1)
VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(first_vec_x2)
/* Needed regardless of next length check. */
VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
KMOV %k0, %VRCX
/* Check if we are near the end. */
cmpq $(CHAR_PER_VEC * 4 - 1), %rax
ja L(more_4x_vec)
test %VRCX, %VRCX
jnz L(first_vec_x3_check)
/* Use 8-bit instructions to save code size. We won't use full-
width eax again and will perform a write-only operation to
eax so no worries about partial-register stalls. */
subb $(CHAR_PER_VEC * 3), %al
jb L(zero_2)
L(last_vec_check):
VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
KMOV %k0, %VRCX
# ifdef USE_AS_WMEMCHR
/* For wmemchr against we can't take advantage of tzcnt(0) ==
VEC_SIZE as CHAR_PER_VEC != VEC_SIZE. */
test %VRCX, %VRCX
jz L(zero_2)
# endif
tzcnt %VRCX, %VRCX
cmp %cl, %al
jae L(first_vec_x4_ret)
L(zero_2):
xorl %eax, %eax
ret
/* Fits at the end of the cache line here for VEC_SIZE == 64.
For VEC_SIZE == 32 we put the return label at the end of
L(first_vec_x4). */
# if VEC_SIZE == 64
L(first_vec_x4_ret):
leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
ret
# endif
.p2align 4,, 6
L(first_vec_x4):
bsf %VRCX, %VRCX
# if VEC_SIZE == 32
/* Place L(first_vec_x4_ret) here as we can't fit it in the same
cache line as where it is called from so we might as well
save code size by reusing return of L(first_vec_x4). */
L(first_vec_x4_ret):
# endif
leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
ret
.p2align 4,, 6
L(first_vec_x3_check):
/* Need to adjust remaining length before checking. */
addb $-(CHAR_PER_VEC * 2), %al
bsf %VRCX, %VRCX
cmpb %cl, %al
jb L(zero_2)
leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
ret
.p2align 4,, 6
L(first_vec_x3):
bsf %VRCX, %VRCX
leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
ret
.p2align 4,, 3
# if !USE_TERN_IN_LOOP
.p2align 4,, 10
# endif
L(more_4x_vec):
test %VRCX, %VRCX
jnz L(first_vec_x3)
VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(first_vec_x4)
subq $-(VEC_SIZE * 5), %rdi
subq $(CHAR_PER_VEC * 8), %rax
jb L(last_4x_vec)
# ifdef USE_AS_WMEMCHR
movl %edi, %ecx
# else
addq %rdi, %rax
# endif
# if VEC_SIZE == 64
/* use xorb to do `andq $-(VEC_SIZE * 4), %rdi`. No evex
processor has partial register stalls (all have merging
uop). If that changes this can be removed. */
xorb %dil, %dil
# else
andq $-(VEC_SIZE * 4), %rdi
# endif
# ifdef USE_AS_WMEMCHR
subl %edi, %ecx
sarl $2, %ecx
addq %rcx, %rax
# else
subq %rdi, %rax
# endif
# if USE_TERN_IN_LOOP
/* copy VMATCH to low ymm so we can use vpcmpeq which is not
encodable with EVEX registers. NB: this is VEC_SIZE == 32
only as there is no way to encode vpcmpeq with zmm0-15. */
vmovdqa64 %VMATCH, %VMATCH_LO
# endif
.p2align 4,, 11
L(loop_4x_vec):
/* Two versions of the loop. One that does not require
vzeroupper by not using ymmm0-15 and another does that
require vzeroupper because it uses ymmm0-15. The reason why
ymm0-15 is used at all is because there is no EVEX encoding
vpcmpeq and with vpcmpeq this loop can be performed more
efficiently. The non-vzeroupper version is safe for RTM
while the vzeroupper version should be preferred if RTM are
not supported. Which loop version we use is determined by
USE_TERN_IN_LOOP. */
# if USE_TERN_IN_LOOP
/* Since vptern can only take 3x vectors fastest to do 1 vec
separately with EVEX vpcmp. */
# ifdef USE_AS_WMEMCHR
/* vptern can only accept masks for epi32/epi64 so can only save
instruction using not equals mask on vptern with wmemchr.
*/
VPCMP $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
# else
VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
# endif
/* Compare 3x with vpcmpeq and or them all together with vptern.
*/
VPCMPEQ (VEC_SIZE * 1)(%rdi), %VMATCH_LO, %VMM_lo(2)
VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
# ifdef USE_AS_WMEMCHR
/* This takes the not of or between VEC_lo(2), VEC_lo(3),
VEC_lo(4) as well as combines result from VEC(0) with zero
mask. */
vpternlogd $1, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4){%k1}{z}
vpmovmskb %VMM_lo(4), %VRCX
# else
/* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into
VEC_lo(4). */
vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
vpmovmskb %VMM_lo(4), %VRCX
KMOV %k1, %edx
# endif
# else
/* Loop version that uses EVEX encoding. */
VPCMP $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
vpxorq (VEC_SIZE * 1)(%rdi), %VMATCH, %VMM(2)
vpxorq (VEC_SIZE * 2)(%rdi), %VMATCH, %VMM(3)
VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k3
VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z}
VPTESTN %VMM(3), %VMM(3), %k2
# endif
TEST_END ()
jnz L(loop_vec_ret)
subq $-(VEC_SIZE * 4), %rdi
subq $(CHAR_PER_VEC * 4), %rax
jae L(loop_4x_vec)
/* COND_VZEROUPPER is vzeroupper if we use the VEX encoded loop.
*/
COND_VZEROUPPER
.p2align 4,, 10
L(last_4x_vec):
/* For CHAR_PER_VEC == 64 we don't need to mask as we use 8-bit
instructions on eax from here on out. */
# if CHAR_PER_VEC != 64
andl $(CHAR_PER_VEC * 4 - 1), %eax
# endif
VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k0
subq $(VEC_SIZE * 1), %rdi
KMOV %k0, %VRDX
cmpb $(CHAR_PER_VEC * 2 - 1), %al
jbe L(last_2x_vec)
test %VRDX, %VRDX
jnz L(last_vec_x1_novzero)
VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
KMOV %k0, %VRDX
test %VRDX, %VRDX
jnz L(last_vec_x2_novzero)
VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(first_vec_x3_check)
subb $(CHAR_PER_VEC * 3), %al
jae L(last_vec_check)
xorl %eax, %eax
ret
# if defined USE_AS_WMEMCHR && USE_TERN_IN_LOOP
L(last_vec_x2_novzero):
addq $VEC_SIZE, %rdi
L(last_vec_x1_novzero):
bsf %VRDX, %VRDX
leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
ret
# endif
# if CHAR_PER_VEC == 64
/* Since we can't combine the last 2x VEC when CHAR_PER_VEC ==
64 it needs a separate return label. */
.p2align 4,, 4
L(last_vec_x2):
L(last_vec_x2_novzero):
bsf %VRDX, %VRDX
leaq (VEC_SIZE * 2)(%rdi, %rdx, TERN_CHAR_MULT), %rax
ret
# endif
.p2align 4,, 4
L(loop_vec_ret):
# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
KMOV %k1, %VRAX
inc %MASK_GPR(rax)
# else
test %VRDX, %VRDX
# endif
jnz L(last_vec_x0)
# if USE_TERN_IN_LOOP
vpmovmskb %VMM_lo(2), %VRDX
# else
VPTESTN %VMM(2), %VMM(2), %k1
KMOV %k1, %VRDX
# endif
test %VRDX, %VRDX
jnz L(last_vec_x1)
# if USE_TERN_IN_LOOP
vpmovmskb %VMM_lo(3), %VRDX
# else
KMOV %k2, %VRDX
# endif
/* No longer need any of the lo vecs (ymm0-15) so vzeroupper
(only if used VEX encoded loop). */
COND_VZEROUPPER
/* Separate logic for CHAR_PER_VEC == 64 vs the rest. For
CHAR_PER_VEC we test the last 2x VEC separately, for
CHAR_PER_VEC <= 32 we can combine the results from the 2x
VEC in a single GPR. */
# if CHAR_PER_VEC == 64
# if USE_TERN_IN_LOOP
# error "Unsupported"
# endif
/* If CHAR_PER_VEC == 64 we can't combine the last two VEC. */
test %VRDX, %VRDX
jnz L(last_vec_x2)
KMOV %k3, %VRDX
# else
/* CHAR_PER_VEC <= 32 so we can combine the results from the
last 2x VEC. */
# if !USE_TERN_IN_LOOP
KMOV %k3, %VRCX
# endif
salq $(VEC_SIZE / TERN_CHAR_MULT), %rcx
addq %rcx, %rdx
# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
L(last_vec_x2_novzero):
# endif
# endif
bsf %rdx, %rdx
leaq (LAST_VEC_OFFSET)(%rdi, %rdx, TERN_CHAR_MULT), %rax
ret
.p2align 4,, 8
L(last_vec_x1):
COND_VZEROUPPER
# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
L(last_vec_x1_novzero):
# endif
bsf %VRDX, %VRDX
leaq (VEC_SIZE * 1)(%rdi, %rdx, TERN_CHAR_MULT), %rax
ret
.p2align 4,, 4
L(last_vec_x0):
COND_VZEROUPPER
bsf %VGPR(GPR_X0), %VGPR(GPR_X0)
# if GPR_X0_IS_RET
addq %rdi, %rax
# else
leaq (%rdi, %GPR_X0, CHAR_SIZE), %rax
# endif
ret
.p2align 4,, 6
L(page_cross):
/* Need to preserve eax to compute inbound bytes we are
checking. */
# ifdef USE_AS_WMEMCHR
movl %eax, %ecx
# else
xorl %ecx, %ecx
subl %eax, %ecx
# endif
xorq %rdi, %rax
VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
KMOV %k0, %VRAX
# ifdef USE_AS_WMEMCHR
/* NB: Divide by CHAR_SIZE to shift out out of bounds bytes. */
shrl $2, %ecx
andl $(CHAR_PER_VEC - 1), %ecx
# endif
shrx %VGPR(PC_SHIFT_GPR), %VRAX, %VRAX
# ifdef USE_AS_WMEMCHR
negl %ecx
# endif
/* mask lower bits from ecx (negative eax) to get bytes till
next VEC. */
andl $(CHAR_PER_VEC - 1), %ecx
/* Check if VEC is entirely contained in the remainder of the
page. */
cmpq %rcx, %rdx
jbe L(page_cross_ret)
/* Length crosses the page so if rax is zero (no matches)
continue. */
test %VRAX, %VRAX
jz L(page_cross_continue)
/* if rdx > rcx then any match here must be in [buf:buf + len].
*/
tzcnt %VRAX, %VRAX
# ifdef USE_AS_WMEMCHR
leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
addq %rdi, %rax
# endif
ret
.p2align 4,, 2
L(page_cross_zero):
xorl %eax, %eax
ret
.p2align 4,, 4
L(page_cross_ret):
/* Search is entirely contained in page cross case. */
# ifdef USE_AS_WMEMCHR
test %VRAX, %VRAX
jz L(page_cross_zero)
# endif
tzcnt %VRAX, %VRAX
cmpl %eax, %edx
jbe L(page_cross_zero)
# ifdef USE_AS_WMEMCHR
leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
addq %rdi, %rax
# endif
ret
END (MEMCHR)
#endif