/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCMP
#  define STRCMP	__strcmp_evex
# endif

# define PAGE_SIZE	4096

	/* VEC_SIZE = Number of bytes in a ymm register.  */
# define VEC_SIZE	32
# define CHAR_PER_VEC	(VEC_SIZE / SIZE_OF_CHAR)

# define VMOVU	vmovdqu64
# define VMOVA	vmovdqa64

# ifdef USE_AS_WCSCMP
#  define TESTEQ	subl	$0xff,
	/* Compare packed dwords.  */
#  define VPCMP	vpcmpd
#  define VPMINU	vpminud
#  define VPTESTM	vptestmd
	/* 1 dword char == 4 bytes.  */
#  define SIZE_OF_CHAR	4
# else
#  define TESTEQ	incl
	/* Compare packed bytes.  */
#  define VPCMP	vpcmpb
#  define VPMINU	vpminub
#  define VPTESTM	vptestmb
	/* 1 byte char == 1 byte.  */
#  define SIZE_OF_CHAR	1
# endif

# ifdef USE_AS_STRNCMP
#  define LOOP_REG	r9d
#  define LOOP_REG64	r9

#  define OFFSET_REG8	r9b
#  define OFFSET_REG	r9d
#  define OFFSET_REG64	r9
# else
#  define LOOP_REG	edx
#  define LOOP_REG64	rdx

#  define OFFSET_REG8	dl
#  define OFFSET_REG	edx
#  define OFFSET_REG64	rdx
# endif

# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
#  define VEC_OFFSET	0
# else
#  define VEC_OFFSET	(-VEC_SIZE)
# endif

# define XMMZERO	xmm16
# define XMM0	xmm17
# define XMM1	xmm18

# define YMMZERO	ymm16
# define YMM0	ymm17
# define YMM1	ymm18
# define YMM2	ymm19
# define YMM3	ymm20
# define YMM4	ymm21
# define YMM5	ymm22
# define YMM6	ymm23
# define YMM7	ymm24
# define YMM8	ymm25
# define YMM9	ymm26
# define YMM10	ymm27

/* Warning!
	   wcscmp/wcsncmp have to use SIGNED comparison for elements.
	   strcmp/strncmp have to use UNSIGNED comparison for elements.
*/

/* The main idea of the string comparison (byte or dword) using 256-bit
   EVEX instructions consists of comparing (VPCMP) two ymm vectors.  The
   latter can be on either packed bytes or dwords depending on
   USE_AS_WCSCMP.  In order to check the null CHAR, the algorithm keeps
   the matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and
   2 KORD).  In general, the costs of comparing VEC_SIZE bytes (32 bytes)
   are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
   instructions.  The main loop (away from a page boundary) compares 4
   vectors at a time, effectively comparing 4 x VEC_SIZE bytes (128
   bytes) on each iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is the
   same as strcmp, except that a maximum offset is tracked.  If the
   maximum offset is reached before a difference is found, zero is
   returned.  */
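
/* Illustrative note (a rough sketch, not part of the build logic): the
   per-vector check used throughout this file folds the null test and
   the equality test into two instructions.  In C-like pseudocode for
   the byte (strcmp) case, with s1_vec/s2_vec denoting the 32 loaded
   bytes from each string:

	k2 = 0;				// VPTESTM %YMM0, %YMM0, %k2
	for (i = 0; i < 32; i++)
	  if (s1_vec[i] != 0)
	    k2 |= 1u << i;
	k1 = 0;				// VPCMP $0, (%rsi), %YMM0, %k1{%k2}
	for (i = 0; i < 32; i++)
	  if (((k2 >> i) & 1) && s1_vec[i] == s2_vec[i])
	    k1 |= 1u << i;

   A cleared bit in k1 therefore marks either a mismatch or a null CHAR,
   and k1 == 0xffffffff means all 32 chars are equal and non-null.  */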

	.section .text.evex, "ax", @progbits
ENTRY(STRCMP)
# ifdef USE_AS_STRNCMP
#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#  endif
	cmp	$1, %RDX_LP
	/* Signed comparison intentional.  We use this branch to also
	   test cases where length >= 2^63.  These very large sizes can
	   be handled with strcmp as there is no way for that length to
	   actually bound the buffer.  */
	jle	L(one_or_less)
# endif
	movl	%edi, %eax
	orl	%esi, %eax
	/* Shift out the bits irrelevant to page boundary ([63:12]).  */
	sall	$20, %eax
	/* Check if s1 or s2 may cross a page in next 4x VEC loads.  */
	cmpl	$((PAGE_SIZE - (VEC_SIZE * 4)) << 20), %eax
	ja	L(page_cross)

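/* Worked example of the check above (for illustration; the numbers
   assume PAGE_SIZE = 4096 and VEC_SIZE * 4 = 128): after `sall $20',
   the page-offset bits [11:0] of edi|esi sit in eax bits [31:20], and
   the compare is against (4096 - 128) << 20.  The unsigned `ja' is
   therefore taken whenever the OR of the two page offsets exceeds
   3968, i.e. whenever a 128-byte read from either string might touch
   the next page.  Because the offsets are OR'ed rather than compared
   individually, the test is conservative and may take the slow path
   even when neither pointer actually crosses.  */
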
L(no_page_cross):
	/* Safe to compare 4x vectors.  */
	VMOVU	(%rdi), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at (%rsi).  */
	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_STRNCMP
	cmpq	$CHAR_PER_VEC, %rdx
	jbe	L(vec_0_test_len)
# endif

	/* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for
	   wcscmp/wcsncmp.  */

	/* All 1s represents all equals.  TESTEQ will overflow to zero in
	   the all-equals case.  Otherwise the 1s will carry until the
	   position of the first mismatch.  */
	TESTEQ	%ecx
	jz	L(more_3x_vec)

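	/* Worked example of TESTEQ (for illustration): for strcmp all 32
	   mask bits are in use, so an all-equal vector gives
	   ecx = 0xffffffff and `incl %ecx' wraps it to zero, setting ZF.
	   For wcscmp only CHAR_PER_VEC = 8 mask bits are used, so the
	   all-equal value is 0xff and `subl $0xff, %ecx' likewise yields
	   zero.  If instead bit k is the lowest cleared bit, the carry
	   stops there and `tzcntl' of the result recovers k, the index
	   of the first mismatching or null CHAR.  */
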
	.p2align 4,, 4
L(return_vec_0):
	tzcntl	%ecx, %ecx
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
	xorl	%eax, %eax
	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
	je	L(ret0)
	setl	%al
	negl	%eax
	orl	$1, %eax
# else
	movzbl	(%rdi, %rcx), %eax
	movzbl	(%rsi, %rcx), %ecx
	subl	%ecx, %eax
# endif
L(ret0):
	ret

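	/* Illustrative note: wcscmp must order dwords as signed values,
	   so the USE_AS_WCSCMP path above cannot simply subtract.  The
	   `setl/negl/orl $1' idiom maps the signed compare to a fixed
	   -1/+1: eax becomes -1 when the s1 dword is smaller and
	   0 | 1 = 1 when it is larger.  The byte path can use a plain
	   `subl' because the difference of two zero-extended bytes
	   already carries the correct sign in a 32-bit register.  */
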
# ifdef USE_AS_STRNCMP
	.p2align 4,, 4
L(vec_0_test_len):
	notl	%ecx
	bzhil	%edx, %ecx, %eax
	jnz	L(return_vec_0)
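
	/* Worked example (for illustration): `notl %ecx' turns the
	   "equal and non-null" mask into a "mismatch or null" mask, and
	   `bzhil %edx, %ecx, %eax' clears every bit at position >= the
	   length in rdx.  With n = 5 and the first mismatch or null at
	   index 7, no set bit survives, so the strings are equal within
	   the length bound and execution falls through to return 0.  */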
	/* Align if will cross fetch block.  */
	.p2align 4,, 2
L(ret_zero):
	xorl	%eax, %eax
	ret

	.p2align 4,, 5
L(one_or_less):
	jb	L(ret_zero)
# ifdef USE_AS_WCSCMP
	/* 'nbe' covers the case where length is negative (large
	   unsigned).  */
	jnbe	__wcscmp_evex
	movl	(%rdi), %edx
	xorl	%eax, %eax
	cmpl	(%rsi), %edx
	je	L(ret1)
	setl	%al
	negl	%eax
	orl	$1, %eax
# else
	/* 'nbe' covers the case where length is negative (large
	   unsigned).  */
	jnbe	__strcmp_evex
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %ecx
	subl	%ecx, %eax
# endif
L(ret1):
	ret
# endif

	.p2align 4,, 10
L(return_vec_1):
	tzcntl	%ecx, %ecx
# ifdef USE_AS_STRNCMP
	/* rdx must be > CHAR_PER_VEC here, so it is safe to subtract
	   without worrying about underflow.  */
	addq	$-CHAR_PER_VEC, %rdx
	cmpq	%rcx, %rdx
	jbe	L(ret_zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
	xorl	%eax, %eax
	cmpl	VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
	je	L(ret2)
	setl	%al
	negl	%eax
	orl	$1, %eax
# else
	movzbl	VEC_SIZE(%rdi, %rcx), %eax
	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
	subl	%ecx, %eax
# endif
L(ret2):
	ret

	.p2align 4,, 10
# ifdef USE_AS_STRNCMP
L(return_vec_3):
#  if CHAR_PER_VEC <= 16
	sall	$CHAR_PER_VEC, %ecx
#  else
	salq	$CHAR_PER_VEC, %rcx
#  endif
# endif
L(return_vec_2):
# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
	tzcntl	%ecx, %ecx
# else
	tzcntq	%rcx, %rcx
# endif

# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %rdx
	jbe	L(ret_zero)
# endif

# ifdef USE_AS_WCSCMP
	movl	(VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
	xorl	%eax, %eax
	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
	je	L(ret3)
	setl	%al
	negl	%eax
	orl	$1, %eax
# else
	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
	subl	%ecx, %eax
# endif
L(ret3):
	ret

# ifndef USE_AS_STRNCMP
	.p2align 4,, 10
L(return_vec_3):
	tzcntl	%ecx, %ecx
#  ifdef USE_AS_WCSCMP
	movl	(VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
	xorl	%eax, %eax
	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx
	je	L(ret4)
	setl	%al
	negl	%eax
	orl	$1, %eax
#  else
	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
	subl	%ecx, %eax
#  endif
L(ret4):
	ret
# endif

	/* 32 byte align here ensures the main loop is ideally aligned
	   for DSB.  */
	.p2align 5
L(more_3x_vec):
	/* Safe to compare 4x vectors.  */
	VMOVU	(VEC_SIZE)(%rdi), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	VPCMP	$0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
	kmovd	%k1, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_1)

# ifdef USE_AS_STRNCMP
	subq	$(CHAR_PER_VEC * 2), %rdx
	jbe	L(ret_zero)
# endif

	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
	kmovd	%k1, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_2)

	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
	kmovd	%k1, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_3)

# ifdef USE_AS_STRNCMP
	cmpq	$(CHAR_PER_VEC * 2), %rdx
	jbe	L(ret_zero)
# endif

# ifdef USE_AS_WCSCMP
	/* Any non-zero positive value that doesn't interfere with 0x1.
	 */
	movl	$2, %r8d
# else
	xorl	%r8d, %r8d
# endif

	/* The prepare labels are various entry points from the page
	   cross logic.  */
L(prepare_loop):
# ifdef USE_AS_STRNCMP
#  ifdef USE_AS_WCSCMP
L(prepare_loop_no_len):
	movl	%edi, %ecx
	andl	$(VEC_SIZE * 4 - 1), %ecx
	shrl	$2, %ecx
	leaq	(CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx
#  else
	/* Store N + (VEC_SIZE * 4) and place check at the beginning of
	   the loop.  */
	leaq	(VEC_SIZE * 2)(%rdi, %rdx), %rdx
L(prepare_loop_no_len):
#  endif
# else
L(prepare_loop_no_len):
# endif

	/* Align s1 and adjust s2 accordingly.  */
	subq	%rdi, %rsi
	andq	$-(VEC_SIZE * 4), %rdi
L(prepare_loop_readj):
	addq	%rdi, %rsi
# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP)
	subq	%rdi, %rdx
# endif

L(prepare_loop_aligned):
	/* eax stores distance from rsi to next page cross.  These cases
	   need to be handled specially as the 4x loop could potentially
	   read memory past the length of s1 or s2 and across a page
	   boundary.  */
	movl	$-(VEC_SIZE * 4), %eax
	subl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
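
	/* Worked example (for illustration): the three instructions
	   above compute eax = (-(VEC_SIZE * 4) - rsi) mod PAGE_SIZE,
	   i.e. the distance from rsi to the next page boundary minus the
	   128 bytes one loop iteration reads.  If rsi's page offset is
	   0xf00, the distance to the boundary is 0x100, so eax starts at
	   0x80; each iteration then adds -(VEC_SIZE * 4) and the loop
	   branches to the page-cross handler once the next 4x VEC load
	   from rsi could touch the following page.  */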

	vpxorq	%YMMZERO, %YMMZERO, %YMMZERO

	/* Loop 4x comparisons at a time.  */
	.p2align 4
L(loop):

	/* End condition for strncmp.  */
# ifdef USE_AS_STRNCMP
	subq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(ret_zero)
# endif

	subq	$-(VEC_SIZE * 4), %rdi
	subq	$-(VEC_SIZE * 4), %rsi

	/* Check if rsi loads will cross a page boundary.  */
	addl	$-(VEC_SIZE * 4), %eax
	jnb	L(page_cross_during_loop)

	/* Loop entry after handling page cross during loop.  */
L(loop_skip_page_cross_check):
	VMOVA	(VEC_SIZE * 0)(%rdi), %YMM0
	VMOVA	(VEC_SIZE * 1)(%rdi), %YMM2
	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6

	VPMINU	%YMM0, %YMM2, %YMM8
	VPMINU	%YMM4, %YMM6, %YMM9

	/* A zero CHAR in YMM9 means that there is a null CHAR.  */
	VPMINU	%YMM8, %YMM9, %YMM9

	/* Each bit set in K1 represents a non-null CHAR in YMM9.  */
	VPTESTM	%YMM9, %YMM9, %k1

	vpxorq	(VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
	vpxorq	(VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
	   oring with YMM1.  Result is stored in YMM6.  */
	vpternlogd	$0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6

	/* Or together YMM3, YMM5, and YMM6.  */
	vpternlogd	$0xfe, %YMM3, %YMM5, %YMM6
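
	/* Illustrative note: the vpternlogd immediate is the 8-entry
	   truth table of the desired boolean function, indexed by
	   (dst, src2, src3).  For src2 | (src3 ^ dst) -- i.e.
	   YMM1 | (mem ^ YMM6) above -- the table works out to
	   0b11011110 = 0xde, and a plain three-way OR is 1 for every
	   row except (0, 0, 0), giving 0b11111110 = 0xfe.  */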

	/* A non-zero CHAR in YMM6 represents a mismatch.  */
	VPCMP	$0, %YMMZERO, %YMM6, %k0{%k1}
	kmovd	%k0, %LOOP_REG
	TESTEQ	%LOOP_REG
	jz	L(loop)


	/* Find which VEC has the mismatch or end of string.  */
	VPTESTM	%YMM0, %YMM0, %k1
	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
	kmovd	%k0, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_0_end)

	VPTESTM	%YMM2, %YMM2, %k1
	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
	kmovd	%k0, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_1_end)


	/* Handle VEC 2 and 3 without branches.  */
L(return_vec_2_3_end):
# ifdef USE_AS_STRNCMP
	subq	$(CHAR_PER_VEC * 2), %rdx
	jbe	L(ret_zero_end)
# endif

	VPTESTM	%YMM4, %YMM4, %k1
	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
	kmovd	%k0, %ecx
	TESTEQ	%ecx
# if CHAR_PER_VEC <= 16
	sall	$CHAR_PER_VEC, %LOOP_REG
	orl	%ecx, %LOOP_REG
# else
	salq	$CHAR_PER_VEC, %LOOP_REG64
	orq	%rcx, %LOOP_REG64
# endif
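
	/* Illustrative note: the shift/or above packs the VEC 2 and
	   VEC 3 result masks into a single register, VEC 2 in the low
	   CHAR_PER_VEC bits and VEC 3 above it.  For wcscmp
	   (CHAR_PER_VEC = 8), if VEC 2 is clean and the first problem
	   CHAR sits at index 3 of VEC 3, it shows up as bit 11, so the
	   single tzcnt below yields the CHAR offset from the start of
	   VEC 2.  For strcmp CHAR_PER_VEC is 32, hence the 64-bit
	   salq/orq form.  */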
L(return_vec_3_end):
	/* LOOP_REG contains matches for null/mismatch from the loop.  If
	   VEC 0, 1, and 2 all have no null and no mismatches then the
	   mismatch must entirely be from VEC 3, which is fully
	   represented by LOOP_REG.  */
# if CHAR_PER_VEC <= 16
	tzcntl	%LOOP_REG, %LOOP_REG
# else
	tzcntq	%LOOP_REG64, %LOOP_REG64
# endif
# ifdef USE_AS_STRNCMP
	cmpq	%LOOP_REG64, %rdx
	jbe	L(ret_zero_end)
# endif

# ifdef USE_AS_WCSCMP
	movl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
	xorl	%eax, %eax
	cmpl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
	je	L(ret5)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
# else
	movzbl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
	movzbl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
# endif
L(ret5):
	ret

# ifdef USE_AS_STRNCMP
	.p2align 4,, 2
L(ret_zero_end):
	xorl	%eax, %eax
	ret
# endif


	/* The L(return_vec_N_end) differ from L(return_vec_N) in that
	   they use the value of `r8` to negate the return value.  This
	   is because the page cross logic can swap `rdi` and `rsi`.  */
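
	/* Illustrative note (assuming the page-cross prologue leaves r8d
	   as 0 when rdi/rsi were not swapped and as all-ones when they
	   were): the byte paths below use the classic branchless
	   conditional negation
	       eax = (eax ^ r8d) - r8d;
	   which is the identity for r8d = 0 and exactly -eax for
	   r8d = -1, restoring the sign of the result to the caller's
	   argument order.  */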
	.p2align 4,, 10
# ifdef USE_AS_STRNCMP
L(return_vec_1_end):
#  if CHAR_PER_VEC <= 16
	sall	$CHAR_PER_VEC, %ecx
#  else
	salq	$CHAR_PER_VEC, %rcx
#  endif
# endif
L(return_vec_0_end):
# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
	tzcntl	%ecx, %ecx
# else
	tzcntq	%rcx, %rcx
# endif

# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %rdx
	jbe	L(ret_zero_end)
# endif

# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
	xorl	%eax, %eax
	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
	je	L(ret6)
	setl	%al
	negl	%eax
	/* This is the non-zero case for `eax` so just xorl with `r8d`
	   to flip the sign if `rdi` and `rsi` were swapped.  */
	xorl	%r8d, %eax
# else
	movzbl	(%rdi, %rcx), %eax
	movzbl	(%rsi, %rcx), %ecx
	subl	%ecx, %eax
	/* Flip `eax` if `rdi` and `rsi` were swapped in page cross
	   logic.  Subtract `r8d` after xor for the zero case.  */
	xorl	%r8d, %eax
	subl	%r8d, %eax
# endif
L(ret6):
	ret

# ifndef USE_AS_STRNCMP
	.p2align 4,, 10
L(return_vec_1_end):
	tzcntl	%ecx, %ecx
#  ifdef USE_AS_WCSCMP
	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
	xorl	%eax, %eax
	cmpl	VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
	je	L(ret7)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
#  else
	movzbl	VEC_SIZE(%rdi, %rcx), %eax
	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
#  endif
L(ret7):
	ret
# endif


	/* Page cross in rsi in next 4x VEC.  */

	/* TODO: Improve logic here.  */
	.p2align 4,, 10
L(page_cross_during_loop):
	/* eax contains [distance_from_page - (VEC_SIZE * 4)].  */

	/* Optimistically rsi and rdi are both aligned, in which case we
	   don't need any logic here.  */
	cmpl	$-(VEC_SIZE * 4), %eax
	/* Don't adjust eax before jumping back to loop and we will
	   never hit page cross case again.  */
	je	L(loop_skip_page_cross_check)

	/* Check if we can safely load a VEC.  */
	cmpl	$-(VEC_SIZE * 3), %eax
	jle	L(less_1x_vec_till_page_cross)

	VMOVA	(%rdi), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
	kmovd	%k1, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_0_end)

	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
	cmpl	$-(VEC_SIZE * 2), %eax
	jg	L(more_2x_vec_till_page_cross)

	.p2align 4,, 4
L(less_1x_vec_till_page_cross):
	subl	$-(VEC_SIZE * 4), %eax
	/* Guaranteed safe to read from rdi - VEC_SIZE here.  The only
	   concerning case is the first iteration if incoming s1 was
	   near the start of a page and s2 near the end.  If s1 was near
	   the start of the page we already aligned up to the nearest
	   VEC_SIZE * 4, so it is guaranteed safe to read back -VEC_SIZE.
	   If rdi is truly at the start of a page here, it means the
	   previous page (rdi - VEC_SIZE) has already been loaded
	   earlier, so it must be valid.  */
	VMOVU	-VEC_SIZE(%rdi, %rax), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	VPCMP	$0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}

	/* Mask of potentially valid bits.  The lower bits can be out of
	   range comparisons (but safe regarding page crosses).  */

# ifdef USE_AS_WCSCMP
	movl	$-1, %r10d
	movl	%esi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	shrl	$2, %ecx
	shlxl	%ecx, %r10d, %ecx
	movzbl	%cl, %r10d
# else
	movl	$-1, %ecx
	shlxl	%esi, %ecx, %r10d
# endif

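	/* Worked example (for illustration): in the byte case
	   `shlxl %esi, %ecx, %r10d' shifts -1 left by (esi mod 32), so
	   if rsi is misaligned by 5 bytes r10d becomes 0xffffffe0 and
	   the low 5 mask bits -- comparisons on bytes before the region
	   this pass is responsible for -- are dropped when r10d is ANDed
	   with the comparison result below.  The wcscmp variant does the
	   same per dword and keeps only the low 8 bits.  */
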
	kmovd	%k1, %ecx
	notl	%ecx


# ifdef USE_AS_STRNCMP
#  ifdef USE_AS_WCSCMP
	movl	%eax, %r11d
	shrl	$2, %r11d
	cmpq	%r11, %rdx
#  else
	cmpq	%rax, %rdx
#  endif
	jbe	L(return_page_cross_end_check)
# endif
	movl	%eax, %OFFSET_REG

	/* Readjust eax before potentially returning to the loop.  */
	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax

	andl	%r10d, %ecx
	jz	L(loop_skip_page_cross_check)

	.p2align 4,, 3
L(return_page_cross_end):
	tzcntl	%ecx, %ecx

# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
	leal	-VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
L(return_page_cross_cmp_mem):
# else
	addl	%OFFSET_REG, %ecx
# endif
# ifdef USE_AS_WCSCMP
	movl	VEC_OFFSET(%rdi, %rcx), %edx
	xorl	%eax, %eax
	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
	je	L(ret8)
	setl	%al
	negl	%eax
	xorl	%r8d, %eax
# else
	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
	subl	%ecx, %eax
	xorl	%r8d, %eax
	subl	%r8d, %eax
# endif
L(ret8):
	ret

# ifdef USE_AS_STRNCMP
	.p2align 4,, 10
L(return_page_cross_end_check):
	andl	%r10d, %ecx
	tzcntl	%ecx, %ecx
	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
#  ifdef USE_AS_WCSCMP
	sall	$2, %edx
#  endif
	cmpl	%ecx, %edx
	ja	L(return_page_cross_cmp_mem)
	xorl	%eax, %eax
	ret
# endif


	.p2align 4,, 10
L(more_2x_vec_till_page_cross):
	/* If more 2x vec till cross we will complete a full loop
	   iteration here.  */

	VMOVA	VEC_SIZE(%rdi), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
	kmovd	%k1, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_1_end)

# ifdef USE_AS_STRNCMP
	cmpq	$(CHAR_PER_VEC * 2), %rdx
	jbe	L(ret_zero_in_loop_page_cross)
# endif

2022-01-10 21:35:39 +00:00
|
|
|
subl $-(VEC_SIZE * 4), %eax
|
x86-64: Improve EVEX strcmp with masked load
In strcmp-evex.S, to compare 2 32-byte strings, replace
VMOVU (%rdi, %rdx), %YMM0
VMOVU (%rsi, %rdx), %YMM1
/* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
VPCMP $4, %YMM0, %YMM1, %k0
VPCMP $0, %YMMZERO, %YMM0, %k1
VPCMP $0, %YMMZERO, %YMM1, %k2
/* Each bit in K1 represents a NULL in YMM0 or YMM1. */
kord %k1, %k2, %k1
/* Each bit in K1 represents a NULL or a mismatch. */
kord %k0, %k1, %k1
kmovd %k1, %ecx
testl %ecx, %ecx
jne L(last_vector)
with
VMOVU (%rdi, %rdx), %YMM0
VPTESTM %YMM0, %YMM0, %k2
/* Each bit cleared in K1 represents a mismatch or a null CHAR
in YMM0 and 32 bytes at (%rsi, %rdx). */
VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2}
kmovd %k1, %ecx
incl %ecx
jne L(last_vector)
It makes EVEX strcmp faster than AVX2 strcmp by up to 40% on Tiger Lake
and Ice Lake.
Co-Authored-By: Noah Goldstein <goldstein.w.n@gmail.com>
2021-10-29 19:40:20 +00:00
|
|
|
|
2022-01-10 21:35:39 +00:00
|
|
|
/* Safe to include comparisons from lower bytes. */
|
|
|
|
VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0
|
|
|
|
VPTESTM %YMM0, %YMM0, %k2
|
|
|
|
VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
|
|
|
|
kmovd %k1, %ecx
|
|
|
|
TESTEQ %ecx
|
|
|
|
jnz L(return_vec_page_cross_0)
|
|
|
|
|
|
|
|
VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0
|
|
|
|
VPTESTM %YMM0, %YMM0, %k2
|
|
|
|
VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
|
|
|
|
kmovd %k1, %ecx
|
|
|
|
TESTEQ %ecx
|
|
|
|
jnz L(return_vec_page_cross_1)
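/* Note: the two unaligned loads above may overlap data that was
   already compared; re-checking known-equal chars is harmless, which
   is why comparisons from lower bytes are safe to include.  */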
# ifdef USE_AS_STRNCMP
/* Must check length here as length might preclude reading the next
   page.  */
# ifdef USE_AS_WCSCMP
movl %eax, %r11d
shrl $2, %r11d
cmpq %r11, %rdx
# else
cmpq %rax, %rdx
# endif
jbe L(ret_zero_in_loop_page_cross)
# endif
/* Finish the loop. */
VMOVA (VEC_SIZE * 2)(%rdi), %YMM4
VMOVA (VEC_SIZE * 3)(%rdi), %YMM6
VPMINU %YMM4, %YMM6, %YMM9
VPTESTM %YMM9, %YMM9, %k1

vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */
vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6

VPCMP $0, %YMMZERO, %YMM6, %k0{%k1}
kmovd %k0, %LOOP_REG
TESTEQ %LOOP_REG
jnz L(return_vec_2_3_end)
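/* Note on the sequence above: VPMINU + VPTESTM build a mask (k1) of
   positions where neither YMM4 nor YMM6 holds a null, while vpxorq +
   vpternlogd (imm8 0xde computes B | (A ^ C)) OR the differences of
   VEC 2 and VEC 3 into YMM6.  After the masked VPCMP against zero
   and TESTEQ, a non-zero LOOP_REG means VEC 2 or VEC 3 contains a
   mismatch or a null.  */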
/* Best for code size to use an unconditional jmp here.  If this case
   is hot it would be faster to duplicate the L(return_vec_2_3_end)
   code as the fall-through and jump back to the loop on a mismatch
   comparison.  */
subq $-(VEC_SIZE * 4), %rdi
subq $-(VEC_SIZE * 4), %rsi
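/* Note: subq of a negative constant is used instead of addq so the
   immediate (-(VEC_SIZE * 4) = -128) still fits in a sign-extended
   imm8, keeping the encoding short.  */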
addl $(PAGE_SIZE - VEC_SIZE * 8), %eax
# ifdef USE_AS_STRNCMP
subq $(CHAR_PER_VEC * 4), %rdx
ja L(loop_skip_page_cross_check)
L(ret_zero_in_loop_page_cross):
xorl %eax, %eax
ret
# else
jmp L(loop_skip_page_cross_check)
# endif
.p2align 4,, 10
L(return_vec_page_cross_0):
addl $-VEC_SIZE, %eax
L(return_vec_page_cross_1):
tzcntl %ecx, %ecx
# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
# ifdef USE_AS_STRNCMP
# ifdef USE_AS_WCSCMP
/* Must divide ecx instead of multiplying rdx due to overflow.  */
movl %ecx, %eax
shrl $2, %eax
cmpq %rax, %rdx
# else
cmpq %rcx, %rdx
# endif
jbe L(ret_zero_in_loop_page_cross)
# endif
# else
addl %eax, %ecx
# endif

# ifdef USE_AS_WCSCMP
movl VEC_OFFSET(%rdi, %rcx), %edx
xorl %eax, %eax
cmpl VEC_OFFSET(%rsi, %rcx), %edx
je L(ret9)
setl %al
negl %eax
xorl %r8d, %eax
# else
movzbl VEC_OFFSET(%rdi, %rcx), %eax
movzbl VEC_OFFSET(%rsi, %rcx), %ecx
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
# endif
L(ret9):
ret
.p2align 4,, 10
L(page_cross):
# ifndef USE_AS_STRNCMP
/* If both are VEC aligned we don't need any special logic here.
   Only valid for strcmp where the stop condition is guaranteed to be
   reachable by just reading memory.  */
testl $((VEC_SIZE - 1) << 20), %eax
jz L(no_page_cross)
# endif

movl %edi, %eax
movl %esi, %ecx
andl $(PAGE_SIZE - 1), %eax
andl $(PAGE_SIZE - 1), %ecx
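/* eax/ecx now hold the within-page offsets of s1 (rdi) and s2 (rsi).  */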
xorl %OFFSET_REG, %OFFSET_REG

/* Check which is closer to page cross, s1 or s2. */
cmpl %eax, %ecx
jg L(page_cross_s2)

/* The previous page cross check has false positives.  Check for
   a true positive as the page cross logic is very expensive.  */
subl $(PAGE_SIZE - VEC_SIZE * 4), %eax
jbe L(no_page_cross)
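/* Taken when s1's page offset is at most PAGE_SIZE - VEC_SIZE * 4,
   i.e. a full 4x VEC read cannot cross the page.  */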
/* Set r8 to not interfere with normal return value (rdi and rsi
   did not swap).  */
# ifdef USE_AS_WCSCMP
/* Any non-zero positive value that doesn't interfere with 0x1.  */
movl $2, %r8d
# else
xorl %r8d, %r8d
# endif
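/* Note: r8 is folded into the final return value (xor, plus a sub in
   the byte case), so the same return paths can flip the sign of the
   result when rdi and rsi get swapped at L(page_cross_s2).  */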
/* Check if less than 1x VEC till page cross. */
subl $(VEC_SIZE * 3), %eax
jg L(less_1x_vec_till_page)

/* If more than 1x VEC till page cross, loop through safely
   loadable memory until within 1x VEC of page cross.  */
.p2align 4,, 8
L(page_cross_loop):
VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
VPTESTM %YMM0, %YMM0, %k2
VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(check_ret_vec_page_cross)
addl $CHAR_PER_VEC, %OFFSET_REG
# ifdef USE_AS_STRNCMP
cmpq %OFFSET_REG64, %rdx
jbe L(ret_zero_page_cross)
# endif
addl $VEC_SIZE, %eax
jl L(page_cross_loop)
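/* Loop while the next VEC-sized load is still guaranteed not to
   reach the page boundary; once within 1x VEC of the boundary, fall
   through to a final, possibly overlapping, compare below.  */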
# ifdef USE_AS_WCSCMP
shrl $2, %eax
# endif

subl %eax, %OFFSET_REG
/* OFFSET_REG has distance to page cross - VEC_SIZE.  Guaranteed
   to not cross page so is safe to load.  Since we have already
   loaded at least 1 VEC from rsi it is also guaranteed to be safe.
 */
VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
VPTESTM %YMM0, %YMM0, %k2
VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
kmovd %k1, %ecx
# ifdef USE_AS_STRNCMP
leal CHAR_PER_VEC(%OFFSET_REG64), %eax
cmpq %rax, %rdx
jbe L(check_ret_vec_page_cross2)
# ifdef USE_AS_WCSCMP
addq $-(CHAR_PER_VEC * 2), %rdx
# else
addq %rdi, %rdx
# endif
# endif
TESTEQ %ecx
jz L(prepare_loop_no_len)

.p2align 4,, 4
L(ret_vec_page_cross):
# ifndef USE_AS_STRNCMP
L(check_ret_vec_page_cross):
# endif
tzcntl %ecx, %ecx
addl %OFFSET_REG, %ecx
L(ret_vec_page_cross_cont):
# ifdef USE_AS_WCSCMP
movl (%rdi, %rcx, SIZE_OF_CHAR), %edx
xorl %eax, %eax
cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx
je L(ret12)
setl %al
negl %eax
xorl %r8d, %eax
# else
movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax
movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
# endif
L(ret12):
ret
# ifdef USE_AS_STRNCMP
.p2align 4,, 10
L(check_ret_vec_page_cross2):
TESTEQ %ecx
L(check_ret_vec_page_cross):
tzcntl %ecx, %ecx
addl %OFFSET_REG, %ecx
cmpq %rcx, %rdx
ja L(ret_vec_page_cross_cont)
.p2align 4,, 2
L(ret_zero_page_cross):
xorl %eax, %eax
ret
# endif

.p2align 4,, 4
L(page_cross_s2):
/* Ensure this is a true page cross. */
subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx
jbe L(no_page_cross)

movl %ecx, %eax
movq %rdi, %rcx
movq %rsi, %rdi
movq %rcx, %rsi

/* Set r8 to negate the return value, as rdi and rsi have been
   swapped.  */
# ifdef USE_AS_WCSCMP
movl $-4, %r8d
# else
movl $-1, %r8d
# endif
xorl %OFFSET_REG, %OFFSET_REG
/* Check if more than 1x VEC till page cross. */
subl $(VEC_SIZE * 3), %eax
jle L(page_cross_loop)

.p2align 4,, 6
L(less_1x_vec_till_page):
# ifdef USE_AS_WCSCMP
shrl $2, %eax
# endif
/* Find largest load size we can use. */
cmpl $(16 / SIZE_OF_CHAR), %eax
ja L(less_16_till_page)

/* Use 16 byte comparison. */
vmovdqu (%rdi), %xmm0
VPTESTM %xmm0, %xmm0, %k2
VPCMP $0, (%rsi), %xmm0, %k1{%k2}
kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
subl $0xf, %ecx
# else
incw %cx
# endif
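/* Note: only the low 16 bytes (or 4 wchars) were compared, so only
   the low bits of the mask are meaningful.  incw wraps an all-ones
   16-bit mask to zero and subl $0xf does the same for the 4-bit
   wchar mask, so flags are non-zero only on a mismatch or null.  */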
jnz L(check_ret_vec_page_cross)
movl $(16 / SIZE_OF_CHAR), %OFFSET_REG
# ifdef USE_AS_STRNCMP
cmpq %OFFSET_REG64, %rdx
jbe L(ret_zero_page_cross_slow_case0)
subl %eax, %OFFSET_REG
# else
/* Explicit check for 16 byte alignment. */
subl %eax, %OFFSET_REG
jz L(prepare_loop)
# endif
vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
VPTESTM %xmm0, %xmm0, %k2
VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
subl $0xf, %ecx
# else
incw %cx
# endif
jnz L(check_ret_vec_page_cross)
# ifdef USE_AS_STRNCMP
addl $(16 / SIZE_OF_CHAR), %OFFSET_REG
subq %OFFSET_REG64, %rdx
jbe L(ret_zero_page_cross_slow_case0)
subq $-(CHAR_PER_VEC * 4), %rdx

leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# else
leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# endif
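/* Note: the leaq adjustments above rewind rdi/rsi to sit
   VEC_SIZE * 4 below the first unchecked char; this assumes the loop
   entry reached via L(prepare_loop_aligned) advances both pointers
   by a full 4x VEC stride before comparing.  */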
jmp L(prepare_loop_aligned)

# ifdef USE_AS_STRNCMP
.p2align 4,, 2
L(ret_zero_page_cross_slow_case0):
xorl %eax, %eax
ret
# endif

.p2align 4,, 10
L(less_16_till_page):
cmpl $(24 / SIZE_OF_CHAR), %eax
ja L(less_8_till_page)

/* Use 8 byte comparison. */
vmovq (%rdi), %xmm0
vmovq (%rsi), %xmm1
VPTESTM %xmm0, %xmm0, %k2
VPCMP $0, %xmm1, %xmm0, %k1{%k2}
kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
subl $0x3, %ecx
# else
incb %cl
# endif
jnz L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP
cmpq $(8 / SIZE_OF_CHAR), %rdx
jbe L(ret_zero_page_cross_slow_case0)
# endif
movl $(24 / SIZE_OF_CHAR), %OFFSET_REG
subl %eax, %OFFSET_REG

vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
VPTESTM %xmm0, %xmm0, %k2
VPCMP $0, %xmm1, %xmm0, %k1{%k2}
kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
subl $0x3, %ecx
# else
incb %cl
# endif
jnz L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP
addl $(8 / SIZE_OF_CHAR), %OFFSET_REG
subq %OFFSET_REG64, %rdx
jbe L(ret_zero_page_cross_slow_case0)
subq $-(CHAR_PER_VEC * 4), %rdx

leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# else
leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# endif
jmp L(prepare_loop_aligned)

.p2align 4,, 10
L(less_8_till_page):
# ifdef USE_AS_WCSCMP
/* If using wchar then this is the only check before we reach
   the page boundary.  */
movl (%rdi), %eax
movl (%rsi), %ecx
cmpl %ecx, %eax
jnz L(ret_less_8_wcs)
# ifdef USE_AS_STRNCMP
addq $-(CHAR_PER_VEC * 2), %rdx
/* We already checked for len <= 1 so cannot hit that case here.  */
# endif
testl %eax, %eax
jnz L(prepare_loop)
ret
.p2align 4,, 8
L(ret_less_8_wcs):
setl %OFFSET_REG8
negl %OFFSET_REG
movl %OFFSET_REG, %eax
xorl %r8d, %eax
ret

# else
cmpl $28, %eax
ja L(less_4_till_page)

vmovd (%rdi), %xmm0
vmovd (%rsi), %xmm1
VPTESTM %xmm0, %xmm0, %k2
VPCMP $0, %xmm1, %xmm0, %k1{%k2}
kmovd %k1, %ecx
subl $0xf, %ecx
jnz L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP
cmpq $4, %rdx
jbe L(ret_zero_page_cross_slow_case1)
# endif
movl $(28 / SIZE_OF_CHAR), %OFFSET_REG
subl %eax, %OFFSET_REG
vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
VPTESTM %xmm0, %xmm0, %k2
VPCMP $0, %xmm1, %xmm0, %k1{%k2}
kmovd %k1, %ecx
subl $0xf, %ecx
jnz L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP
addl $(4 / SIZE_OF_CHAR), %OFFSET_REG
subq %OFFSET_REG64, %rdx
jbe L(ret_zero_page_cross_slow_case1)
subq $-(CHAR_PER_VEC * 4), %rdx

leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# else
leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# endif
jmp L(prepare_loop_aligned)

# ifdef USE_AS_STRNCMP
.p2align 4,, 2
L(ret_zero_page_cross_slow_case1):
xorl %eax, %eax
ret
# endif

.p2align 4,, 10
L(less_4_till_page):
subq %rdi, %rsi
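/* rsi now holds s2 - s1, so the single induction variable rdi
   indexes both strings in the byte loop below.  */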
/* Extremely slow byte comparison loop. */
L(less_4_loop):
movzbl (%rdi), %eax
movzbl (%rsi, %rdi), %ecx
subl %ecx, %eax
jnz L(ret_less_4_loop)
testl %ecx, %ecx
jz L(ret_zero_4_loop)
# ifdef USE_AS_STRNCMP
decq %rdx
jz L(ret_zero_4_loop)
# endif
incq %rdi
/* End condition is reaching the page boundary (rdi is aligned).  */
testl $31, %edi
jnz L(less_4_loop)
leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi
addq $-(VEC_SIZE * 4), %rdi
# ifdef USE_AS_STRNCMP
subq $-(CHAR_PER_VEC * 4), %rdx
# endif
jmp L(prepare_loop_aligned)

L(ret_zero_4_loop):
xorl %eax, %eax
ret
L(ret_less_4_loop):
xorl %r8d, %eax
subl %r8d, %eax
ret
# endif
END(STRCMP)
#endif