x86: Remove memcmp-sse4.S
Code didn't actually use any SSE4 instructions since `ptest` was removed in:

    commit 2f9062d717
    Author: Noah Goldstein <goldstein.w.n@gmail.com>
    Date:   Wed Nov 10 16:18:56 2021 -0600

        x86: Shrink memcmp-sse4.S code size

The new memcmp-sse2 implementation is also faster.

geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905

Note there are two regressions from preferring SSE2, at Size = 1 and Size = 65.

Size = 1:

    size, align0, align1, ret, New Time/Old Time
       1,      1,      1,   0,             1.2
       1,      1,      1,   1,             1.197
       1,      1,      1,  -1,             1.2

This is intentional. Size == 1 is significantly less hot, based on profiles of GCC11 and Python3, than sizes [4, 8] (which are made hotter):

    Python3 Size = 1      -> 13.64%
    Python3 Size = [4, 8] -> 60.92%
    GCC11   Size = 1      ->  1.29%
    GCC11   Size = [4, 8] -> 33.86%

    size, align0, align1, ret, New Time/Old Time
       4,      4,      4,   0,             0.622
       4,      4,      4,   1,             0.797
       4,      4,      4,  -1,             0.805
       5,      5,      5,   0,             0.623
       5,      5,      5,   1,             0.777
       5,      5,      5,  -1,             0.802
       6,      6,      6,   0,             0.625
       6,      6,      6,   1,             0.813
       6,      6,      6,  -1,             0.788
       7,      7,      7,   0,             0.625
       7,      7,      7,   1,             0.799
       7,      7,      7,  -1,             0.795
       8,      8,      8,   0,             0.625
       8,      8,      8,   1,             0.848
       8,      8,      8,  -1,             0.914
       9,      9,      9,   0,             0.625

Size = 65:

    size, align0, align1, ret, New Time/Old Time
      65,      0,      0,   0,             1.103
      65,      0,      0,   1,             1.216
      65,      0,      0,  -1,             1.227
      65,     65,      0,   0,             1.091
      65,      0,     65,   1,             1.19
      65,     65,     65,  -1,             1.215

This is because A) the checks in the range [65, 96] are now unrolled 2x, and B) smaller sizes <= 16 are now given a hotter path. By contrast, the SSE4 version has a branch for Size = 80. The unrolled version gets better performance for returns which need both comparisons:

    size, align0, align1, ret, New Time/Old Time
     128,      4,      8,   0,             0.858
     128,      4,      8,   1,             0.879
     128,      4,      8,  -1,             0.888

As well, outside of microbenchmark environments, where the branch is not fully predictable, it will have a real cost.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 7cbc03d030)
Parent: 4ff6ae069b
Commit: ffe75982cc
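Editor's note: the rows in the tables above follow glibc's memcmp microbenchmark convention: `size` is the byte count, `align0`/`align1` are the offsets of the two buffers, and `ret` is the expected sign of the result (0 for equal buffers, 1/-1 for a difference in the last byte). The following is a minimal, self-contained sketch of a harness in that shape; `bench_one` and its buffer setup are illustrative stand-ins, not glibc's actual benchtests driver.

#include <stdio.h>
#include <string.h>
#include <time.h>

/* Illustrative harness: time memcmp for one size/align0/align1/ret
   combination from the tables above.  Not glibc's bench code.  */
static double
bench_one (size_t size, size_t align0, size_t align1, int ret, long iters)
{
  static unsigned char buf0[4096], buf1[4096];
  unsigned char *s0 = buf0 + align0;
  unsigned char *s1 = buf1 + align1;

  memset (s0, 0x5a, size);
  memset (s1, 0x5a, size);
  if (ret > 0)
    s1[size - 1] = 0x00;	/* s0 compares greater.  */
  else if (ret < 0)
    s1[size - 1] = 0xff;	/* s0 compares less.  */

  struct timespec t0, t1;
  volatile int sink;		/* Keep the calls from being elided.  */
  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (long i = 0; i < iters; i++)
    sink = memcmp (s0, s1, size);
  clock_gettime (CLOCK_MONOTONIC, &t1);
  (void) sink;
  return (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
}

int
main (void)
{
  /* One row from the Size = [4, 8] table: size 4, both offsets 4.  */
  printf ("size=4 align=4 ret=0: %.0f ns total\n",
	  bench_one (4, 4, 4, 0, 10000000L));
  return 0;
}

Compile with `gcc -O2` and vary the four row parameters; the "New Time/Old Time" column in the message is the ratio of two such runs against the SSE2 and SSE4 builds.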
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -11,7 +11,6 @@ sysdep_routines += \
   memcmp-avx2-movbe-rtm \
   memcmp-evex-movbe \
   memcmp-sse2 \
-  memcmp-sse4 \
   memcmp-ssse3 \
   memcpy-ssse3 \
   memcpy-ssse3-back \
@@ -174,7 +173,6 @@ sysdep_routines += \
   wmemcmp-avx2-movbe-rtm \
   wmemcmp-c \
   wmemcmp-evex-movbe \
-  wmemcmp-sse4 \
   wmemcmp-ssse3 \
 # sysdep_routines
 endif
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -78,8 +78,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
			       && CPU_FEATURE_USABLE (BMI2)
			       && CPU_FEATURE_USABLE (MOVBE)),
			      __memcmp_evex_movbe)
-	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
-			      __memcmp_sse4_1)
	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
			      __memcmp_ssse3)
	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
@@ -824,8 +822,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
			       && CPU_FEATURE_USABLE (BMI2)
			       && CPU_FEATURE_USABLE (MOVBE)),
			      __wmemcmp_evex_movbe)
-	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
-			      __wmemcmp_sse4_1)
	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
			      __wmemcmp_ssse3)
	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -21,7 +21,6 @@

 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
@@ -47,9 +46,6 @@ IFUNC_SELECTOR (void)
	return OPTIMIZE (avx2_movbe);
     }

-  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
-    return OPTIMIZE (sse4_1);
-
   if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
     return OPTIMIZE (ssse3);

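Editor's note: with the SSE4.1 branch gone, the selector falls straight through from the AVX2/EVEX checks to SSSE3 and then the SSE2 baseline. For readers unfamiliar with the mechanism, this is a rough, self-contained sketch of the GNU IFUNC dispatch pattern that `IFUNC_SELECTOR` implements; `have_ssse3` stands in for glibc's internal `CPU_FEATURE_USABLE_P` machinery, the byte loop stands in for the real optimized entry points, and it requires a GNU toolchain on an ELF target.

#include <cpuid.h>
#include <stddef.h>

typedef int memcmp_fn (const void *, const void *, size_t);

/* Placeholder "implementations": both use a byte loop here; in glibc
   these would be separate optimized entry points.  */
static int
my_memcmp_generic (const void *a, const void *b, size_t n)
{
  const unsigned char *p = a, *q = b;
  for (size_t i = 0; i < n; i++)
    if (p[i] != q[i])
      return p[i] < q[i] ? -1 : 1;
  return 0;
}

static int
my_memcmp_ssse3 (const void *a, const void *b, size_t n)
{
  return my_memcmp_generic (a, b, n);
}

static int
my_memcmp_sse2 (const void *a, const void *b, size_t n)
{
  return my_memcmp_generic (a, b, n);
}

/* Stand-in for CPU_FEATURE_USABLE_P (cpu_features, SSSE3).  */
static int
have_ssse3 (void)
{
  unsigned int eax, ebx, ecx, edx;
  return __get_cpuid (1, &eax, &ebx, &ecx, &edx) && (ecx & bit_SSSE3);
}

/* The resolver runs once, at relocation time, and returns the entry
   point the dynamic linker should bind -- the same job IFUNC_SELECTOR
   does after this patch: vector checks first (omitted here), then
   SSSE3, then the SSE2 baseline.  */
static memcmp_fn *
resolve_memcmp (void)
{
  return have_ssse3 () ? my_memcmp_ssse3 : my_memcmp_sse2;
}

int my_memcmp (const void *, const void *, size_t)
  __attribute__ ((ifunc ("resolve_memcmp")));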
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ /dev/null
@@ -1,804 +0,0 @@
-/* memcmp with SSE4.1, wmemcmp with SSE4.1
-   Copyright (C) 2010-2021 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-#  define MEMCMP __memcmp_sse4_1
-# endif
-
-#ifdef USE_AS_WMEMCMP
-# define CMPEQ pcmpeqd
-# define CHAR_SIZE 4
-#else
-# define CMPEQ pcmpeqb
-# define CHAR_SIZE 1
-#endif
-
-
-/* Warning!
-   wmemcmp has to use SIGNED comparison for elements.
-   memcmp has to use UNSIGNED comparison for elements.
-*/
-
-	.section .text.sse4.1,"ax",@progbits
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
-	shl	$2, %RDX_LP
-# elif defined __ILP32__
-	/* Clear the upper 32 bits.  */
-	mov	%edx, %edx
-# endif
-	cmp	$79, %RDX_LP
-	ja	L(79bytesormore)
-
-	cmp	$CHAR_SIZE, %RDX_LP
-	jbe	L(firstbyte)
-
-	/* N in (CHAR_SIZE, 79) bytes.  */
-	cmpl	$32, %edx
-	ja	L(more_32_bytes)
-
-	cmpl	$16, %edx
-	jae	L(16_to_32_bytes)
-
-# ifndef USE_AS_WMEMCMP
-	cmpl	$8, %edx
-	jae	L(8_to_16_bytes)
-
-	cmpl	$4, %edx
-	jb	L(2_to_3_bytes)
-
-	movl	(%rdi), %eax
-	movl	(%rsi), %ecx
-
-	bswap	%eax
-	bswap	%ecx
-
-	shlq	$32, %rax
-	shlq	$32, %rcx
-
-	movl	-4(%rdi, %rdx), %edi
-	movl	-4(%rsi, %rdx), %esi
-
-	bswap	%edi
-	bswap	%esi
-
-	orq	%rdi, %rax
-	orq	%rsi, %rcx
-	subq	%rcx, %rax
-	cmovne	%edx, %eax
-	sbbl	%ecx, %ecx
-	orl	%ecx, %eax
-	ret
-
-	.p2align 4,, 8
-L(2_to_3_bytes):
-	movzwl	(%rdi), %eax
-	movzwl	(%rsi), %ecx
-	shll	$8, %eax
-	shll	$8, %ecx
-	bswap	%eax
-	bswap	%ecx
-	movzbl	-1(%rdi, %rdx), %edi
-	movzbl	-1(%rsi, %rdx), %esi
-	orl	%edi, %eax
-	orl	%esi, %ecx
-	subl	%ecx, %eax
-	ret
-
-	.p2align 4,, 8
-L(8_to_16_bytes):
-	movq	(%rdi), %rax
-	movq	(%rsi), %rcx
-
-	bswap	%rax
-	bswap	%rcx
-
-	subq	%rcx, %rax
-	jne	L(8_to_16_bytes_done)
-
-	movq	-8(%rdi, %rdx), %rax
-	movq	-8(%rsi, %rdx), %rcx
-
-	bswap	%rax
-	bswap	%rcx
-
-	subq	%rcx, %rax
-
-L(8_to_16_bytes_done):
-	cmovne	%edx, %eax
-	sbbl	%ecx, %ecx
-	orl	%ecx, %eax
-	ret
-# else
-	xorl	%eax, %eax
-	movl	(%rdi), %ecx
-	cmpl	(%rsi), %ecx
-	jne	L(8_to_16_bytes_done)
-	movl	4(%rdi), %ecx
-	cmpl	4(%rsi), %ecx
-	jne	L(8_to_16_bytes_done)
-	movl	-4(%rdi, %rdx), %ecx
-	cmpl	-4(%rsi, %rdx), %ecx
-	jne	L(8_to_16_bytes_done)
-	ret
-# endif
-
-	.p2align 4,, 3
-L(ret_zero):
-	xorl	%eax, %eax
-L(zero):
-	ret
-
-	.p2align 4,, 8
-L(firstbyte):
-	jb	L(ret_zero)
-# ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	(%rdi), %ecx
-	cmpl	(%rsi), %ecx
-	je	L(zero)
-L(8_to_16_bytes_done):
-	setg	%al
-	leal	-1(%rax, %rax), %eax
-# else
-	movzbl	(%rdi), %eax
-	movzbl	(%rsi), %ecx
-	sub	%ecx, %eax
-# endif
-	ret
-
-	.p2align 4
-L(vec_return_begin_48):
-	addq	$16, %rdi
-	addq	$16, %rsi
-L(vec_return_begin_32):
-	bsfl	%eax, %eax
-# ifdef USE_AS_WMEMCMP
-	movl	32(%rdi, %rax), %ecx
-	xorl	%edx, %edx
-	cmpl	32(%rsi, %rax), %ecx
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
-# else
-	movzbl	32(%rsi, %rax), %ecx
-	movzbl	32(%rdi, %rax), %eax
-	subl	%ecx, %eax
-# endif
-	ret
-
-	.p2align 4
-L(vec_return_begin_16):
-	addq	$16, %rdi
-	addq	$16, %rsi
-L(vec_return_begin):
-	bsfl	%eax, %eax
-# ifdef USE_AS_WMEMCMP
-	movl	(%rdi, %rax), %ecx
-	xorl	%edx, %edx
-	cmpl	(%rsi, %rax), %ecx
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
-# else
-	movzbl	(%rsi, %rax), %ecx
-	movzbl	(%rdi, %rax), %eax
-	subl	%ecx, %eax
-# endif
-	ret
-
-	.p2align 4
-L(vec_return_end_16):
-	subl	$16, %edx
-L(vec_return_end):
-	bsfl	%eax, %eax
-	addl	%edx, %eax
-# ifdef USE_AS_WMEMCMP
-	movl	-16(%rdi, %rax), %ecx
-	xorl	%edx, %edx
-	cmpl	-16(%rsi, %rax), %ecx
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
-# else
-	movzbl	-16(%rsi, %rax), %ecx
-	movzbl	-16(%rdi, %rax), %eax
-	subl	%ecx, %eax
-# endif
-	ret
-
-	.p2align 4,, 8
-L(more_32_bytes):
-	movdqu	(%rdi), %xmm0
-	movdqu	(%rsi), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqu	16(%rdi), %xmm0
-	movdqu	16(%rsi), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	cmpl	$64, %edx
-	jbe	L(32_to_64_bytes)
-	movdqu	32(%rdi), %xmm0
-	movdqu	32(%rsi), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_32)
-
-	.p2align 4,, 6
-L(32_to_64_bytes):
-	movdqu	-32(%rdi, %rdx), %xmm0
-	movdqu	-32(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end_16)
-
-	movdqu	-16(%rdi, %rdx), %xmm0
-	movdqu	-16(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end)
-	ret
-
-	.p2align 4
-L(16_to_32_bytes):
-	movdqu	(%rdi), %xmm0
-	movdqu	(%rsi), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqu	-16(%rdi, %rdx), %xmm0
-	movdqu	-16(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end)
-	ret
-
-
-	.p2align 4
-L(79bytesormore):
-	movdqu	(%rdi), %xmm0
-	movdqu	(%rsi), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-
-	mov	%rsi, %rcx
-	and	$-16, %rsi
-	add	$16, %rsi
-	sub	%rsi, %rcx
-
-	sub	%rcx, %rdi
-	add	%rcx, %rdx
-	test	$0xf, %rdi
-	jz	L(2aligned)
-
-	cmp	$128, %rdx
-	ja	L(128bytesormore)
-
-	.p2align 4,, 6
-L(less128bytes):
-	movdqu	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqu	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqu	32(%rdi), %xmm1
-	CMPEQ	32(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_32)
-
-	movdqu	48(%rdi), %xmm1
-	CMPEQ	48(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_48)
-
-	cmp	$96, %rdx
-	jb	L(32_to_64_bytes)
-
-	addq	$64, %rdi
-	addq	$64, %rsi
-	subq	$64, %rdx
-
-	.p2align 4,, 6
-L(last_64_bytes):
-	movdqu	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqu	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqu	-32(%rdi, %rdx), %xmm0
-	movdqu	-32(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end_16)
-
-	movdqu	-16(%rdi, %rdx), %xmm0
-	movdqu	-16(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end)
-	ret
-
-	.p2align 4
-L(128bytesormore):
-	cmp	$256, %rdx
-	ja	L(unaligned_loop)
-L(less256bytes):
-	movdqu	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqu	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqu	32(%rdi), %xmm1
-	CMPEQ	32(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_32)
-
-	movdqu	48(%rdi), %xmm1
-	CMPEQ	48(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_48)
-
-	addq	$64, %rdi
-	addq	$64, %rsi
-
-	movdqu	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqu	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqu	32(%rdi), %xmm1
-	CMPEQ	32(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_32)
-
-	movdqu	48(%rdi), %xmm1
-	CMPEQ	48(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_48)
-
-	addq	$-128, %rdx
-	subq	$-64, %rsi
-	subq	$-64, %rdi
-
-	cmp	$64, %rdx
-	ja	L(less128bytes)
-
-	cmp	$32, %rdx
-	ja	L(last_64_bytes)
-
-	movdqu	-32(%rdi, %rdx), %xmm0
-	movdqu	-32(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end_16)
-
-	movdqu	-16(%rdi, %rdx), %xmm0
-	movdqu	-16(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end)
-	ret
-
-	.p2align 4
-L(unaligned_loop):
-# ifdef DATA_CACHE_SIZE_HALF
-	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
-# else
-	mov	__x86_data_cache_size_half(%rip), %R8_LP
-# endif
-	movq	%r8, %r9
-	addq	%r8, %r8
-	addq	%r9, %r8
-	cmpq	%r8, %rdx
-	ja	L(L2_L3_cache_unaligned)
-	sub	$64, %rdx
-	.p2align 4
-L(64bytesormore_loop):
-	movdqu	(%rdi), %xmm0
-	movdqu	16(%rdi), %xmm1
-	movdqu	32(%rdi), %xmm2
-	movdqu	48(%rdi), %xmm3
-
-	CMPEQ	(%rsi), %xmm0
-	CMPEQ	16(%rsi), %xmm1
-	CMPEQ	32(%rsi), %xmm2
-	CMPEQ	48(%rsi), %xmm3
-
-	pand	%xmm0, %xmm1
-	pand	%xmm2, %xmm3
-	pand	%xmm1, %xmm3
-
-	pmovmskb %xmm3, %eax
-	incw	%ax
-	jnz	L(64bytesormore_loop_end)
-
-	add	$64, %rsi
-	add	$64, %rdi
-	sub	$64, %rdx
-	ja	L(64bytesormore_loop)
-
-	.p2align 4,, 6
-L(loop_tail):
-	addq	%rdx, %rdi
-	movdqu	(%rdi), %xmm0
-	movdqu	16(%rdi), %xmm1
-	movdqu	32(%rdi), %xmm2
-	movdqu	48(%rdi), %xmm3
-
-	addq	%rdx, %rsi
-	movdqu	(%rsi), %xmm4
-	movdqu	16(%rsi), %xmm5
-	movdqu	32(%rsi), %xmm6
-	movdqu	48(%rsi), %xmm7
-
-	CMPEQ	%xmm4, %xmm0
-	CMPEQ	%xmm5, %xmm1
-	CMPEQ	%xmm6, %xmm2
-	CMPEQ	%xmm7, %xmm3
-
-	pand	%xmm0, %xmm1
-	pand	%xmm2, %xmm3
-	pand	%xmm1, %xmm3
-
-	pmovmskb %xmm3, %eax
-	incw	%ax
-	jnz	L(64bytesormore_loop_end)
-	ret
-
-L(L2_L3_cache_unaligned):
-	subq	$64, %rdx
-	.p2align 4
-L(L2_L3_unaligned_128bytes_loop):
-	prefetchnta 0x1c0(%rdi)
-	prefetchnta 0x1c0(%rsi)
-
-	movdqu	(%rdi), %xmm0
-	movdqu	16(%rdi), %xmm1
-	movdqu	32(%rdi), %xmm2
-	movdqu	48(%rdi), %xmm3
-
-	CMPEQ	(%rsi), %xmm0
-	CMPEQ	16(%rsi), %xmm1
-	CMPEQ	32(%rsi), %xmm2
-	CMPEQ	48(%rsi), %xmm3
-
-	pand	%xmm0, %xmm1
-	pand	%xmm2, %xmm3
-	pand	%xmm1, %xmm3
-
-	pmovmskb %xmm3, %eax
-	incw	%ax
-	jnz	L(64bytesormore_loop_end)
-
-	add	$64, %rsi
-	add	$64, %rdi
-	sub	$64, %rdx
-	ja	L(L2_L3_unaligned_128bytes_loop)
-	jmp	L(loop_tail)
-
-
-/* This case is for machines which are sensitive to unaligned
-   instructions.  */
-	.p2align 4
-L(2aligned):
-	cmp	$128, %rdx
-	ja	L(128bytesormorein2aligned)
-L(less128bytesin2aligned):
-	movdqa	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqa	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqa	32(%rdi), %xmm1
-	CMPEQ	32(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_32)
-
-	movdqa	48(%rdi), %xmm1
-	CMPEQ	48(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_48)
-
-	cmp	$96, %rdx
-	jb	L(32_to_64_bytes)
-
-	addq	$64, %rdi
-	addq	$64, %rsi
-	subq	$64, %rdx
-
-	.p2align 4,, 6
-L(aligned_last_64_bytes):
-	movdqa	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqa	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqu	-32(%rdi, %rdx), %xmm0
-	movdqu	-32(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end_16)
-
-	movdqu	-16(%rdi, %rdx), %xmm0
-	movdqu	-16(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end)
-	ret
-
-	.p2align 4
-L(128bytesormorein2aligned):
-	cmp	$256, %rdx
-	ja	L(aligned_loop)
-L(less256bytesin2alinged):
-	movdqa	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqa	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqa	32(%rdi), %xmm1
-	CMPEQ	32(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_32)
-
-	movdqa	48(%rdi), %xmm1
-	CMPEQ	48(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_48)
-
-	addq	$64, %rdi
-	addq	$64, %rsi
-
-	movdqa	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqa	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqa	32(%rdi), %xmm1
-	CMPEQ	32(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_32)
-
-	movdqa	48(%rdi), %xmm1
-	CMPEQ	48(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_48)
-
-	addq	$-128, %rdx
-	subq	$-64, %rsi
-	subq	$-64, %rdi
-
-	cmp	$64, %rdx
-	ja	L(less128bytesin2aligned)
-
-	cmp	$32, %rdx
-	ja	L(aligned_last_64_bytes)
-
-	movdqu	-32(%rdi, %rdx), %xmm0
-	movdqu	-32(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end_16)
-
-	movdqu	-16(%rdi, %rdx), %xmm0
-	movdqu	-16(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end)
-	ret
-
-	.p2align 4
-L(aligned_loop):
-# ifdef DATA_CACHE_SIZE_HALF
-	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
-# else
-	mov	__x86_data_cache_size_half(%rip), %R8_LP
-# endif
-	movq	%r8, %r9
-	addq	%r8, %r8
-	addq	%r9, %r8
-	cmpq	%r8, %rdx
-	ja	L(L2_L3_cache_aligned)
-
-	sub	$64, %rdx
-	.p2align 4
-L(64bytesormore_loopin2aligned):
-	movdqa	(%rdi), %xmm0
-	movdqa	16(%rdi), %xmm1
-	movdqa	32(%rdi), %xmm2
-	movdqa	48(%rdi), %xmm3
-
-	CMPEQ	(%rsi), %xmm0
-	CMPEQ	16(%rsi), %xmm1
-	CMPEQ	32(%rsi), %xmm2
-	CMPEQ	48(%rsi), %xmm3
-
-	pand	%xmm0, %xmm1
-	pand	%xmm2, %xmm3
-	pand	%xmm1, %xmm3
-
-	pmovmskb %xmm3, %eax
-	incw	%ax
-	jnz	L(64bytesormore_loop_end)
-	add	$64, %rsi
-	add	$64, %rdi
-	sub	$64, %rdx
-	ja	L(64bytesormore_loopin2aligned)
-	jmp	L(loop_tail)
-
-L(L2_L3_cache_aligned):
-	subq	$64, %rdx
-	.p2align 4
-L(L2_L3_aligned_128bytes_loop):
-	prefetchnta 0x1c0(%rdi)
-	prefetchnta 0x1c0(%rsi)
-	movdqa	(%rdi), %xmm0
-	movdqa	16(%rdi), %xmm1
-	movdqa	32(%rdi), %xmm2
-	movdqa	48(%rdi), %xmm3
-
-	CMPEQ	(%rsi), %xmm0
-	CMPEQ	16(%rsi), %xmm1
-	CMPEQ	32(%rsi), %xmm2
-	CMPEQ	48(%rsi), %xmm3
-
-	pand	%xmm0, %xmm1
-	pand	%xmm2, %xmm3
-	pand	%xmm1, %xmm3
-
-	pmovmskb %xmm3, %eax
-	incw	%ax
-	jnz	L(64bytesormore_loop_end)
-
-	addq	$64, %rsi
-	addq	$64, %rdi
-	subq	$64, %rdx
-	ja	L(L2_L3_aligned_128bytes_loop)
-	jmp	L(loop_tail)
-
-	.p2align 4
-L(64bytesormore_loop_end):
-	pmovmskb %xmm0, %ecx
-	incw	%cx
-	jnz	L(loop_end_ret)
-
-	pmovmskb %xmm1, %ecx
-	notw	%cx
-	sall	$16, %ecx
-	jnz	L(loop_end_ret)
-
-	pmovmskb %xmm2, %ecx
-	notw	%cx
-	shlq	$32, %rcx
-	jnz	L(loop_end_ret)
-
-	addq	$48, %rdi
-	addq	$48, %rsi
-	movq	%rax, %rcx
-
-	.p2align 4,, 6
-L(loop_end_ret):
-	bsfq	%rcx, %rcx
-# ifdef USE_AS_WMEMCMP
-	movl	(%rdi, %rcx), %eax
-	xorl	%edx, %edx
-	cmpl	(%rsi, %rcx), %eax
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
-# else
-	movzbl	(%rdi, %rcx), %eax
-	movzbl	(%rsi, %rcx), %ecx
-	subl	%ecx, %eax
-# endif
-	ret
-END (MEMCMP)
-#endif
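Editor's footnote on the `Warning!` comment preserved in the deleted file above: memcmp must rank elements as unsigned bytes, while wmemcmp must rank them as signed `wchar_t` values, which is why the assembly selects the `setg`-based signed return only under USE_AS_WMEMCMP and uses `movzbl`/`subl` byte arithmetic otherwise. A small standalone illustration of the difference, assuming x86-64 glibc, where `wchar_t` is a signed 32-bit type:

#include <stdio.h>
#include <string.h>
#include <wchar.h>

int
main (void)
{
  /* memcmp: bytes compare as unsigned, so 0xff > 0x01.  */
  unsigned char a[1] = { 0xff };
  unsigned char b[1] = { 0x01 };
  printf ("memcmp:  %d\n", memcmp (a, b, 1));	/* Positive.  */

  /* wmemcmp: elements compare as wchar_t values, and on this target
     (wchar_t) -1 is all-ones bits yet sorts below 1.  */
  wchar_t wa[1] = { (wchar_t) -1 };
  wchar_t wb[1] = { 1 };
  printf ("wmemcmp: %d\n", wmemcmp (wa, wb, 1));	/* Negative.  */
  return 0;
}

The same all-ones bit pattern therefore sorts highest under memcmp and lowest under wmemcmp, so a shared code path must pick its comparison signedness per entry point.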