/* Mirror of https://sourceware.org/git/glibc.git
   (synced 2024-12-26 12:41:05 +00:00).
   Mirrored listing: 591 lines, 14 KiB.  */
/* memcmp with SSE2.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
   so we need this to build for ISA V2 builds.  */
#if ISA_SHOULD_BUILD (2)

#include <sysdep.h>

/* Name of the symbol defined below.  A file that includes this one may
   define MEMCMP beforehand to select a different name.  */
# ifndef MEMCMP
#  define MEMCMP	__memcmp_sse2
# endif

/* Element width.  With USE_AS_WMEMCMP elements are 4 bytes wide and are
   compared with pcmpeqd; otherwise they are single bytes compared with
   pcmpeqb.  */
# ifdef USE_AS_WMEMCMP
#  define PCMPEQ	pcmpeqd
#  define CHAR_SIZE	4
#  define SIZE_OFFSET	(0)
# else
#  define PCMPEQ	pcmpeqb
#  define CHAR_SIZE	1
# endif

/* CHECK_CMP (x, y) tests the pmovmskb mask in y against x.  With
   USE_AS_MEMCMPEQ it is `subl`, which leaves the (possibly non-zero)
   difference in y so it can be returned as-is.  Otherwise it is `cmpl`,
   which only sets flags and keeps the mask in y for the code that
   locates the mismatch.

   SIZE_OFFSET is the bias, in elements, that L(more_1x_vec) subtracts
   from the length in rdx; it is 0 when rdx is left unbiased.  It refers
   to CHAR_PER_VEC, which is defined further down; that is fine because
   macro expansion happens at the point of use.  */
# ifdef USE_AS_MEMCMPEQ
#  define SIZE_OFFSET	(0)
#  define CHECK_CMP(x, y)	subl x, y
# else
#  ifndef SIZE_OFFSET
#   define SIZE_OFFSET	(CHAR_PER_VEC * 2)
#  endif
#  define CHECK_CMP(x, y)	cmpl x, y
# endif

/* Bytes per XMM register and elements per XMM register.  */
# define VEC_SIZE	16
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

	.text
/* MEMCMP (s1, s2, n) -- SSE2 memcmp.  The same source is also built as
   wmemcmp (USE_AS_WMEMCMP) and as an equality-only memcmp
   (USE_AS_MEMCMPEQ).

   In:	rdi = s1, rsi = s2, rdx = n in elements of CHAR_SIZE bytes.
   Out:	eax = 0 if the first n elements are equal.  Otherwise:
	  - default build: negative or positive according to the first
	    differing byte, compared as unsigned;
	  - USE_AS_WMEMCMP: -1 or 1 according to a signed compare of the
	    first differing 4-byte element;
	  - USE_AS_MEMCMPEQ: some non-zero value.
   Clobbers: rcx, rdx, rsi, rdi, xmm0-xmm3, flags.

   ecx is loaded with 0xffff (the pmovmskb mask of a fully equal
   vector) before any vector compare.  This first part handles
   n <= CHAR_PER_VEC; larger sizes branch to L(more_1x_vec).  */
ENTRY(MEMCMP)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
# ifdef USE_AS_WMEMCMP
	/* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
	   in ecx for code size. This is preferable to using `incw` as
	   it avoids partial register stalls on older hardware (pre
	   SnB).  */
	movl	$0xffff, %ecx
# endif
	cmpq	$CHAR_PER_VEC, %rdx
	ja	L(more_1x_vec)

# ifdef USE_AS_WMEMCMP
	/* saves a byte of code keeping the fall through path n = [2, 4]
	   in the initial cache line.  */
	decl	%edx
	jle	L(cmp_0_1)

	/* n in [2, 4] and edx = n - 1.  Compare the first two
	   elements...  */
	movq	(%rsi), %xmm0
	movq	(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_vec_start_0)

	/* ...and the last two (the two 8-byte windows may overlap).  */
	movq	-4(%rsi, %rdx, CHAR_SIZE), %xmm0
	movq	-4(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_vec_end_0_adj)
	/* eax is zero here; fall through to the shared `ret` below.  */
# else
	cmpl	$8, %edx
	ja	L(cmp_9_16)

	cmpl	$4, %edx
	jb	L(cmp_0_3)

	/* n in [4, 8]: compare the first and the last 4 bytes (the two
	   windows may overlap).  */
#  ifdef USE_AS_MEMCMPEQ
	movl	(%rsi), %eax
	subl	(%rdi), %eax

	movl	-4(%rsi, %rdx), %esi
	subl	-4(%rdi, %rdx), %esi

	/* Non-zero iff either window differs.  */
	orl	%esi, %eax
	ret
#  else
	/* Combine comparisons for lo and hi 4-byte comparisons.  */
	movl	-4(%rsi, %rdx), %ecx
	movl	-4(%rdi, %rdx), %eax
	shlq	$32, %rcx
	shlq	$32, %rax
	movl	(%rsi), %esi
	movl	(%rdi), %edi
	orq	%rsi, %rcx
	orq	%rdi, %rax
	/* Only compute proper return if not-equal.  */
	cmpq	%rcx, %rax
	jnz	L(ret_nonzero)
	xorl	%eax, %eax
	ret
#  endif

	.p2align 4,, 10
L(cmp_9_16):
	/* n in [9, 16]: compare the first and the last 8 bytes (the two
	   windows may overlap).  */
#  ifdef USE_AS_MEMCMPEQ
	movq	(%rsi), %rax
	subq	(%rdi), %rax

	movq	-8(%rsi, %rdx), %rcx
	subq	-8(%rdi, %rdx), %rcx
	orq	%rcx, %rax
	/* Convert 64 bit -> 32 bit boolean (we should have made the ABI
	   return long).  */
	setnz	%cl
	movzbl	%cl, %eax
#  else
	movq	(%rsi), %rcx
	movq	(%rdi), %rax
	/* Only compute proper return if not-equal.  */
	cmpq	%rcx, %rax
	jnz	L(ret_nonzero)

	movq	-8(%rsi, %rdx, CHAR_SIZE), %rcx
	movq	-8(%rdi, %rdx, CHAR_SIZE), %rax
	/* Only compute proper return if not-equal.  */
	cmpq	%rcx, %rax
	jnz	L(ret_nonzero)
	xorl	%eax, %eax
#  endif
# endif
	ret

/* L(cmp_0_1): n is 0 or 1.  L(cmp_0_0): return 0.  */
	.p2align 4,, 8
L(cmp_0_1):
	/* Flags come from the `decl %edx` or `cmpl $1, %edx` executed
	   just before the branch here: ZF is set iff n == 1.  If it is
	   clear then n == 0 and the result is 0.  */
	jne	L(cmp_0_0)
# ifdef USE_AS_WMEMCMP
	movl	(%rdi), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi), %ecx
	je	L(cmp_0_0)
	/* edx = 1 if s1[0] > s2[0] (signed) else 0; eax = 2 * edx - 1,
	   i.e. 1 or -1.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	/* Single byte: the difference of the zero-extended bytes is the
	   result (zero iff they are equal).  */
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %ecx
	subl	%ecx, %eax
# endif
	ret

	/* Fits in aligning bytes.  */
L(cmp_0_0):
	xorl	%eax, %eax
	ret

/* Return path for a mismatch in the first vector (wmemcmp build), or
   the scalar helpers L(ret_nonzero) and L(cmp_0_3) (byte builds).  */
# ifdef USE_AS_WMEMCMP
	.p2align 4
L(ret_nonzero_vec_start_0):
	/* eax = mask - 0xffff; its lowest set bit is at the byte offset
	   of the first element that differs.  */
	bsfl	%eax, %eax
	movl	(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi, %rax), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
	ret
# else

#  ifndef USE_AS_MEMCMPEQ
	.p2align 4,, 14
L(ret_nonzero):
	/* In: rax = bytes loaded from s1, rcx = bytes loaded from s2,
	   known to differ.  Out: eax = -1 or 1.  */
	/* Need to bswap to get proper return without branch.  */
	bswapq	%rcx
	bswapq	%rax
	subq	%rcx, %rax
	/* eax = -1 if the subtraction borrowed (s1 < s2), else 0; then
	   force the low bit so that 0 becomes 1.  */
	sbbl	%eax, %eax
	orl	$1, %eax
	ret
#  endif

	.p2align 4
L(cmp_0_3):
	/* n in [0, 3].  */
#  ifdef USE_AS_MEMCMPEQ
	/* No reason to add to dependency chain on rdx. Saving the
	   bytes here doesn't change number of fetch blocks.  */
	cmpl	$1, %edx
	jbe	L(cmp_0_1)
#  else
	/* We need the code size to prevent taking an extra fetch block.
	 */
	decl	%edx
	jle	L(cmp_0_1)
#  endif
	/* n in [2, 3]: load the first two bytes of each input.  */
	movzwl	(%rsi), %ecx
	movzwl	(%rdi), %eax

#  ifdef USE_AS_MEMCMPEQ
	subl	%ecx, %eax

	/* rdx still holds n here, so this is the last byte.  */
	movzbl	-1(%rsi, %rdx), %esi
	movzbl	-1(%rdi, %rdx), %edi
	subl	%edi, %esi
	orl	%esi, %eax
#  else
	/* Move the two bytes to the top of the register in memory
	   order (first byte most significant).  */
	bswapl	%ecx
	bswapl	%eax

	/* Implicit right shift by one. We just need to displace the
	   sign bits.  */
	shrl	%ecx
	shrl	%eax

	/* Eat a partial register stall here. Saves code stopping
	   L(cmp_0_3) from bleeding into the next fetch block and saves
	   an ALU.  */
	/* rdx holds n - 1 here, so this merges the last byte into the
	   low 8 bits, which the shifts above left clear.  */
	movb	(%rsi, %rdx), %cl
	movzbl	(%rdi, %rdx), %edi
	orl	%edi, %eax
	subl	%ecx, %eax
#  endif
	ret
# endif

/* L(more_1x_vec): n > CHAR_PER_VEC.  Compares the first vector, then
   either handles n <= 2 * CHAR_PER_VEC with one (possibly overlapping)
   vector ending at the end of the buffers, or goes to L(more_2x_vec).  */
	.p2align 5
L(more_1x_vec):
# ifndef USE_AS_WMEMCMP
	/* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
	   in ecx for code size. This is preferable to using `incw` as
	   it avoids partial register stalls on older hardware (pre
	   SnB).  */
	movl	$0xffff, %ecx
# endif
	movups	(%rsi), %xmm0
	movups	(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	/* eax becomes zero iff all 16 bytes compared equal.  */
	subl	%ecx, %eax
	jnz	L(ret_nonzero_vec_start_0)
# if SIZE_OFFSET == 0
	cmpq	$(CHAR_PER_VEC * 2), %rdx
# else
	/* Offset rdx. Saves just enough code size to keep the
	   L(last_2x_vec) case and the non-zero return in a single
	   cache line.  */
	subq	$(CHAR_PER_VEC * 2), %rdx
# endif
	ja	L(more_2x_vec)

	/* Last vector: the SIZE_OFFSET term in the displacement undoes
	   the bias applied to rdx above.  */
	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
# ifndef USE_AS_MEMCMPEQ
	/* Don't use `incw ax` as machines this code runs on are liable
	   to have partial register stall.  */
	jnz	L(ret_nonzero_vec_end_0)
# else
	/* Various return targets for memcmpeq. Will always be hot in
	   Icache and get short encoding.  */
L(ret_nonzero_vec_start_1):
L(ret_nonzero_vec_start_0):
L(ret_nonzero_vec_end_0):
# endif
	ret

/* Return paths (not built for USE_AS_MEMCMPEQ) for a mismatch found in
   the vector that ends at the end of the buffers, and, for the byte
   build, in the first vector.  In: eax = pmovmskb mask - 0xffff
   (non-zero), rdx = length as biased by the caller.  */
# ifndef USE_AS_MEMCMPEQ
#  ifdef USE_AS_WMEMCMP
	.p2align 4
L(ret_nonzero_vec_end_0_adj):
	/* The n in [2, 4] path arrives with edx = n - 1 and used an
	   8-byte window; adjust edx so that the addressing below lands
	   on that window.  */
	addl	$3, %edx
#  else
	.p2align 4,, 8
#  endif
L(ret_nonzero_vec_end_0):
	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	leal	(%rax, %rdx, CHAR_SIZE), %eax
	movl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	/* Use `addq` instead of `addl` here so that even if `rax` + `rdx`
	   is negative the value of the sum will be usable as a 64-bit
	   offset (negative 32-bit numbers zero-extend to a large and
	   often out-of-bounds 64-bit offsets).  Note that `rax` + `rdx`
	   >= 0 is an invariant when `memcmp` is used correctly, but if
	   the input strings `rsi`/`rdi` are concurrently modified as the
	   function runs (there is a Data-Race) it is possible for `rax`
	   + `rdx` to be negative.  Given that there is virtually no
	   extra cost to using `addq` instead of `addl` we may as well
	   protect the data-race case.  */
	addq	%rdx, %rax
	movzbl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret
#  ifndef USE_AS_WMEMCMP
	.p2align 4,, 10
L(ret_nonzero_vec_start_0):
	/* Mismatch in the first vector: eax = byte index of the first
	   difference, then return s1[idx] - s2[idx].  */
	bsfl	%eax, %eax
	movzbl	(%rsi, %rax), %ecx
	movzbl	(%rdi, %rax), %eax
	subl	%ecx, %eax
	ret
#  endif
# endif

/* L(more_2x_vec): n > 2 * CHAR_PER_VEC and the first vector is already
   known to be equal.  rdx holds n - SIZE_OFFSET, which is why every
   length threshold below has SIZE_OFFSET subtracted.  Handles up to
   8 vectors; longer inputs go to L(more_8x_vec).  When nothing differs
   in the vectors checked here, control reaches L(last_2x_vec), either
   by branch or by falling off the end of this block.  */
	.p2align 5
L(more_2x_vec):
	movups	(VEC_SIZE * 1)(%rsi), %xmm0
	movups	(VEC_SIZE * 1)(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_vec_start_1)

	cmpq	$(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
	jbe	L(last_2x_vec)

	cmpq	$(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
	ja	L(more_8x_vec)

	/* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time.
	   This can harm performance if non-zero return in [65, 80] or
	   [97, 112] but helps performance otherwise. Generally zero-
	   return is hotter.  */
	movups	(VEC_SIZE * 2)(%rsi), %xmm0
	movups	(VEC_SIZE * 2)(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	movups	(VEC_SIZE * 3)(%rsi), %xmm2
	movups	(VEC_SIZE * 3)(%rdi), %xmm3
	PCMPEQ	%xmm2, %xmm3
	/* xmm3 = equality of vector 2 AND vector 3; xmm1 keeps vector 2
	   alone for the return path.  */
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
	jnz	L(ret_nonzero_vec_start_2_3)

	cmpl	$(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
	jbe	L(last_2x_vec)

	movups	(VEC_SIZE * 4)(%rsi), %xmm0
	movups	(VEC_SIZE * 4)(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	movups	(VEC_SIZE * 5)(%rsi), %xmm2
	movups	(VEC_SIZE * 5)(%rdi), %xmm3
	PCMPEQ	%xmm2, %xmm3
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
# ifdef USE_AS_MEMCMPEQ
	jz	L(last_2x_vec)
	/* CHECK_CMP is `subl` in this build, so eax is already the
	   non-zero result.  */
	ret
# else
	jnz	L(ret_nonzero_vec_start_4_5)
# endif
/* L(last_2x_vec): compare the two vectors that end at the end of the
   buffers (they may overlap data that was already compared), followed
   by the return paths that locate a mismatch inside a known vector.
   In: rdx = remaining length biased by -SIZE_OFFSET, ecx = 0xffff.  */
	.p2align 4
L(last_2x_vec):
	movups	(VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
	movups	(VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ	%xmm0, %xmm1
	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
	PCMPEQ	%xmm2, %xmm3
	pand	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	subl	%ecx, %eax
# ifdef USE_AS_MEMCMPEQ
	/* Various return targets for memcmpeq. Will always be hot in
	   Icache and get short encoding.  */
L(ret_nonzero_vec_start_2_3):
L(ret_nonzero_vec_start_4_5):
	ret
# else
	jnz	L(ret_nonzero_vec_end_1)
	ret

	.p2align 4,, 8
L(ret_nonzero_vec_end_1):
	/* ecx = equality mask of the second-to-last vector alone.  */
	pmovmskb %xmm1, %ecx
	/* High 16 bits of eax guaranteed to be all ones. Rotate them in
	   so we can do `or + not` with just `xor`.  */
	rorl	$16, %eax
	xorl	%ecx, %eax
	/* Partial register stall.  */

	/* Lowest set bit = byte offset of the first mismatch counted
	   from the start of the second-to-last vector.  */
	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	leal	(%rax, %rdx, CHAR_SIZE), %eax
	movl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	addl	%edx, %eax
	movzbl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret

	.p2align 4
L(ret_nonzero_vec_start_4_5):
	/* In: eax = combined equality mask of vectors 4 and 5 (left
	   intact by `cmpl`), xmm1 = equality of vector 4 alone.  Build
	   (mask45 << 16) + mask4 + 1: its lowest set bit is at the byte
	   offset of the first mismatch from the start of vector 4.  */
	pmovmskb %xmm1, %edx
	sall	$16, %eax
	leal	1(%rax, %rdx), %eax
	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 4)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 4)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	movzbl	(VEC_SIZE * 4)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 4)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret

	.p2align 4,, 8
L(ret_nonzero_vec_start_1):
	/* In: eax = mask - 0xffff for vector 1 (non-zero).  */
	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 1)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 1)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	movzbl	(VEC_SIZE * 1)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 1)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret
# endif

/* L(more_8x_vec): more than 8 vectors.  The first two vectors are
   already known to be equal.  The loop compares four vectors per
   iteration; the remainder is finished by one more pair of vectors
   and/or L(last_2x_vec).  When the final pair below differs, control
   falls off the end of this block into the return code that follows
   it.  */
	.p2align 4
L(more_8x_vec):
	/* rsi = s2 - s1, so that it can follow rdi once rdi has been
	   rounded down.  */
	subq	%rdi, %rsi
	/* rdx = s1 + (length in bytes) - VEC_SIZE * 6: the loop bound
	   for rdi.  */
	leaq	(VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
	/* Round rdi down to a VEC_SIZE boundary so that it can be used
	   as a PCMPEQ memory operand in the loop.  */
	andq	$(VEC_SIZE * -1), %rdi
	addq	%rdi, %rsi
	.p2align 4
L(loop_4x):
	movups	(VEC_SIZE * 2)(%rsi), %xmm0
	movups	(VEC_SIZE * 3)(%rsi), %xmm1

	PCMPEQ	(VEC_SIZE * 2)(%rdi), %xmm0
	PCMPEQ	(VEC_SIZE * 3)(%rdi), %xmm1

	movups	(VEC_SIZE * 4)(%rsi), %xmm2
	movups	(VEC_SIZE * 5)(%rsi), %xmm3

	PCMPEQ	(VEC_SIZE * 4)(%rdi), %xmm2
	PCMPEQ	(VEC_SIZE * 5)(%rdi), %xmm3

	/* Reduce to one mask in xmm3.  The partial results left in
	   xmm0, xmm1 and xmm2 are used by L(ret_nonzero_loop).  */
	pand	%xmm0, %xmm1
	pand	%xmm2, %xmm3
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_loop)

	addq	$(VEC_SIZE * 4), %rdi
	addq	$(VEC_SIZE * 4), %rsi
	cmpq	%rdi, %rdx
	ja	L(loop_4x)
	/* Get remaining length in edx.  */
	subl	%edi, %edx
	/* Restore offset so we can reuse L(last_2x_vec).  */
	addl	$(VEC_SIZE * 6 - SIZE_OFFSET), %edx
# ifdef USE_AS_WMEMCMP
	/* Bytes -> 4-byte elements.  */
	shrl	$2, %edx
# endif
	cmpl	$(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
	jbe	L(last_2x_vec)


	movups	(VEC_SIZE * 2)(%rsi), %xmm0
	movups	(VEC_SIZE * 2)(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	movups	(VEC_SIZE * 3)(%rsi), %xmm2
	movups	(VEC_SIZE * 3)(%rdi), %xmm3
	PCMPEQ	%xmm2, %xmm3
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
	jz	L(last_2x_vec)
/* Return paths for a mismatch in vectors 2-3 (entered by branch, or by
   falling through from the code just above) and for a mismatch found
   by L(loop_4x); then the end of the function.  */
# ifdef USE_AS_MEMCMPEQ
L(ret_nonzero_loop):
	/* eax is already non-zero on every way in.  */
	ret
# else

	.p2align 4
L(ret_nonzero_vec_start_2_3):
	/* In: eax = combined equality mask of vectors 2 and 3 (left
	   intact by `cmpl`), xmm1 = equality of vector 2 alone.  Build
	   (mask23 << 16) + mask2 + 1: its lowest set bit is at the byte
	   offset of the first mismatch from the start of vector 2.  */
	pmovmskb %xmm1, %edx
	sall	$16, %eax
	leal	1(%rax, %rdx), %eax

	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret

	.p2align 4
L(ret_nonzero_loop):
	/* In: xmm0 = equality of vector 2, xmm1 = vectors 2 AND 3,
	   xmm2 = vector 4, eax = (mask of all four) - 0xffff.  Builds a
	   64-bit value whose lowest set bit is at the byte offset of
	   the first mismatch from the start of vector 2.  */
	pmovmskb %xmm0, %ecx
	pmovmskb %xmm1, %edx
	sall	$(VEC_SIZE * 1), %edx
	/* Low half, covering vectors 2 and 3; it is zero when both are
	   fully equal.  */
	leal	1(%rcx, %rdx), %edx
	pmovmskb %xmm2, %ecx
	/* High 16 bits of eax guaranteed to be all ones. Rotate them in
	   so we can do `or + not` with just `xor`.  */
	rorl	$16, %eax
	xorl	%ecx, %eax

	/* High half, covering vectors 4 and 5.  */
	salq	$32, %rax
	orq	%rdx, %rax

	bsfq	%rax, %rax
#  ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret
# endif
END(MEMCMP)
#endif