glibc/sysdeps/x86_64/memcmp.S
Noah Goldstein fa7f63d8d6 x86_64: Add sse2 optimized __memcmpeq in memcmp-sse2.S
No bug. This commit does not modify any of the memcmp
implementation. It just adds __memcmpeq ifdefs to skip obvious cases
where computing the proper 1/-1 required by memcmp is not needed.
2021-10-27 13:03:46 -05:00

413 lines
7.8 KiB
ArmAsm

/* memcmp with SSE2
Copyright (C) 2009-2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
.text
/* memcmp: compare the %rdx bytes at (%rdi) with the %rdx bytes at (%rsi).
   The result is returned in %eax: zero when every byte matches, otherwise
   the difference of the first pair of unequal bytes (each zero-extended).
   When USE_AS_MEMCMPEQ is defined several mismatch paths skip that
   computation and simply return a nonzero value.

   Register roles set up here and relied on by the code that follows:
     %rdi  cursor into the first buffer
     %rsi  (second buffer - first buffer), so (%rdi, %rsi) addresses the
           byte of the second buffer that pairs with (%rdi)
     %r10  number of bytes still to compare.  */
ENTRY (memcmp)
#ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
#endif
/* Length 0: nothing to compare, return 0.  */
test %RDX_LP, %RDX_LP
jz L(finz)
/* Length 1: compare the single byte pair.  This branch is taken before
   %rsi is turned into an offset, so the target reads (%rsi) directly.  */
cmpq $1, %rdx
jbe L(finr1b)
/* From here on %rsi is the distance between the two buffers.  */
subq %rdi, %rsi
movq %rdx, %r10
/* 32 bytes or more go to the block path; shorter inputs fall through
   into L(small).  */
cmpq $32, %r10
jae L(gt32)
/* Handle small chunks and the last block of less than 32 bytes.
   On entry %r10 holds the number of bytes left (nonzero), %rdi points at
   the next byte of the first buffer and %rsi is the offset from the first
   buffer to the second.  The count is consumed one binary digit at a
   time: pieces of 1, 2, 4 and 8 bytes are compared when the matching bit
   of %r10 is set, and a final 16 byte piece is compared with SSE2.  In
   every step the subq that updates %r10 also sets ZF when that piece was
   the last one (the loads in front of it do not touch the flags); the
   loaded values are then handed to an exit stub that produces the return
   value.  */
L(small):
testq $1, %r10
jz L(s2b)
movzbl (%rdi), %eax
movzbl (%rdi, %rsi), %edx
subq $1, %r10
/* Last byte: L(finz1) returns %eax - %edx.  */
je L(finz1)
addq $1, %rdi
/* More bytes follow; return the difference now if this pair differs.  */
subl %edx, %eax
jnz L(exit)
/* 2 byte piece.  */
L(s2b):
testq $2, %r10
jz L(s4b)
movzwl (%rdi), %eax
movzwl (%rdi, %rsi), %edx
subq $2, %r10
/* Last piece: the equality-only build returns the plain difference, the
   ordering build lets L(fin2_7) locate the first differing byte.  */
#ifdef USE_AS_MEMCMPEQ
je L(finz1)
#else
je L(fin2_7)
#endif
addq $2, %rdi
cmpl %edx, %eax
#ifdef USE_AS_MEMCMPEQ
jnz L(neq_early)
#else
jnz L(fin2_7)
#endif
/* 4 byte piece, same scheme as above.  */
L(s4b):
testq $4, %r10
jz L(s8b)
movl (%rdi), %eax
movl (%rdi, %rsi), %edx
subq $4, %r10
#ifdef USE_AS_MEMCMPEQ
je L(finz1)
#else
je L(fin2_7)
#endif
addq $4, %rdi
cmpl %edx, %eax
#ifdef USE_AS_MEMCMPEQ
jnz L(neq_early)
#else
jnz L(fin2_7)
#endif
/* 8 byte piece.  A 64-bit difference does not fit the 32-bit return
   register, so the equality-only build folds it in L(sub_return8).  */
L(s8b):
testq $8, %r10
jz L(s16b)
movq (%rdi), %rax
movq (%rdi, %rsi), %rdx
subq $8, %r10
#ifdef USE_AS_MEMCMPEQ
je L(sub_return8)
#else
je L(fin2_7)
#endif
addq $8, %rdi
cmpq %rdx, %rax
#ifdef USE_AS_MEMCMPEQ
jnz L(neq_early)
#else
jnz L(fin2_7)
#endif
/* Final 16 byte piece.  There is no bit test here: reaching this label
   with a nonzero count below 32 means exactly 16 bytes are left.
   pcmpeqb/pmovmskb produce a 16-bit mask with one set bit per equal byte
   pair, so the mask equals 0xffff exactly when all 16 bytes match.  */
L(s16b):
movdqu (%rdi), %xmm1
movdqu (%rdi, %rsi), %xmm0
pcmpeqb %xmm0, %xmm1
#ifdef USE_AS_MEMCMPEQ
/* mask - 0xffff is zero only when everything matched; return it as is.  */
pmovmskb %xmm1, %eax
subl $0xffff, %eax
ret
#else
pmovmskb %xmm1, %edx
xorl %eax, %eax
subl $0xffff, %edx
jz L(finz)
/* The lowest set bit of (mask - 0xffff) is the lowest clear bit of the
   mask, i.e. the index of the first mismatching byte.  */
bsfl %edx, %ecx
/* %rcx = address of that byte in the first buffer; adding the offset in
   %rsi gives its partner in the second buffer.  */
leaq (%rdi, %rcx), %rcx
movzbl (%rcx), %eax
movzbl (%rsi, %rcx), %edx
jmp L(finz1)
#endif
/* Exit stubs shared by the compare paths.  */
.p2align 4,, 4
/* Length 1.  Reached before %rsi has been converted into an offset, so
   both registers are still plain pointers here.  */
L(finr1b):
movzbl (%rdi), %eax
movzbl (%rsi), %edx
/* Return %eax - %edx.  */
L(finz1):
subl %edx, %eax
/* Return with whatever is already in %eax.  */
L(exit):
ret
#ifdef USE_AS_MEMCMPEQ
.p2align 4,, 4
/* Equality-only result for an 8 byte piece: OR the low and high halves of
   the 64-bit difference so that %eax is nonzero iff %rax != %rdx.  */
L(sub_return8):
subq %rdx, %rax
movl %eax, %edx
shrq $32, %rax
orl %edx, %eax
ret
#else
.p2align 4,, 4
/* Ordering result for a 2, 4 or 8 byte piece held in %rax / %rdx (the
   8 byte step also jumps here).  The values were loaded little-endian, so
   the lowest differing byte of the registers is the first differing byte
   in memory.  */
L(fin2_7):
cmpq %rdx, %rax
jz L(finz)
/* The lowest set bit of (%rax - %rdx) is the lowest bit in which the two
   values differ.  */
movq %rax, %r11
subq %rdx, %r11
bsfq %r11, %rcx
/* Clear the low three bits of the bit index so it becomes the bit offset
   of the byte that contains it.  */
sarq $3, %rcx
salq $3, %rcx
/* Shift that byte down to bits 0-7 of each value and zero-extend it; the
   bits shifted in at the top are discarded by movzbl.  */
sarq %cl, %rax
movzbl %al, %eax
sarq %cl, %rdx
movzbl %dl, %edx
subl %edx, %eax
ret
#endif
.p2align 4,, 4
/* Buffers are equal (or the length is zero): return 0.  */
L(finz):
xorl %eax, %eax
ret
#ifdef USE_AS_MEMCMPEQ
.p2align 4,, 4
/* Equality-only build: a mismatch was found, return 1.  */
L(neq_early):
movl $1, %eax
ret
#endif
/* For blocks of 32 bytes or more
1. Advance one of the address pointers to be 16B aligned.
2. Treat the case of both address pointers aligned to 16B
separately to avoid movdqu.
3. Handle runs of 64 or more consecutive bytes with an unrolled
loop to reduce branches.
4. At least one address pointer is 16B aligned, so use the
memory-operand form of pcmpeqb.
*/
.p2align 4,, 4
/* Block path for lengths of 32 bytes or more, taken with %rdx still
   holding the length and %rsi already holding the distance between the
   two buffers (set up at function entry).
     %r11  one past the last byte of the first buffer
     %r10  loop bound: the end address rounded down to 32 or 64
     %edx  scratch for the pcmpeqb/pmovmskb mask; (mask - 0xffff) is
           nonzero when a 16 byte block contains a mismatch.  */
L(gt32):
movq %rdx, %r11
addq %rdi, %r11
movq %rdi, %r8
/* %r8 = misalignment of the first buffer within a 16 byte line.  */
andq $15, %r8
jz L(16am)
/* Both pointers may be misaligned. */
movdqu (%rdi), %xmm1
movdqu (%rdi, %rsi), %xmm0
pcmpeqb %xmm0, %xmm1
pmovmskb %xmm1, %edx
subl $0xffff, %edx
jnz L(neq)
/* Move %rdi up to the next 16 byte boundary (%rdi + 16 - %r8).  The bytes
   between that boundary and the end of the block just checked are
   compared again by the code below.  */
neg %r8
leaq 16(%rdi, %r8), %rdi
L(16am):
/* Handle two 16B aligned pointers separately. */
/* %rdi is 16 byte aligned here, so the low four bits of the distance in
   %rsi are zero exactly when the second buffer is aligned as well.  */
testq $15, %rsi
jz L(ATR)
/* Only the first buffer is aligned: load the second one with movdqu and
   use the first one as the memory operand of pcmpeqb.
   If %rdi is not 32 byte aligned, compare one block to make it so.  */
testq $16, %rdi
jz L(A32)
movdqu (%rdi, %rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
L(A32):
/* %r10 = end address rounded down to a multiple of 32.  When %rdi has
   already reached it, only the tail is left.  */
movq %r11, %r10
andq $-32, %r10
cmpq %r10, %rdi
jae L(mt16)
/* Pre-unroll to be ready for unrolled 64B loop. */
testq $32, %rdi
jz L(A64)
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
L(A64):
/* %r10 = end address rounded down to a multiple of 64; skip the unrolled
   loop when %rdi is already there.  */
movq %r11, %r10
andq $-64, %r10
cmpq %r10, %rdi
jae L(mt32)
/* Main loop: four 16 byte blocks (64 bytes) per iteration, until %rdi
   reaches %r10.  */
L(A64main):
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
cmpq %rdi, %r10
jne L(A64main)
/* Fewer than 64 bytes are left.  Recompute the 32 byte bound and compare
   32 bytes per iteration while %rdi is below it.  */
L(mt32):
movq %r11, %r10
andq $-32, %r10
cmpq %r10, %rdi
jae L(mt16)
L(A32main):
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
cmpq %rdi, %r10
jne L(A32main)
/* Tail: %r11 - %rdi bytes are left.  Zero means the buffers are equal;
   otherwise pass the count in %r10 to the small-size code.  */
L(mt16):
subq %rdi, %r11
je L(finz)
movq %r11, %r10
jmp L(small)
.p2align 4,, 4
/* A 16 byte block starting at (%rdi) contains a mismatch.  %edx holds
   (mask - 0xffff), where the mask has one set bit per equal byte pair,
   and %rsi holds the distance from the first buffer to the second.  */
L(neq):
#ifdef USE_AS_MEMCMPEQ
/* Equality-only build: any nonzero value will do.  */
movl $1, %eax
ret
#else
/* The lowest set bit of %edx is the index of the first mismatching byte
   within the block.  */
bsfl %edx, %ecx
movzbl (%rdi, %rcx), %eax
/* Turn the offset in %rsi back into a pointer to the matching block of
   the second buffer.  */
addq %rdi, %rsi
movzbl (%rsi,%rcx), %edx
/* L(finz1) returns %eax - %edx.  */
jmp L(finz1)
#endif
.p2align 4,, 4
/* Both buffers are 16 byte aligned: same structure as the path that uses
   movdqu, but every load of the second buffer uses movdqa.
   On entry %rdi is the aligned cursor, %rsi the distance between the
   buffers and %r11 the end address of the first buffer.  */
L(ATR):
/* %r10 = end address rounded down to a multiple of 32; nothing to do in
   the block loops when %rdi has already reached it.  */
movq %r11, %r10
andq $-32, %r10
cmpq %r10, %rdi
jae L(mt16)
/* If %rdi is not 32 byte aligned, compare one block to make it so.  */
testq $16, %rdi
jz L(ATR32)
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
cmpq %rdi, %r10
je L(mt16)
L(ATR32):
/* %r10 = end address rounded down to a multiple of 64.  If %rdi is not
   64 byte aligned, compare two blocks to make it so.  */
movq %r11, %r10
andq $-64, %r10
testq $32, %rdi
jz L(ATR64)
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
L(ATR64):
/* No full 64 byte group left: continue with the 32 byte loop at
   L(mt32).  */
cmpq %rdi, %r10
je L(mt32)
/* Main loop: four 16 byte blocks (64 bytes) per iteration, until %rdi
   reaches %r10.  */
L(ATR64main):
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
cmpq %rdi, %r10
jne L(ATR64main)
/* Fewer than 64 bytes are left.  Recompute the 32 byte bound and compare
   32 bytes per iteration while %rdi is below it.  */
movq %r11, %r10
andq $-32, %r10
cmpq %r10, %rdi
jae L(mt16)
L(ATR32res):
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
cmpq %r10, %rdi
jne L(ATR32res)
/* Tail: %r11 - %rdi bytes are left.  Zero means the buffers are equal;
   otherwise pass the count in %r10 to the small-size code.  */
subq %rdi, %r11
je L(finz)
movq %r11, %r10
jmp L(small)
/* Align to 16byte to improve instruction fetch. */
.p2align 4,, 4
END(memcmp)
/* Symbol bookkeeping.  The equality-only build (USE_AS_MEMCMPEQ) emits
   just the hidden definition; the regular build also makes bcmp a weak
   alias of memcmp and emits the builtin flavour of the hidden
   definition.  */
#ifdef USE_AS_MEMCMPEQ
libc_hidden_def (memcmp)
#else
# undef bcmp
weak_alias (memcmp, bcmp)
libc_hidden_builtin_def (memcmp)
#endif