x86: Optimize memcmp SSE2 in memcmp.S

New code saves size (-303 bytes) and has significantly better performance.

geometric_mean(N=20) of page cross cases New / Original: 0.634
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Author: Noah Goldstein  2022-04-15 12:27:59 -05:00
parent ac0d208b54
commit 8804157ad9
8 changed files with 584 additions and 385 deletions
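As background for the diff below: the rewrite leans on a single SSE2 idiom throughout -- compare a full 16-byte vector with pcmpeqb (pcmpeqd for the wmemcmp build), turn the result into a bitmask with pmovmskb, and test it against the all-equal value 0xffff that the code keeps in %ecx. The following is a rough, hedged C/intrinsics equivalent of one such step; the function name and shape are illustrative only and are not part of the commit.

#include <emmintrin.h>

/* Hedged sketch, not the committed code: one 16-byte step of the compare
   idiom.  pcmpeqb produces 0xff in every matching byte, pmovmskb collapses
   that to a 16-bit mask, and a bit scan finds the first mismatch.  */
static int
sse2_cmp_one_vec (const unsigned char *s1, const unsigned char *s2)
{
  __m128i a = _mm_loadu_si128 ((const __m128i *) s1);
  __m128i b = _mm_loadu_si128 ((const __m128i *) s2);
  __m128i eq = _mm_cmpeq_epi8 (a, b);          /* 0xff where bytes match   */
  unsigned int mask = _mm_movemask_epi8 (eq);  /* 16-bit match bitmask     */
  if (mask == 0xffff)                          /* subl %ecx, %eax == 0     */
    return 0;
  unsigned int i = __builtin_ctz (~mask);      /* first mismatching byte   */
  return s1[i] - s2[i];
}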

sysdeps/x86_64/memcmp.S

@@ -18,395 +18,557 @@
#include <sysdep.h> #include <sysdep.h>
#ifdef USE_AS_WMEMCMP
# define PCMPEQ pcmpeqd
# define CHAR_SIZE 4
# define SIZE_OFFSET (0)
#else
# define PCMPEQ pcmpeqb
# define CHAR_SIZE 1
#endif
#ifdef USE_AS_MEMCMPEQ
# define SIZE_OFFSET (0)
# define CHECK_CMP(x, y) subl x, y
#else
# ifndef SIZE_OFFSET
# define SIZE_OFFSET (CHAR_PER_VEC * 2)
# endif
# define CHECK_CMP(x, y) cmpl x, y
#endif
#define VEC_SIZE 16
#define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
#ifndef MEMCMP
# define MEMCMP memcmp
#endif
.text .text
ENTRY (memcmp) ENTRY(MEMCMP)
#ifdef __ILP32__ #ifdef USE_AS_WMEMCMP
/* Clear the upper 32 bits. */ /* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
movl %edx, %edx in ecx for code size. This is preferable to using `incw` as
it avoids partial register stalls on older hardware (pre
SnB). */
movl $0xffff, %ecx
#endif #endif
test %RDX_LP, %RDX_LP cmpq $CHAR_PER_VEC, %rdx
jz L(finz) ja L(more_1x_vec)
cmpq $1, %rdx
jbe L(finr1b) #ifdef USE_AS_WMEMCMP
subq %rdi, %rsi /* saves a byte of code keeping the fall through path n = [2, 4]
movq %rdx, %r10 in the initial cache line. */
cmpq $32, %r10 decl %edx
jae L(gt32) jle L(cmp_0_1)
/* Handle small chunks and last block of less than 32 bytes. */
L(small): movq (%rsi), %xmm0
testq $1, %r10 movq (%rdi), %xmm1
jz L(s2b) PCMPEQ %xmm0, %xmm1
movzbl (%rdi), %eax
movzbl (%rdi, %rsi), %edx
subq $1, %r10
je L(finz1)
addq $1, %rdi
subl %edx, %eax
jnz L(exit)
L(s2b):
testq $2, %r10
jz L(s4b)
movzwl (%rdi), %eax
movzwl (%rdi, %rsi), %edx
subq $2, %r10
#ifdef USE_AS_MEMCMPEQ
je L(finz1)
#else
je L(fin2_7)
#endif
addq $2, %rdi
cmpl %edx, %eax
#ifdef USE_AS_MEMCMPEQ
jnz L(neq_early)
#else
jnz L(fin2_7)
#endif
L(s4b):
testq $4, %r10
jz L(s8b)
movl (%rdi), %eax
movl (%rdi, %rsi), %edx
subq $4, %r10
#ifdef USE_AS_MEMCMPEQ
je L(finz1)
#else
je L(fin2_7)
#endif
addq $4, %rdi
cmpl %edx, %eax
#ifdef USE_AS_MEMCMPEQ
jnz L(neq_early)
#else
jnz L(fin2_7)
#endif
L(s8b):
testq $8, %r10
jz L(s16b)
movq (%rdi), %rax
movq (%rdi, %rsi), %rdx
subq $8, %r10
#ifdef USE_AS_MEMCMPEQ
je L(sub_return8)
#else
je L(fin2_7)
#endif
addq $8, %rdi
cmpq %rdx, %rax
#ifdef USE_AS_MEMCMPEQ
jnz L(neq_early)
#else
jnz L(fin2_7)
#endif
L(s16b):
movdqu (%rdi), %xmm1
movdqu (%rdi, %rsi), %xmm0
pcmpeqb %xmm0, %xmm1
#ifdef USE_AS_MEMCMPEQ
pmovmskb %xmm1, %eax pmovmskb %xmm1, %eax
subl $0xffff, %eax subl %ecx, %eax
ret jnz L(ret_nonzero_vec_start_0)
movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0
movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1
PCMPEQ %xmm0, %xmm1
pmovmskb %xmm1, %eax
subl %ecx, %eax
jnz L(ret_nonzero_vec_end_0_adj)
#else #else
pmovmskb %xmm1, %edx cmpl $8, %edx
ja L(cmp_9_16)
cmpl $4, %edx
jb L(cmp_0_3)
# ifdef USE_AS_MEMCMPEQ
movl (%rsi), %eax
subl (%rdi), %eax
movl -4(%rsi, %rdx), %esi
subl -4(%rdi, %rdx), %esi
orl %esi, %eax
ret
# else
/* Combine comparisons for lo and hi 4-byte comparisons. */
movl -4(%rsi, %rdx), %ecx
movl -4(%rdi, %rdx), %eax
shlq $32, %rcx
shlq $32, %rax
movl (%rsi), %esi
movl (%rdi), %edi
orq %rsi, %rcx
orq %rdi, %rax
/* Only compute proper return if not-equal. */
cmpq %rcx, %rax
jnz L(ret_nonzero)
xorl %eax, %eax xorl %eax, %eax
subl $0xffff, %edx ret
jz L(finz) # endif
bsfl %edx, %ecx
leaq (%rdi, %rcx), %rcx .p2align 4,, 10
movzbl (%rcx), %eax L(cmp_9_16):
movzbl (%rsi, %rcx), %edx # ifdef USE_AS_MEMCMPEQ
jmp L(finz1) movq (%rsi), %rax
subq (%rdi), %rax
movq -8(%rsi, %rdx), %rcx
subq -8(%rdi, %rdx), %rcx
orq %rcx, %rax
/* Convert 64 bit -> 32 bit boolean (we should have made the ABI
return long). */
setnz %cl
movzbl %cl, %eax
# else
movq (%rsi), %rcx
movq (%rdi), %rax
/* Only compute proper return if not-equal. */
cmpq %rcx, %rax
jnz L(ret_nonzero)
movq -8(%rsi, %rdx, CHAR_SIZE), %rcx
movq -8(%rdi, %rdx, CHAR_SIZE), %rax
/* Only compute proper return if not-equal. */
cmpq %rcx, %rax
jnz L(ret_nonzero)
xorl %eax, %eax
# endif
#endif #endif
.p2align 4,, 4 ret
L(finr1b):
.p2align 4,, 8
L(cmp_0_1):
/* Flag set by earlier comparison against 1. */
jne L(cmp_0_0)
#ifdef USE_AS_WMEMCMP
movl (%rdi), %ecx
xorl %edx, %edx
cmpl (%rsi), %ecx
je L(cmp_0_0)
setg %dl
leal -1(%rdx, %rdx), %eax
#else
movzbl (%rdi), %eax movzbl (%rdi), %eax
movzbl (%rsi), %edx movzbl (%rsi), %ecx
L(finz1): subl %ecx, %eax
subl %edx, %eax
L(exit):
ret
#ifdef USE_AS_MEMCMPEQ
.p2align 4,, 4
L(sub_return8):
subq %rdx, %rax
movl %eax, %edx
shrq $32, %rax
orl %edx, %eax
ret
#else
.p2align 4,, 4
L(fin2_7):
cmpq %rdx, %rax
jz L(finz)
movq %rax, %r11
subq %rdx, %r11
bsfq %r11, %rcx
sarq $3, %rcx
salq $3, %rcx
sarq %cl, %rax
movzbl %al, %eax
sarq %cl, %rdx
movzbl %dl, %edx
subl %edx, %eax
ret
#endif #endif
.p2align 4,, 4 ret
L(finz):
/* Fits in aligning bytes. */
L(cmp_0_0):
xorl %eax, %eax xorl %eax, %eax
ret ret
#ifdef USE_AS_MEMCMPEQ
.p2align 4,, 4 #ifdef USE_AS_WMEMCMP
L(neq_early): .p2align 4
movl $1, %eax L(ret_nonzero_vec_start_0):
bsfl %eax, %eax
movl (%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
ret ret
#endif #else
/* For blocks bigger than 32 bytes
1. Advance one of the addr pointer to be 16B aligned. # ifndef USE_AS_MEMCMPEQ
2. Treat the case of both addr pointers aligned to 16B .p2align 4,, 14
separately to avoid movdqu. L(ret_nonzero):
3. Handle any blocks of greater than 64 consecutive bytes with /* Need to bswap to get proper return without branch. */
unrolling to reduce branches. bswapq %rcx
4. At least one addr pointer is 16B aligned, use memory version bswapq %rax
of pcmbeqb. subq %rcx, %rax
sbbl %eax, %eax
orl $1, %eax
ret
# endif
.p2align 4
L(cmp_0_3):
# ifdef USE_AS_MEMCMPEQ
/* No reason to add to dependency chain on rdx. Saving the
bytes here doesn't change number of fetch blocks. */
cmpl $1, %edx
jbe L(cmp_0_1)
# else
/* We need the code size to prevent taking an extra fetch block.
*/ */
.p2align 4,, 4 decl %edx
L(gt32): jle L(cmp_0_1)
movq %rdx, %r11 # endif
addq %rdi, %r11 movzwl (%rsi), %ecx
movq %rdi, %r8 movzwl (%rdi), %eax
andq $15, %r8 # ifdef USE_AS_MEMCMPEQ
jz L(16am) subl %ecx, %eax
/* Both pointers may be misaligned. */
movdqu (%rdi), %xmm1
movdqu (%rdi, %rsi), %xmm0
pcmpeqb %xmm0, %xmm1
pmovmskb %xmm1, %edx
subl $0xffff, %edx
jnz L(neq)
neg %r8
leaq 16(%rdi, %r8), %rdi
L(16am):
/* Handle two 16B aligned pointers separately. */
testq $15, %rsi
jz L(ATR)
testq $16, %rdi
jz L(A32)
movdqu (%rdi, %rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
L(A32):
movq %r11, %r10
andq $-32, %r10
cmpq %r10, %rdi
jae L(mt16)
/* Pre-unroll to be ready for unrolled 64B loop. */
testq $32, %rdi
jz L(A64)
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqu (%rdi,%rsi), %xmm0 movzbl -1(%rsi, %rdx), %esi
pcmpeqb (%rdi), %xmm0 movzbl -1(%rdi, %rdx), %edi
pmovmskb %xmm0, %edx subl %edi, %esi
subl $0xffff, %edx orl %esi, %eax
jnz L(neq) # else
addq $16, %rdi bswapl %ecx
bswapl %eax
L(A64): /* Implicit right shift by one. We just need to displace the
movq %r11, %r10 sign bits. */
andq $-64, %r10 shrl %ecx
cmpq %r10, %rdi shrl %eax
jae L(mt32)
L(A64main): /* Eat a partial register stall here. Saves code stopping
movdqu (%rdi,%rsi), %xmm0 L(cmp_0_3) from bleeding into the next fetch block and saves
pcmpeqb (%rdi), %xmm0 an ALU. */
pmovmskb %xmm0, %edx movb (%rsi, %rdx), %cl
subl $0xffff, %edx movzbl (%rdi, %rdx), %edi
jnz L(neq) orl %edi, %eax
addq $16, %rdi subl %ecx, %eax
# endif
ret
#endif
movdqu (%rdi,%rsi), %xmm0 .p2align 5
pcmpeqb (%rdi), %xmm0 L(more_1x_vec):
pmovmskb %xmm0, %edx #ifndef USE_AS_WMEMCMP
subl $0xffff, %edx /* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
jnz L(neq) in ecx for code size. This is preferable to using `incw` as
addq $16, %rdi it avoids partial register stalls on older hardware (pre
SnB). */
movl $0xffff, %ecx
#endif
movups (%rsi), %xmm0
movups (%rdi), %xmm1
PCMPEQ %xmm0, %xmm1
pmovmskb %xmm1, %eax
subl %ecx, %eax
jnz L(ret_nonzero_vec_start_0)
#if SIZE_OFFSET == 0
cmpq $(CHAR_PER_VEC * 2), %rdx
#else
/* Offset rdx. Saves just enough code size to keep the
L(last_2x_vec) case and the non-zero return in a single
cache line. */
subq $(CHAR_PER_VEC * 2), %rdx
#endif
ja L(more_2x_vec)
movdqu (%rdi,%rsi), %xmm0 movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
pcmpeqb (%rdi), %xmm0 movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
pmovmskb %xmm0, %edx PCMPEQ %xmm0, %xmm1
subl $0xffff, %edx pmovmskb %xmm1, %eax
jnz L(neq) subl %ecx, %eax
addq $16, %rdi #ifndef USE_AS_MEMCMPEQ
/* Don't use `incw ax` as machines this code runs on are liable
to have partial register stall. */
jnz L(ret_nonzero_vec_end_0)
#else
/* Various return targets for memcmpeq. Will always be hot in
Icache and get short encoding. */
L(ret_nonzero_vec_start_1):
L(ret_nonzero_vec_start_0):
L(ret_nonzero_vec_end_0):
#endif
ret
movdqu (%rdi,%rsi), %xmm0 #ifndef USE_AS_MEMCMPEQ
pcmpeqb (%rdi), %xmm0 # ifdef USE_AS_WMEMCMP
pmovmskb %xmm0, %edx .p2align 4
subl $0xffff, %edx L(ret_nonzero_vec_end_0_adj):
jnz L(neq) addl $3, %edx
addq $16, %rdi # else
.p2align 4,, 8
# endif
L(ret_nonzero_vec_end_0):
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
leal (%rax, %rdx, CHAR_SIZE), %eax
movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
addl %edx, %eax
movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
# ifndef USE_AS_WMEMCMP
.p2align 4,, 10
L(ret_nonzero_vec_start_0):
bsfl %eax, %eax
movzbl (%rsi, %rax), %ecx
movzbl (%rdi, %rax), %eax
subl %ecx, %eax
ret
# endif
#else
#endif
cmpq %rdi, %r10 .p2align 5
jne L(A64main) L(more_2x_vec):
movups (VEC_SIZE * 1)(%rsi), %xmm0
movups (VEC_SIZE * 1)(%rdi), %xmm1
PCMPEQ %xmm0, %xmm1
pmovmskb %xmm1, %eax
subl %ecx, %eax
jnz L(ret_nonzero_vec_start_1)
L(mt32): cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
movq %r11, %r10 jbe L(last_2x_vec)
andq $-32, %r10
cmpq %r10, %rdi
jae L(mt16)
L(A32main): cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
movdqu (%rdi,%rsi), %xmm0 ja L(more_8x_vec)
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqu (%rdi,%rsi), %xmm0 /* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time.
pcmpeqb (%rdi), %xmm0 This can harm performance if non-zero return in [65, 80] or
pmovmskb %xmm0, %edx [97, 112] but helps performance otherwise. Generally zero-
subl $0xffff, %edx return is hotter. */
jnz L(neq) movups (VEC_SIZE * 2)(%rsi), %xmm0
addq $16, %rdi movups (VEC_SIZE * 2)(%rdi), %xmm1
PCMPEQ %xmm0, %xmm1
movups (VEC_SIZE * 3)(%rsi), %xmm2
movups (VEC_SIZE * 3)(%rdi), %xmm3
PCMPEQ %xmm2, %xmm3
pand %xmm1, %xmm3
cmpq %rdi, %r10 pmovmskb %xmm3, %eax
jne L(A32main) CHECK_CMP (%ecx, %eax)
L(mt16): jnz L(ret_nonzero_vec_start_2_3)
subq %rdi, %r11
je L(finz)
movq %r11, %r10
jmp L(small)
.p2align 4,, 4 cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
L(neq): jbe L(last_2x_vec)
movups (VEC_SIZE * 4)(%rsi), %xmm0
movups (VEC_SIZE * 4)(%rdi), %xmm1
PCMPEQ %xmm0, %xmm1
movups (VEC_SIZE * 5)(%rsi), %xmm2
movups (VEC_SIZE * 5)(%rdi), %xmm3
PCMPEQ %xmm2, %xmm3
pand %xmm1, %xmm3
pmovmskb %xmm3, %eax
CHECK_CMP (%ecx, %eax)
#ifdef USE_AS_MEMCMPEQ #ifdef USE_AS_MEMCMPEQ
movl $1, %eax jz L(last_2x_vec)
ret ret
#else #else
bsfl %edx, %ecx jnz L(ret_nonzero_vec_start_4_5)
movzbl (%rdi, %rcx), %eax
addq %rdi, %rsi
movzbl (%rsi,%rcx), %edx
jmp L(finz1)
#endif #endif
.p2align 4
.p2align 4,, 4 L(last_2x_vec):
L(ATR): movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
movq %r11, %r10 movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
andq $-32, %r10 PCMPEQ %xmm0, %xmm1
cmpq %r10, %rdi movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
jae L(mt16) movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
testq $16, %rdi PCMPEQ %xmm2, %xmm3
jz L(ATR32) pand %xmm1, %xmm3
pmovmskb %xmm3, %eax
movdqa (%rdi,%rsi), %xmm0 subl %ecx, %eax
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
cmpq %rdi, %r10
je L(mt16)
L(ATR32):
movq %r11, %r10
andq $-64, %r10
testq $32, %rdi
jz L(ATR64)
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
L(ATR64):
cmpq %rdi, %r10
je L(mt32)
L(ATR64main):
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
cmpq %rdi, %r10
jne L(ATR64main)
movq %r11, %r10
andq $-32, %r10
cmpq %r10, %rdi
jae L(mt16)
L(ATR32res):
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
cmpq %r10, %rdi
jne L(ATR32res)
subq %rdi, %r11
je L(finz)
movq %r11, %r10
jmp L(small)
/* Align to 16byte to improve instruction fetch. */
.p2align 4,, 4
END(memcmp)
#ifdef USE_AS_MEMCMPEQ #ifdef USE_AS_MEMCMPEQ
libc_hidden_def (memcmp) /* Various return targets for memcmpeq. Will always be hot in
Icache and get short encoding. */
L(ret_nonzero_vec_start_2_3):
L(ret_nonzero_vec_start_4_5):
ret
#else #else
# undef bcmp jnz L(ret_nonzero_vec_end_1)
weak_alias (memcmp, bcmp) ret
libc_hidden_builtin_def (memcmp)
.p2align 4,, 8
L(ret_nonzero_vec_end_1):
pmovmskb %xmm1, %ecx
/* High 16 bits of eax guaranteed to be all ones. Rotate them in
so we can do `or + not` with just `xor`. */
rorl $16, %eax
xorl %ecx, %eax
/* Partial register stall. */
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
leal (%rax, %rdx, CHAR_SIZE), %eax
movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
addl %edx, %eax
movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
.p2align 4
L(ret_nonzero_vec_start_4_5):
pmovmskb %xmm1, %edx
sall $16, %eax
leal 1(%rax, %rdx), %eax
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 4)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
.p2align 4,, 8
L(ret_nonzero_vec_start_1):
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 1)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
#endif
.p2align 4
L(more_8x_vec):
subq %rdi, %rsi
leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
andq $(VEC_SIZE * -1), %rdi
addq %rdi, %rsi
.p2align 4
L(loop_4x):
movups (VEC_SIZE * 2)(%rsi), %xmm0
movups (VEC_SIZE * 3)(%rsi), %xmm1
PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0
PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1
movups (VEC_SIZE * 4)(%rsi), %xmm2
movups (VEC_SIZE * 5)(%rsi), %xmm3
PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2
PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3
pand %xmm0, %xmm1
pand %xmm2, %xmm3
pand %xmm1, %xmm3
pmovmskb %xmm3, %eax
subl %ecx, %eax
jnz L(ret_nonzero_loop)
addq $(VEC_SIZE * 4), %rdi
addq $(VEC_SIZE * 4), %rsi
cmpq %rdi, %rdx
ja L(loop_4x)
/* Get remaining length in edx. */
subl %edi, %edx
/* Restore offset so we can reuse L(last_2x_vec). */
addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx
#ifdef USE_AS_WMEMCMP
shrl $2, %edx
#endif
cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
jbe L(last_2x_vec)
movups (VEC_SIZE * 2)(%rsi), %xmm0
movups (VEC_SIZE * 2)(%rdi), %xmm1
PCMPEQ %xmm0, %xmm1
movups (VEC_SIZE * 3)(%rsi), %xmm2
movups (VEC_SIZE * 3)(%rdi), %xmm3
PCMPEQ %xmm2, %xmm3
pand %xmm1, %xmm3
pmovmskb %xmm3, %eax
CHECK_CMP (%ecx, %eax)
jz L(last_2x_vec)
#ifdef USE_AS_MEMCMPEQ
L(ret_nonzero_loop):
ret
#else
.p2align 4
L(ret_nonzero_vec_start_2_3):
pmovmskb %xmm1, %edx
sall $16, %eax
leal 1(%rax, %rdx), %eax
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
.p2align 4
L(ret_nonzero_loop):
pmovmskb %xmm0, %ecx
pmovmskb %xmm1, %edx
sall $(VEC_SIZE * 1), %edx
leal 1(%rcx, %rdx), %edx
pmovmskb %xmm2, %ecx
/* High 16 bits of eax guaranteed to be all ones. Rotate them in
so we can do `or + not` with just `xor`. */
rorl $16, %eax
xorl %ecx, %eax
salq $32, %rax
orq %rdx, %rax
bsfq %rax, %rax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
#endif
END(MEMCMP)
#ifndef USE_AS_WMEMCMP
# ifdef USE_AS_MEMCMPEQ
libc_hidden_def (MEMCMP)
# else
# undef bcmp
weak_alias (MEMCMP, bcmp)
libc_hidden_builtin_def (MEMCMP)
# endif
#endif #endif
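One detail worth calling out from the size handling above: the new code never falls back to a scalar tail loop. Instead, the vector that ends at the last byte of the buffers is re-read and may overlap an earlier vector, so lengths in (VEC_SIZE, 2 * VEC_SIZE] are handled by exactly two unaligned compares (the L(more_1x_vec) / L(last_2x_vec) paths). A hedged C sketch of that trick, with an illustrative name and boolean result rather than the committed code:

#include <emmintrin.h>
#include <stddef.h>

/* Hedged sketch, not the committed code: cover 16 < n <= 32 bytes with two
   unaligned vector compares, the second anchored at the end of the buffers
   so it may overlap the first.  Returns nonzero if any byte pair differs.  */
static int
differs_1x_to_2x_vec (const unsigned char *s1, const unsigned char *s2,
                      size_t n)
{
  __m128i a0 = _mm_loadu_si128 ((const __m128i *) s1);
  __m128i b0 = _mm_loadu_si128 ((const __m128i *) s2);
  __m128i a1 = _mm_loadu_si128 ((const __m128i *) (s1 + n - 16));
  __m128i b1 = _mm_loadu_si128 ((const __m128i *) (s2 + n - 16));
  unsigned int m0 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a0, b0));
  unsigned int m1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a1, b1));
  return (m0 & m1) != 0xffff;
}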

sysdeps/x86_64/memcmpeq.S

@@ -16,6 +16,6 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#define memcmp __memcmpeq
+#define MEMCMP __memcmpeq
 #define USE_AS_MEMCMPEQ 1
 #include "multiarch/memcmp-sse2.S"

sysdeps/x86_64/multiarch/Makefile

@@ -162,8 +162,8 @@ sysdep_routines += \
   wmemchr-sse2 \
   wmemcmp-avx2-movbe \
   wmemcmp-avx2-movbe-rtm \
-  wmemcmp-c \
   wmemcmp-evex-movbe \
+  wmemcmp-sse2 \
   wmemcmp-sse4 \
 # sysdep_routines
 endif

sysdeps/x86_64/multiarch/memcmp-sse2.S

@@ -17,8 +17,8 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# ifndef memcmp
-#  define memcmp __memcmp_sse2
+# ifndef MEMCMP
+#  define MEMCMP __memcmp_sse2
 # endif
 # ifdef SHARED
View File

@@ -17,9 +17,9 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define memcmp __memcmpeq_sse2
+# define MEMCMP __memcmpeq_sse2
 #else
-# define memcmp __memcmpeq
+# define MEMCMP __memcmpeq
 #endif
 #define USE_AS_MEMCMPEQ 1
 #include "memcmp-sse2.S"

sysdeps/x86_64/multiarch/wmemcmp-c.c (deleted)

@@ -1,9 +0,0 @@
#if IS_IN (libc)
# include <wchar.h>
# define WMEMCMP __wmemcmp_sse2
extern __typeof (wmemcmp) __wmemcmp_sse2;
#endif
#include "wcsmbs/wmemcmp.c"

sysdeps/x86_64/multiarch/wmemcmp-sse2.S (new file)

@@ -0,0 +1,25 @@
/* wmemcmp optimized with SSE2.
Copyright (C) 2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
# define MEMCMP __wmemcmp_sse2
#else
# define MEMCMP wmemcmp
#endif
#define USE_AS_WMEMCMP 1
#include "memcmp-sse2.S"

sysdeps/x86_64/wmemcmp.S (new file)

@@ -0,0 +1,21 @@
/* wmemcmp optimized with SSE2.
Copyright (C) 2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define MEMCMP wmemcmp
#define USE_AS_WMEMCMP 1
#include "multiarch/memcmp-sse2.S"