glibc/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
Noah Goldstein ceabdcd130 x86: Add support to build strcmp/strlen/strchr with explicit ISA level
1. Add default ISA level selection in non-multiarch/rtld
   implementations.

2. Add ISA level build guards to different implementations.
    - I.e. strcmp-avx2.S, which is ISA level 3, will only build if
      the compiled ISA level is <= 3.  Otherwise there is no reason
      to include it, as we will always use one of the ISA level 4
      implementations (strcmp-evex.S).  See the sketch below.

3. Refactor the ifunc selector and ifunc implementation list to use
   the ISA level aware wrapper macros that allow functions below the
   compiled ISA level (with a guaranteed replacement) to be skipped.
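
For illustration, a minimal sketch of the guard pattern from (2).
This is a hypothetical file skeleton, not the commit's actual diff;
only the ISA_SHOULD_BUILD macro is taken from this change:

    /* strcmp-avx2.S is an ISA level 3 implementation: build it only
       when the compiled ISA level is <= 3.  At level 4 the EVEX
       version is always preferred, so the file compiles to nothing.  */
    #include <isa-level.h>
    #if ISA_SHOULD_BUILD (3)
    /* ... AVX2 strcmp implementation ... */
    #endif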

Tested with and without multiarch on x86_64 for ISA levels:
{generic, x86-64-v2, x86-64-v3, x86-64-v4}

And m32 with and without multiarch.
2022-07-16 03:07:59 -07:00

/* strcmp with unaligned loads
   Copyright (C) 2013-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <isa-level.h>
/* Continue building as ISA level 2.  We use this as the ISA V2 default
   because strcmp-sse42 uses pcmpstri (slow on some SSE4.2
   processors) and this implementation is potentially faster than
   strcmp-sse42 (aside from the slower page cross case).  */
#if ISA_SHOULD_BUILD (2)
# define STRCMP_ISA _sse2_unaligned
# include "strcmp-naming.h"
# include "sysdep.h"
ENTRY (STRCMP)
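	/* If either string is within 64 bytes of the end of a 4096-byte
	   page ((rdi | rsi) & 4095 > 4096 - 64 = 4032), a 64-byte read
	   could cross into an unmapped page, so take the byte-wise
	   L(cross_page) path.  %xmm7 is kept zero throughout for
	   null-byte detection; %edx is the byte offset used by the
	   cross-page path.  */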
	movl	%edi, %eax
	xorl	%edx, %edx
	pxor	%xmm7, %xmm7
	orl	%esi, %eax
	andl	$4095, %eax
	cmpl	$4032, %eax
	jg	L(cross_page)
	movdqu	(%rdi), %xmm1
	movdqu	(%rsi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pminub	%xmm1, %xmm0
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm1, %xmm0
	pmovmskb	%xmm0, %eax
	testq	%rax, %rax
	je	L(next_48_bytes)
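	/* A mismatch or null byte was found: %rax holds a mask with a
	   bit set at each offending offset; bsf picks the first, and
	   the differing bytes are subtracted to form the result.  */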
L(return):
	bsfq	%rax, %rdx
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
	ret

	.p2align 4
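	/* The first 16 bytes were equal and non-null; check the
	   remaining 48 bytes of the initial 64-byte block, merging the
	   three 16-bit masks into bits 16..63 of %rax.  */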
L(next_48_bytes):
	movdqu	16(%rdi), %xmm6
	movdqu	16(%rsi), %xmm3
	movdqu	32(%rdi), %xmm5
	pcmpeqb	%xmm6, %xmm3
	movdqu	32(%rsi), %xmm2
	pminub	%xmm6, %xmm3
	pcmpeqb	%xmm1, %xmm3
	movdqu	48(%rdi), %xmm4
	pcmpeqb	%xmm5, %xmm2
	pmovmskb	%xmm3, %edx
	movdqu	48(%rsi), %xmm0
	pminub	%xmm5, %xmm2
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm4, %xmm0
	pmovmskb	%xmm2, %eax
	salq	$16, %rdx
	pminub	%xmm4, %xmm0
	pcmpeqb	%xmm1, %xmm0
	salq	$32, %rax
	orq	%rdx, %rax
	pmovmskb	%xmm0, %ecx
	movq	%rcx, %rdx
	salq	$48, %rdx
	orq	%rdx, %rax
	jne	L(return)
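	/* All 64 bytes matched.  Make %rax the rdi-side pointer rounded
	   up to a 64-byte boundary (safe for movdqa); %rdx tracks the
	   rsi side at the same offset and may stay unaligned.  %rsi
	   counts how many 64-byte iterations fit before %rdx reaches
	   the next page.  */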
L(main_loop_header):
	leaq	64(%rdi), %rdx
	movl	$4096, %ecx
	pxor	%xmm9, %xmm9
	andq	$-64, %rdx
	subq	%rdi, %rdx
	leaq	(%rdi, %rdx), %rax
	addq	%rsi, %rdx
	movq	%rdx, %rsi
	andl	$4095, %esi
	subq	%rsi, %rcx
	shrq	$6, %rcx
	movq	%rcx, %rsi
	jmp	L(loop_start)

	.p2align 4
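	/* Main loop: compare 64 bytes per iteration, aligned loads from
	   %rax and unaligned loads from %rdx.  pcmpeqb marks equal
	   bytes and pminub with the source byte folds in null
	   detection, so a zero byte in %xmm0 means a mismatch or
	   terminator somewhere in the block.  */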
L(loop):
	addq	$64, %rax
	addq	$64, %rdx
L(loop_start):
	testq	%rsi, %rsi
	leaq	-1(%rsi), %rsi
	je	L(loop_cross_page)
L(back_to_loop):
	movdqu	(%rdx), %xmm0
	movdqu	16(%rdx), %xmm1
	movdqa	(%rax), %xmm2
	movdqa	16(%rax), %xmm3
	pcmpeqb	%xmm2, %xmm0
	movdqu	32(%rdx), %xmm5
	pcmpeqb	%xmm3, %xmm1
	pminub	%xmm2, %xmm0
	movdqu	48(%rdx), %xmm6
	pminub	%xmm3, %xmm1
	movdqa	32(%rax), %xmm2
	pminub	%xmm1, %xmm0
	movdqa	48(%rax), %xmm3
	pcmpeqb	%xmm2, %xmm5
	pcmpeqb	%xmm3, %xmm6
	pminub	%xmm2, %xmm5
	pminub	%xmm3, %xmm6
	pminub	%xmm5, %xmm0
	pminub	%xmm6, %xmm0
	pcmpeqb	%xmm7, %xmm0
	pmovmskb	%xmm0, %ecx
	testl	%ecx, %ecx
	je	L(loop)
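	/* The block contained a mismatch or null.  Rebuild the four
	   16-bit masks, combine them into a 64-bit mask in %rcx, and
	   compute the return value from the first offending byte.  */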
	pcmpeqb	%xmm7, %xmm5
	movdqu	(%rdx), %xmm0
	pcmpeqb	%xmm7, %xmm1
	movdqa	(%rax), %xmm2
	pcmpeqb	%xmm2, %xmm0
	pminub	%xmm2, %xmm0
	pcmpeqb	%xmm7, %xmm6
	pcmpeqb	%xmm7, %xmm0
	pmovmskb	%xmm1, %ecx
	pmovmskb	%xmm5, %r8d
	pmovmskb	%xmm0, %edi
	salq	$16, %rcx
	salq	$32, %r8
	pmovmskb	%xmm6, %esi
	orq	%r8, %rcx
	orq	%rdi, %rcx
	salq	$48, %rsi
	orq	%rsi, %rcx
	bsfq	%rcx, %rcx
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
	ret

	.p2align 4
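	/* %rdx is within 64 bytes of a page boundary.  Redo the compare
	   with loads aligned down to a 64-byte boundary (an aligned
	   64-byte load never crosses a page), then shift the combined
	   mask right by %r9 = %rdx & 63 to drop the bytes that precede
	   the real data.  */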
L(loop_cross_page):
	xor	%r10, %r10
	movq	%rdx, %r9
	and	$63, %r9
	subq	%r9, %r10
	movdqa	(%rdx, %r10), %xmm0
	movdqa	16(%rdx, %r10), %xmm1
	movdqu	(%rax, %r10), %xmm2
	movdqu	16(%rax, %r10), %xmm3
	pcmpeqb	%xmm2, %xmm0
	movdqa	32(%rdx, %r10), %xmm5
	pcmpeqb	%xmm3, %xmm1
	pminub	%xmm2, %xmm0
	movdqa	48(%rdx, %r10), %xmm6
	pminub	%xmm3, %xmm1
	movdqu	32(%rax, %r10), %xmm2
	movdqu	48(%rax, %r10), %xmm3
	pcmpeqb	%xmm2, %xmm5
	pcmpeqb	%xmm3, %xmm6
	pminub	%xmm2, %xmm5
	pminub	%xmm3, %xmm6
	pcmpeqb	%xmm7, %xmm0
	pcmpeqb	%xmm7, %xmm1
	pcmpeqb	%xmm7, %xmm5
	pcmpeqb	%xmm7, %xmm6
	pmovmskb	%xmm1, %ecx
	pmovmskb	%xmm5, %r8d
	pmovmskb	%xmm0, %edi
	salq	$16, %rcx
	salq	$32, %r8
	pmovmskb	%xmm6, %esi
	orq	%r8, %rdi
	orq	%rcx, %rdi
	salq	$48, %rsi
	orq	%rsi, %rdi
	movq	%r9, %rcx
	movq	$63, %rsi
	shrq	%cl, %rdi
	test	%rdi, %rdi
	je	L(back_to_loop)
	bsfq	%rdi, %rcx
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
	ret

	.p2align 4
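	/* Byte-wise path used near a page boundary: entry is at
	   L(cross_page) with %rdx = 0; compare one byte per iteration
	   until 64 bytes have matched, then switch to the vectorized
	   main loop.  */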
L(cross_page_loop):
	cmpb	%cl, %al
	jne	L(different)
	addq	$1, %rdx
	cmpq	$64, %rdx
	je	L(main_loop_header)
L(cross_page):
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
	testb	%al, %al
	jne	L(cross_page_loop)
	xorl	%eax, %eax
L(different):
	subl	%ecx, %eax
	ret
END (STRCMP)
#endif