glibc/sysdeps/i386/i686/multiarch/memrchr-sse2.S
Paul Eggert 5a82c74822 Prefer https to http for gnu.org and fsf.org URLs
Also, change sources.redhat.com to sourceware.org.
This patch was automatically generated by running the following shell
script, which uses GNU sed, and which avoids modifying files imported
from upstream:

sed -ri '
  s,(http|ftp)(://(.*\.)?(gnu|fsf|sourceware)\.org($|[^.]|\.[^a-z])),https\2,g
  s,(http|ftp)(://(.*\.)?)sources\.redhat\.com($|[^.]|\.[^a-z]),https\2sourceware.org\4,g
' \
  $(find $(git ls-files) -prune -type f \
      ! -name '*.po' \
      ! -name 'ChangeLog*' \
      ! -path COPYING ! -path COPYING.LIB \
      ! -path manual/fdl-1.3.texi ! -path manual/lgpl-2.1.texi \
      ! -path manual/texinfo.tex ! -path scripts/config.guess \
      ! -path scripts/config.sub ! -path scripts/install-sh \
      ! -path scripts/mkinstalldirs ! -path scripts/move-if-change \
      ! -path INSTALL ! -path  locale/programs/charmap-kw.h \
      ! -path po/libc.pot ! -path sysdeps/gnu/errlist.c \
      ! '(' -name configure \
            -execdir test -f configure.ac -o -f configure.in ';' ')' \
      ! '(' -name preconfigure \
            -execdir test -f preconfigure.ac ';' ')' \
      -print)

and then by running 'make dist-prepare' to regenerate files built
from the altered files, and then executing the following to cleanup:

  chmod a+x sysdeps/unix/sysv/linux/riscv/configure
  # Omit irrelevant whitespace and comment-only changes,
  # perhaps from a slightly-different Autoconf version.
  git checkout -f \
    sysdeps/csky/configure \
    sysdeps/hppa/configure \
    sysdeps/riscv/configure \
    sysdeps/unix/sysv/linux/csky/configure
  # Omit changes that caused a pre-commit check to fail like this:
  # remote: *** error: sysdeps/powerpc/powerpc64/ppc-mcount.S: trailing lines
  git checkout -f \
    sysdeps/powerpc/powerpc64/ppc-mcount.S \
    sysdeps/unix/sysv/linux/s390/s390-64/syscall.S
  # Omit change that caused a pre-commit check to fail like this:
  # remote: *** error: sysdeps/sparc/sparc64/multiarch/memcpy-ultra3.S: last line does not end in newline
  git checkout -f sysdeps/sparc/sparc64/multiarch/memcpy-ultra3.S
2019-09-07 02:43:31 -07:00

725 lines
11 KiB
ArmAsm

/* Optimized memrchr with sse2 without bsf
Copyright (C) 2011-2019 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
# include <sysdep.h>
# define CFI_PUSH(REG) \
cfi_adjust_cfa_offset (4); \
cfi_rel_offset (REG, 0)
# define CFI_POP(REG) \
cfi_adjust_cfa_offset (-4); \
cfi_restore (REG)
# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)
# define PARMS 4
# define STR1 PARMS
# define STR2 STR1+4
# define LEN STR2+4
atom_text_section
ENTRY (__memrchr_sse2)
mov STR1(%esp), %ecx
movd STR2(%esp), %xmm1
mov LEN(%esp), %edx
sub $16, %edx
jbe L(length_less16)
punpcklbw %xmm1, %xmm1
add %edx, %ecx
punpcklbw %xmm1, %xmm1
movdqu (%ecx), %xmm0
pshufd $0, %xmm1, %xmm1
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(exit_dispatch)
sub $64, %ecx
mov %ecx, %eax
and $15, %eax
jz L(loop_prolog)
lea 16(%ecx), %ecx
lea 16(%edx), %edx
sub %eax, %edx
and $-16, %ecx
.p2align 4
/* Loop start on aligned string. */
L(loop_prolog):
sub $64, %edx
jbe L(exit_loop)
movdqa 48(%ecx), %xmm0
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches48)
movdqa 32(%ecx), %xmm2
pcmpeqb %xmm1, %xmm2
pmovmskb %xmm2, %eax
test %eax, %eax
jnz L(matches32)
movdqa 16(%ecx), %xmm3
pcmpeqb %xmm1, %xmm3
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches16)
movdqa (%ecx), %xmm4
pcmpeqb %xmm1, %xmm4
pmovmskb %xmm4, %eax
test %eax, %eax
jnz L(exit_dispatch)
sub $64, %ecx
sub $64, %edx
jbe L(exit_loop)
movdqa 48(%ecx), %xmm0
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches48)
movdqa 32(%ecx), %xmm2
pcmpeqb %xmm1, %xmm2
pmovmskb %xmm2, %eax
test %eax, %eax
jnz L(matches32)
movdqa 16(%ecx), %xmm3
pcmpeqb %xmm1, %xmm3
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches16)
movdqa (%ecx), %xmm3
pcmpeqb %xmm1, %xmm3
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(exit_dispatch)
mov %ecx, %eax
and $63, %eax
test %eax, %eax
jz L(align64_loop)
lea 64(%ecx), %ecx
lea 64(%edx), %edx
and $-64, %ecx
sub %eax, %edx
.p2align 4
L(align64_loop):
sub $64, %ecx
sub $64, %edx
jbe L(exit_loop)
movdqa (%ecx), %xmm0
movdqa 16(%ecx), %xmm2
movdqa 32(%ecx), %xmm3
movdqa 48(%ecx), %xmm4
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm1, %xmm2
pcmpeqb %xmm1, %xmm3
pcmpeqb %xmm1, %xmm4
pmaxub %xmm3, %xmm0
pmaxub %xmm4, %xmm2
pmaxub %xmm0, %xmm2
pmovmskb %xmm2, %eax
test %eax, %eax
jz L(align64_loop)
pmovmskb %xmm4, %eax
test %eax, %eax
jnz L(matches48)
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32)
movdqa 16(%ecx), %xmm2
pcmpeqb %xmm1, %xmm2
pcmpeqb (%ecx), %xmm1
pmovmskb %xmm2, %eax
test %eax, %eax
jnz L(matches16)
pmovmskb %xmm1, %eax
test %ah, %ah
jnz L(exit_dispatch_high)
mov %al, %dl
and $15 << 4, %dl
jnz L(exit_dispatch_8)
test $0x08, %al
jnz L(exit_4)
test $0x04, %al
jnz L(exit_3)
test $0x02, %al
jnz L(exit_2)
mov %ecx, %eax
ret
.p2align 4
L(exit_loop):
add $64, %edx
cmp $32, %edx
jbe L(exit_loop_32)
movdqa 48(%ecx), %xmm0
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches48)
movdqa 32(%ecx), %xmm2
pcmpeqb %xmm1, %xmm2
pmovmskb %xmm2, %eax
test %eax, %eax
jnz L(matches32)
movdqa 16(%ecx), %xmm3
pcmpeqb %xmm1, %xmm3
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches16_1)
cmp $48, %edx
jbe L(return_null)
pcmpeqb (%ecx), %xmm1
pmovmskb %xmm1, %eax
test %eax, %eax
jnz L(matches0_1)
xor %eax, %eax
ret
.p2align 4
L(exit_loop_32):
movdqa 48(%ecx), %xmm0
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches48_1)
cmp $16, %edx
jbe L(return_null)
pcmpeqb 32(%ecx), %xmm1
pmovmskb %xmm1, %eax
test %eax, %eax
jnz L(matches32_1)
xor %eax, %eax
ret
.p2align 4
L(matches16):
lea 16(%ecx), %ecx
test %ah, %ah
jnz L(exit_dispatch_high)
mov %al, %dl
and $15 << 4, %dl
jnz L(exit_dispatch_8)
test $0x08, %al
jnz L(exit_4)
test $0x04, %al
jnz L(exit_3)
test $0x02, %al
jnz L(exit_2)
mov %ecx, %eax
ret
.p2align 4
L(matches32):
lea 32(%ecx), %ecx
test %ah, %ah
jnz L(exit_dispatch_high)
mov %al, %dl
and $15 << 4, %dl
jnz L(exit_dispatch_8)
test $0x08, %al
jnz L(exit_4)
test $0x04, %al
jnz L(exit_3)
test $0x02, %al
jnz L(exit_2)
mov %ecx, %eax
ret
.p2align 4
L(matches48):
lea 48(%ecx), %ecx
.p2align 4
L(exit_dispatch):
test %ah, %ah
jnz L(exit_dispatch_high)
mov %al, %dl
and $15 << 4, %dl
jnz L(exit_dispatch_8)
test $0x08, %al
jnz L(exit_4)
test $0x04, %al
jnz L(exit_3)
test $0x02, %al
jnz L(exit_2)
mov %ecx, %eax
ret
.p2align 4
L(exit_dispatch_8):
test $0x80, %al
jnz L(exit_8)
test $0x40, %al
jnz L(exit_7)
test $0x20, %al
jnz L(exit_6)
lea 4(%ecx), %eax
ret
.p2align 4
L(exit_dispatch_high):
mov %ah, %dh
and $15 << 4, %dh
jnz L(exit_dispatch_high_8)
test $0x08, %ah
jnz L(exit_12)
test $0x04, %ah
jnz L(exit_11)
test $0x02, %ah
jnz L(exit_10)
lea 8(%ecx), %eax
ret
.p2align 4
L(exit_dispatch_high_8):
test $0x80, %ah
jnz L(exit_16)
test $0x40, %ah
jnz L(exit_15)
test $0x20, %ah
jnz L(exit_14)
lea 12(%ecx), %eax
ret
.p2align 4
L(exit_2):
lea 1(%ecx), %eax
ret
.p2align 4
L(exit_3):
lea 2(%ecx), %eax
ret
.p2align 4
L(exit_4):
lea 3(%ecx), %eax
ret
.p2align 4
L(exit_6):
lea 5(%ecx), %eax
ret
.p2align 4
L(exit_7):
lea 6(%ecx), %eax
ret
.p2align 4
L(exit_8):
lea 7(%ecx), %eax
ret
.p2align 4
L(exit_10):
lea 9(%ecx), %eax
ret
.p2align 4
L(exit_11):
lea 10(%ecx), %eax
ret
.p2align 4
L(exit_12):
lea 11(%ecx), %eax
ret
.p2align 4
L(exit_14):
lea 13(%ecx), %eax
ret
.p2align 4
L(exit_15):
lea 14(%ecx), %eax
ret
.p2align 4
L(exit_16):
lea 15(%ecx), %eax
ret
.p2align 4
L(matches0_1):
lea -64(%edx), %edx
test %ah, %ah
jnz L(exit_dispatch_1_high)
mov %al, %ah
and $15 << 4, %ah
jnz L(exit_dispatch_1_8)
test $0x08, %al
jnz L(exit_1_4)
test $0x04, %al
jnz L(exit_1_3)
test $0x02, %al
jnz L(exit_1_2)
add $0, %edx
jl L(return_null)
mov %ecx, %eax
ret
.p2align 4
L(matches16_1):
lea -48(%edx), %edx
lea 16(%ecx), %ecx
test %ah, %ah
jnz L(exit_dispatch_1_high)
mov %al, %ah
and $15 << 4, %ah
jnz L(exit_dispatch_1_8)
test $0x08, %al
jnz L(exit_1_4)
test $0x04, %al
jnz L(exit_1_3)
test $0x02, %al
jnz L(exit_1_2)
add $0, %edx
jl L(return_null)
mov %ecx, %eax
ret
.p2align 4
L(matches32_1):
lea -32(%edx), %edx
lea 32(%ecx), %ecx
test %ah, %ah
jnz L(exit_dispatch_1_high)
mov %al, %ah
and $15 << 4, %ah
jnz L(exit_dispatch_1_8)
test $0x08, %al
jnz L(exit_1_4)
test $0x04, %al
jnz L(exit_1_3)
test $0x02, %al
jnz L(exit_1_2)
add $0, %edx
jl L(return_null)
mov %ecx, %eax
ret
.p2align 4
L(matches48_1):
lea -16(%edx), %edx
lea 48(%ecx), %ecx
.p2align 4
L(exit_dispatch_1):
test %ah, %ah
jnz L(exit_dispatch_1_high)
mov %al, %ah
and $15 << 4, %ah
jnz L(exit_dispatch_1_8)
test $0x08, %al
jnz L(exit_1_4)
test $0x04, %al
jnz L(exit_1_3)
test $0x02, %al
jnz L(exit_1_2)
add $0, %edx
jl L(return_null)
mov %ecx, %eax
ret
.p2align 4
L(exit_dispatch_1_8):
test $0x80, %al
jnz L(exit_1_8)
test $0x40, %al
jnz L(exit_1_7)
test $0x20, %al
jnz L(exit_1_6)
add $4, %edx
jl L(return_null)
lea 4(%ecx), %eax
ret
.p2align 4
L(exit_dispatch_1_high):
mov %ah, %al
and $15 << 4, %al
jnz L(exit_dispatch_1_high_8)
test $0x08, %ah
jnz L(exit_1_12)
test $0x04, %ah
jnz L(exit_1_11)
test $0x02, %ah
jnz L(exit_1_10)
add $8, %edx
jl L(return_null)
lea 8(%ecx), %eax
ret
.p2align 4
L(exit_dispatch_1_high_8):
test $0x80, %ah
jnz L(exit_1_16)
test $0x40, %ah
jnz L(exit_1_15)
test $0x20, %ah
jnz L(exit_1_14)
add $12, %edx
jl L(return_null)
lea 12(%ecx), %eax
ret
.p2align 4
L(exit_1_2):
add $1, %edx
jl L(return_null)
lea 1(%ecx), %eax
ret
.p2align 4
L(exit_1_3):
add $2, %edx
jl L(return_null)
lea 2(%ecx), %eax
ret
.p2align 4
L(exit_1_4):
add $3, %edx
jl L(return_null)
lea 3(%ecx), %eax
ret
.p2align 4
L(exit_1_6):
add $5, %edx
jl L(return_null)
lea 5(%ecx), %eax
ret
.p2align 4
L(exit_1_7):
add $6, %edx
jl L(return_null)
lea 6(%ecx), %eax
ret
.p2align 4
L(exit_1_8):
add $7, %edx
jl L(return_null)
lea 7(%ecx), %eax
ret
.p2align 4
L(exit_1_10):
add $9, %edx
jl L(return_null)
lea 9(%ecx), %eax
ret
.p2align 4
L(exit_1_11):
add $10, %edx
jl L(return_null)
lea 10(%ecx), %eax
ret
.p2align 4
L(exit_1_12):
add $11, %edx
jl L(return_null)
lea 11(%ecx), %eax
ret
.p2align 4
L(exit_1_14):
add $13, %edx
jl L(return_null)
lea 13(%ecx), %eax
ret
.p2align 4
L(exit_1_15):
add $14, %edx
jl L(return_null)
lea 14(%ecx), %eax
ret
.p2align 4
L(exit_1_16):
add $15, %edx
jl L(return_null)
lea 15(%ecx), %eax
ret
.p2align 4
L(return_null):
xor %eax, %eax
ret
.p2align 4
L(length_less16_offset0):
mov %dl, %cl
pcmpeqb (%eax), %xmm1
mov $1, %edx
sal %cl, %edx
sub $1, %edx
mov %eax, %ecx
pmovmskb %xmm1, %eax
and %edx, %eax
test %eax, %eax
jnz L(exit_dispatch)
xor %eax, %eax
ret
.p2align 4
L(length_less16):
punpcklbw %xmm1, %xmm1
add $16, %edx
je L(return_null)
punpcklbw %xmm1, %xmm1
mov %ecx, %eax
pshufd $0, %xmm1, %xmm1
and $15, %ecx
jz L(length_less16_offset0)
PUSH (%edi)
mov %cl, %dh
add %dl, %dh
and $-16, %eax
sub $16, %dh
ja L(length_less16_part2)
pcmpeqb (%eax), %xmm1
pmovmskb %xmm1, %edi
sar %cl, %edi
add %ecx, %eax
mov %dl, %cl
mov $1, %edx
sal %cl, %edx
sub $1, %edx
and %edx, %edi
test %edi, %edi
jz L(ret_null)
bsr %edi, %edi
add %edi, %eax
POP (%edi)
ret
CFI_PUSH (%edi)
.p2align 4
L(length_less16_part2):
movdqa 16(%eax), %xmm2
pcmpeqb %xmm1, %xmm2
pmovmskb %xmm2, %edi
mov %cl, %ch
mov %dh, %cl
mov $1, %edx
sal %cl, %edx
sub $1, %edx
and %edx, %edi
test %edi, %edi
jnz L(length_less16_part2_return)
pcmpeqb (%eax), %xmm1
pmovmskb %xmm1, %edi
mov %ch, %cl
sar %cl, %edi
test %edi, %edi
jz L(ret_null)
bsr %edi, %edi
add %edi, %eax
xor %ch, %ch
add %ecx, %eax
POP (%edi)
ret
CFI_PUSH (%edi)
.p2align 4
L(length_less16_part2_return):
bsr %edi, %edi
lea 16(%eax, %edi), %eax
POP (%edi)
ret
CFI_PUSH (%edi)
.p2align 4
L(ret_null):
xor %eax, %eax
POP (%edi)
ret
END (__memrchr_sse2)
#endif