2017-10-06 15:59:32 +00:00
|
|
|
/* memchr implemented using NEON.
|
2022-01-01 18:54:23 +00:00
|
|
|
Copyright (C) 2011-2022 Free Software Foundation, Inc.
|
2017-10-06 15:59:32 +00:00
|
|
|
This file is part of the GNU C Library.
|
|
|
|
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
|
|
modify it under the terms of the GNU Lesser General Public
|
|
|
|
License as published by the Free Software Foundation; either
|
|
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
Lesser General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
|
|
License along with the GNU C Library. If not, see
|
Prefer https to http for gnu.org and fsf.org URLs
Also, change sources.redhat.com to sourceware.org.
This patch was automatically generated by running the following shell
script, which uses GNU sed, and which avoids modifying files imported
from upstream:
sed -ri '
s,(http|ftp)(://(.*\.)?(gnu|fsf|sourceware)\.org($|[^.]|\.[^a-z])),https\2,g
s,(http|ftp)(://(.*\.)?)sources\.redhat\.com($|[^.]|\.[^a-z]),https\2sourceware.org\4,g
' \
$(find $(git ls-files) -prune -type f \
! -name '*.po' \
! -name 'ChangeLog*' \
! -path COPYING ! -path COPYING.LIB \
! -path manual/fdl-1.3.texi ! -path manual/lgpl-2.1.texi \
! -path manual/texinfo.tex ! -path scripts/config.guess \
! -path scripts/config.sub ! -path scripts/install-sh \
! -path scripts/mkinstalldirs ! -path scripts/move-if-change \
! -path INSTALL ! -path locale/programs/charmap-kw.h \
! -path po/libc.pot ! -path sysdeps/gnu/errlist.c \
! '(' -name configure \
-execdir test -f configure.ac -o -f configure.in ';' ')' \
! '(' -name preconfigure \
-execdir test -f preconfigure.ac ';' ')' \
-print)
and then by running 'make dist-prepare' to regenerate files built
from the altered files, and then executing the following to cleanup:
chmod a+x sysdeps/unix/sysv/linux/riscv/configure
# Omit irrelevant whitespace and comment-only changes,
# perhaps from a slightly-different Autoconf version.
git checkout -f \
sysdeps/csky/configure \
sysdeps/hppa/configure \
sysdeps/riscv/configure \
sysdeps/unix/sysv/linux/csky/configure
# Omit changes that caused a pre-commit check to fail like this:
# remote: *** error: sysdeps/powerpc/powerpc64/ppc-mcount.S: trailing lines
git checkout -f \
sysdeps/powerpc/powerpc64/ppc-mcount.S \
sysdeps/unix/sysv/linux/s390/s390-64/syscall.S
# Omit change that caused a pre-commit check to fail like this:
# remote: *** error: sysdeps/sparc/sparc64/multiarch/memcpy-ultra3.S: last line does not end in newline
git checkout -f sysdeps/sparc/sparc64/multiarch/memcpy-ultra3.S
2019-09-07 05:40:42 +00:00
|
|
|
<https://www.gnu.org/licenses/>. */
|
2017-10-06 15:59:32 +00:00
|
|
|
|
|
|
|
#include <sysdep.h>
|
|
|
|
|
|
|
|
/* For __ARM_NEON__ this file defines memchr.

   When __ARM_NEON__ is not defined, the routine below is assembled under
   the name __memchr_neon instead, and libc_hidden_builtin_def is
   redefined to expand to nothing so that the use at the end of this
   file emits no hidden alias for memchr.  */
#ifndef __ARM_NEON__
# define memchr __memchr_neon
# undef libc_hidden_builtin_def
# define libc_hidden_builtin_def(a)
#endif

	/* The code below uses ARMv7-A and NEON instructions regardless of
	   the default target selected on the assembler command line.  */
	.arch	armv7-a
	.fpu	neon
|
|
|
|
|
|
|
|
|
|
|
|
/* Arguments */
#define srcin		r0	/* Start of the buffer to search */
#define chrin		r1	/* Character to look for (low byte is used) */
#define cntin		r2	/* Number of bytes to search */

/* Retval */
#define result		r0	/* Live range does not overlap with srcin */

/* Working registers */
#define src		r1	/* Live range does not overlap with chrin */
#define tmp		r3	/* Scratch */
#define synd		r0	/* No overlap with srcin or result */
#define soff		r12	/* Offset of srcin inside its 32-byte block;
				   also scratch while building vrepmask */

/* Working NEON registers */
#define vrepchr		q0	/* Search byte replicated in every lane */
#define vdata0		q1	/* First 16 bytes of the current block */
#define vdata0_0	d2	/* Lower half of vdata0 */
#define vdata0_1	d3	/* Upper half of vdata0 */
#define vdata1		q2	/* Second 16 bytes of the current block */
#define vdata1_0	d4	/* Lower half of vdata1 */
#define vdata1_1	d5	/* Upper half of vdata1 */
#define vrepmask	q3	/* Per-lane bit mask used to build the syndrome */
#define vrepmask0	d6	/* Lower half of vrepmask */
#define vrepmask1	d7	/* Upper half of vrepmask */
#define vend		q4	/* Fast "any match?" accumulator; saved and
				   restored by the function around its use */
#define vend0		d8	/* Lower half of vend */
#define vend1		d9	/* Upper half of vend */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Core algorithm:
|
|
|
|
*
|
|
|
|
* For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per
|
|
|
|
* byte. Each bit is set if the relevant byte matched the requested character
|
|
|
|
* and cleared otherwise. Since the bits in the syndrome reflect exactly the
|
|
|
|
* order in which things occur in the original string, counting trailing zeros
|
|
|
|
* allows us to identify exactly which byte has matched.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* void *memchr (const void *s, int c, size_t n)

   In:     r0 (srcin) = s, r1 (chrin) = c, r2 (cntin) = n.
	   Only the low byte of chrin takes part in the comparison.
   Out:    r0 (result) = address of the first byte equal to c among the
	   first n bytes of s, or 0 if there is none.
   Clobb:  r1, r2, r3, r12, q0-q3 and the condition flags.  q4 (d8/d9)
	   is pushed before the main loop and popped after it, so 16
	   bytes of stack are used transiently.

   Buffers shorter than 8 bytes are scanned one byte at a time.  Longer
   buffers are read as 32-byte aligned blocks, so bytes before s and
   after s + n that share an aligned 32-byte block with the searched
   range are loaded as well; matches found there are masked out of the
   syndrome before the result is computed.  */

	.thumb_func
	.p2align 4,,15

ENTRY(memchr)
	/* Use a simple loop if there are less than 8 bytes to search.  */
	cmp	cntin, #7
	bhi	.Llargestr
	/* ldrb zero-extends, so compare against the low byte of chrin.  */
	and	chrin, chrin, #0xff

.Lsmallstr:
	subs	cntin, cntin, #1
	blo	.Lnotfound	/* Return not found if reached end.  */
	ldrb	tmp, [srcin], #1
	cmp	tmp, chrin
	bne	.Lsmallstr	/* Loop again if not found.  */
	/* Otherwise fixup address and return.  */
	sub	result, srcin, #1
	bx	lr


.Llargestr:
	vdup.8	vrepchr, chrin	/* Duplicate char across all lanes.  */
	/*
	 * Magic constant 0x8040201008040201 allows us to identify which lane
	 * matches the requested byte.
	 */
	movw	tmp, #0x0201
	movt	tmp, #0x0804	/* tmp  = 0x08040201 (low word)  */
	lsl	soff, tmp, #4	/* soff = 0x80402010 (high word)  */
	vmov	vrepmask0, tmp, soff
	vmov	vrepmask1, tmp, soff
	/* Work with aligned 32-byte chunks */
	bic	src, srcin, #31
	ands	soff, srcin, #31
	beq	.Lloopintro	/* Go straight to main loop if it's aligned.  */

	/*
	 * Input string is not 32-byte aligned. We calculate the syndrome
	 * value for the aligned 32 bytes block containing the first bytes
	 * and mask the irrelevant part.
	 */
	vld1.8	{vdata0, vdata1}, [src:256]!
	sub	tmp, soff, #32
	/* cntin -= bytes of the buffer that live in this first block.
	   The flags set here are consumed by the bls further down.  */
	adds	cntin, cntin, tmp
	vceq.i8	vdata0, vdata0, vrepchr
	vceq.i8	vdata1, vdata1, vrepchr
	/* Keep one distinct bit per matching lane, then fold the 32 lanes
	   into a 32-bit value with pairwise adds.  */
	vand	vdata0, vdata0, vrepmask
	vand	vdata1, vdata1, vrepmask
	vpadd.i8	vdata0_0, vdata0_0, vdata0_1
	vpadd.i8	vdata1_0, vdata1_0, vdata1_1
	vpadd.i8	vdata0_0, vdata0_0, vdata1_0
	vpadd.i8	vdata0_0, vdata0_0, vdata0_0
	vmov	synd, vdata0_0[0]

	/* Clear the soff lower bits */
	lsr	synd, synd, soff
	lsl	synd, synd, soff
	/* The first block can also be the last */
	bls	.Lmasklast
	/* Have we found something already? */
	cbnz	synd, .Ltail


.Lloopintro:
	vpush	{vend}
	/* 264/265 correspond to d8/d9 for q4 */
	cfi_adjust_cfa_offset (16)
	cfi_rel_offset (264, 0)
	cfi_rel_offset (265, 8)
	.p2align 3,,7
.Lloop:
	vld1.8	{vdata0, vdata1}, [src:256]!
	subs	cntin, cntin, #32
	vceq.i8	vdata0, vdata0, vrepchr
	vceq.i8	vdata1, vdata1, vrepchr
	/* If we're out of data we finish regardless of the result.  */
	bls	.Lend
	/* Use a fast check for the termination condition.  */
	vorr	vend, vdata0, vdata1
	vorr	vend0, vend0, vend1
	vmov	synd, tmp, vend0
	orrs	synd, synd, tmp
	/* We're not out of data, loop if we haven't found the character.  */
	beq	.Lloop

.Lend:
	vpop	{vend}
	cfi_adjust_cfa_offset (-16)
	cfi_restore (264)
	cfi_restore (265)

	/* Termination condition found, let's calculate the syndrome value.  */
	vand	vdata0, vdata0, vrepmask
	vand	vdata1, vdata1, vrepmask
	vpadd.i8	vdata0_0, vdata0_0, vdata0_1
	vpadd.i8	vdata1_0, vdata1_0, vdata1_1
	vpadd.i8	vdata0_0, vdata0_0, vdata1_0
	vpadd.i8	vdata0_0, vdata0_0, vdata0_0
	vmov	synd, vdata0_0[0]
	cbz	synd, .Lnotfound
	/* On the fall-through path from the loop the orrs above has run
	   since the subs; it found a non-zero value (Z clear) and does not
	   change C, so "hi" still means data remained after this block.  */
	bhi	.Ltail	/* Uses the condition code from
			   subs cntin, cntin, #32 above.  */


.Lmasklast:
	/* Clear the (-cntin) upper bits to avoid out-of-bounds matches.  */
	neg	cntin, cntin
	lsl	synd, synd, cntin
	lsrs	synd, synd, cntin
	it	eq
	moveq	src, #0	/* If no match, set src to 0 so the retval is 0.  */
	/* With synd == 0 the rbit/clz pair below yields 32, which cancels
	   the "sub src, src, #32" and leaves result == 0.  */


.Ltail:
	/* Count the trailing zeros using bit reversing */
	rbit	synd, synd
	/* Compensate the last post-increment */
	sub	src, src, #32
	/* Count the leading zeros */
	clz	synd, synd
	/* Compute the potential result and return */
	add	result, src, synd
	bx	lr


.Lnotfound:
	/* Set result to NULL if not found and return */
	mov	result, #0
	bx	lr

END(memchr)
libc_hidden_builtin_def (memchr)
|