glibc/sysdeps/aarch64/multiarch/memchr_nosimd.S

224 lines
4.9 KiB
ArmAsm

/* memchr - find a character in a memory zone using base integer registers
Copyright (C) 2018-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
/* Assumptions:
*
* ARMv8-a, AArch64
* Use base integer registers.
*/
#ifndef MEMCHR
# define MEMCHR __memchr_nosimd
#endif
/* Arguments and results. */
#define srcin x0
#define chrin x1
#define cntin x2
#define result x0
#define repchr x1
#define tmp1 x2
#define tmp2 x3
#define tmp3 x4
#define tmp4 x5
#define src x6
#define srcend x7
#define srcend16 x8
#define anymore x9
#define zeroones x10
#define data1 x11
#define data2 x12
#define has_chr1 x13
#define has_chr2 x14
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
ENTRY_ALIGN (MEMCHR, 6)
DELOUSE (0)
DELOUSE (2)
/* Do not dereference srcin if no bytes to compare. */
cbz cntin, L(none_chr)
/* Start address is 16-byte aligned or not? */
tst srcin, 15
bic src, srcin, 15
mov zeroones, REP8_01
and repchr, chrin, 255
/* Generate a qword integer as |c|c|c|c|c|c|c|c|. */
mul repchr, repchr, zeroones
add srcend, srcin, cntin
/*
* srcend16 is address of the block following the last block.
*
* [A block is 16-byte aligned and sized.]
*/
add srcend16, srcend, 15
bic srcend16, srcend16, 15
b.eq L(loop)
/* Load the first block containing start address. */
ldp data1, data2, [src], 16
lsl tmp1, srcin, 3
mov tmp2, ~0
#ifdef __AARCH64EB__
lsr tmp3, tmp2, tmp1
#else
lsl tmp3, tmp2, tmp1
#endif
/* Start address is in the first or the second qword? */
tst srcin, 8
/*
* Transform any byte in the block to zero using XOR operation,
* if that byte equals the char to search. In this way, searching
* the char becomes detecting zero in the resulting two qwords.
*/
eor data1, data1, repchr
eor data2, data2, repchr
/*
* Set those unused bytes(before start address) to 0xff, so
* that they will not hit any zero detection.
*/
orn tmp1, data1, tmp3
orn tmp2, data2, tmp3
csinv data1, tmp1, xzr, eq
csel data2, data2, tmp2, eq
/*
* When the first and last block are the same, there are two cases:
* o. Memory range to search is just in one block.
* ( start address - end address) < 0
*
* o. Memory range is so large that end address wrap-around.
* ( start address - end address) > 0
*/
cmp srcin, srcend
ccmp src, srcend16, 0, mi
csetm anymore, ne
b L(find_chr)
.p2align 4
L(loop):
ldp data1, data2, [src], 16
subs anymore, src, srcend16
/*
* Transform any byte in the block to zero using XOR operation,
* if that byte equals the char to search.
*/
eor data1, data1, repchr
eor data2, data2, repchr
L(find_chr):
/*
* Use the following integer test to find out if any byte in a
* qword is zero. If do not contain zero-valued byte, test result
* is zero.
*
* (qword - 0x0101010101010101) & ~(qword) & 0x8080808080808080
* =
* (qword - 0x0101010101010101) & ~(qword | 0x7f7f7f7f7f7f7f7f)
*
*/
sub tmp1, data1, zeroones
sub tmp2, data2, zeroones
orr tmp3, data1, REP8_7f
orr tmp4, data2, REP8_7f
bic has_chr1, tmp1, tmp3
bic has_chr2, tmp2, tmp4
orr tmp1, has_chr1, has_chr2
ccmp tmp1, 0, 0, ne
b.eq L(loop)
cbz has_chr1, 1f
sub result, src, 16
#ifdef __AARCH64EB__
rev data1, data1
#else
rev has_chr1, has_chr1
#endif
b L(done)
1: cbz has_chr2, L(none_chr)
sub result, src, 8
#ifdef __AARCH64EB__
rev data1, data2
#else
rev has_chr1, has_chr2
#endif
L(done):
#ifdef __AARCH64EB__
/*
* For big-endian, can not directly use has_chr1/has_chr2 because
* two qwords has been reversed after loading from memory.
* Thus, have to perform char detection on two qwords again, which
* should be byte-swapped this time.
*/
sub tmp1, data1, zeroones
orr tmp3, data1, REP8_7f
bic has_chr1, tmp1, tmp3
rev has_chr1, has_chr1
#endif
/*
* If the specified char is found in a qword, the corresponding
* byte of in has_chr has value of 1, while this is only true for
* the first occurrence, not other occurrences.
*/
cmp anymore, 0
clz tmp1, has_chr1
add result, result, tmp1, lsr 3
ccmp result, srcend, 8, eq /* NZCV = 8000 */
csel result, result, xzr, mi
ret
L(none_chr):
mov result, 0
ret
END (MEMCHR)
libc_hidden_builtin_def (MEMCHR)