glibc/sysdeps/aarch64/strnlen.S

/* strnlen - calculate the length of a string with limit.

   Copyright (C) 2013-2020 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD (the main loop uses UMINV and CMEQ
 * on vector registers).
 */

/* Arguments and results.  srcin and len are both x0: the incoming
   pointer is last read by the instruction that first writes the
   result (sub len, src, srcin), or not needed at all on the
   hit_limit path.  */
#define srcin		x0
#define len		x0
#define limit		x1

/* Locals and temporaries.  */
#define src		x2	/* Aligned read pointer, post-incremented.  */
#define data1		x3	/* First 8 bytes of the current block.  */
#define data2		x4	/* Second 8 bytes of the current block.  */
#define data2a		x5	/* Masked copy of data2 (misaligned entry).  */
#define has_nul1	x6	/* NUL syndrome for data1.  */
#define has_nul2	x7	/* NUL syndrome for data2.  */
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define tmp4		x11
#define zeroones	x12	/* Holds REP8_01.  */
#define pos		x13	/* Bit position of the first NUL.  */
#define limit_wd	x14	/* Remaining 16-byte blocks, minus one.  */

/* Vector register views.  dataq/datav name register 2, which receives
   each 16-byte block in the main loop; datab2/dataq2/datav2 name
   register 3, which receives the UMINV result.  dataq2 is not
   referenced by the code in this file.  */
#define dataq		q2
#define datav		v2
#define datab2		b3
#define dataq2		q3
#define datav2		v3
/* Byte-replicated constants for the scalar NUL test.  REP8_80 is not
   referenced by the code in this file.  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

/* __strnlen: length of a string, examining at most LIMIT bytes.

   In:   x0 (srcin) = start of the string
	 x1 (limit) = maximum number of bytes to examine
   Out:  x0 (len)   = MIN (offset of the first NUL byte, limit)

   Uses x0-x14, v2, v3 and the condition flags (see the #defines
   above); no stack access is visible in this code.

   Outline:
   1) limit == 0 returns 0 at once.
   2) src is srcin rounded down to 16 bytes, so every load is an
      aligned 16-byte block.  Bytes of such a block that lie before
      srcin, after the terminating NUL or after the limit are loaded
      as well; those in front of srcin are masked out in
      L(misaligned), the others are discarded by the final MIN.
   3) The first block is checked with the scalar syndrome at
      L(realigned); every later block is checked in the vector loop
      at L(loop), which is unrolled twice.
   4) limit_wd counts the 16-byte blocks still allowed.  It goes
      negative when the block just checked is the last one the limit
      permits.

   ENTRY_ALIGN_AND_PAD and DELOUSE come from sysdep.h, which is not
   visible here.  NOTE(review): presumably (6, 9) requests 2^6-byte
   alignment with 9 instructions of padding, and DELOUSE (n)
   zero-extends register n under ILP32 -- confirm against sysdep.h.
   x2 is not an incoming argument (it is overwritten below as src),
   so DELOUSE (2) looks redundant.  */
ENTRY_ALIGN_AND_PAD (__strnlen, 6, 9)
	DELOUSE (0)
	DELOUSE (1)
	DELOUSE (2)
	cbz	limit, L(hit_limit)	/* limit == 0: return 0.  */
	mov	zeroones, #REP8_01
	bic	src, srcin, #15		/* src = srcin rounded down to 16.  */
	ands	tmp1, srcin, #15	/* tmp1 = misalignment in bytes.  */
	b.ne	L(misaligned)
	/* Calculate the number of full and partial Qwords -1.  */
	sub	limit_wd, limit, #1	/* Limit != 0, so no underflow.  */
	lsr	limit_wd, limit_wd, #4	/* Convert to Qwords.  */

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
	/* This scalar check handles the first 16-byte block as two
	   Dwords; all following blocks are handled by the vector loop at
	   L(loop).  */

	/* Start of critical section -- keep to one 64Byte cache line.  */

	ldp	data1, data2, [src], #16
L(realigned):
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	subs	limit_wd, limit_wd, #1	/* N is set once the limit is reached.  */
	orr	tmp1, has_nul1, has_nul2
	/* If blocks remain (pl), Z = (no NUL seen); otherwise NZCV = 0000,
	   so Z is clear and we fall through.  */
	ccmp	tmp1, #0, #0, pl	/* NZCV = 0000  */
	b.eq	L(loop)
	/* End of critical section -- keep to one 64Byte cache line.  */

	orr	tmp1, has_nul1, has_nul2
	cbz	tmp1, L(hit_limit)	/* No null in final Qword.  */

	/* We know there's a null in the final Qword.  The easiest thing
	   to do now is work out the length of the string and return
	   MIN (len, limit).  */

	sub	len, src, srcin		/* Offset just past this block.  */
	cbz	has_nul1, L(nul_in_data2)
#ifdef __AARCH64EB__
	/* The big-endian recomputation below works on data2, so put the
	   Dword that holds the NUL there.  */
	mov	data2, data1
#endif
	sub	len, len, #8		/* NUL is in the first Dword.  */
	mov	has_nul2, has_nul1
L(nul_in_data2):
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul directly.  The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time.  */
	rev	data2, data2
	sub	tmp1, data2, zeroones
	orr	tmp2, data2, #REP8_7f
	bic	has_nul2, tmp1, tmp2
#endif
	sub	len, len, #8		/* len = offset of the Dword with the NUL.  */
	rev	has_nul2, has_nul2	/* Earliest byte to the MSB for clz.  */
	clz	pos, has_nul2
	add	len, len, pos, lsr #3		/* Bits to bytes.  */
	cmp	len, limit
	csel	len, len, limit, ls		/* Return the lower value.  */
	RET

	/* Main loop: one 16-byte block per step, unrolled twice.  UMINV
	   leaves the smallest byte of the block in datab2, so tmp1 == 0
	   iff the block contains a NUL.  */
L(loop):
	ldr	dataq, [src], #16
	uminv	datab2, datav.16b
	mov	tmp1, datav2.d[0]
	subs	limit_wd, limit_wd, #1
	/* If blocks remain (pl), Z = (NUL found); otherwise NZCV = 0100,
	   so Z is set and the loop is left because of the limit.  */
	ccmp	tmp1, #0, #4, pl	/* NZCV = 0100  */
	b.eq	L(loop_end)
	ldr	dataq, [src], #16
	uminv	datab2, datav.16b
	mov	tmp1, datav2.d[0]
	subs	limit_wd, limit_wd, #1
	ccmp	tmp1, #0, #4, pl	/* NZCV = 0100  */
	b.ne	L(loop)
L(loop_end):
	/* End of critical section -- keep to one 64Byte cache line.  */

	/* tmp1 != 0 means the last block had no NUL, so the loop ended
	   because the limit was reached.  */
	cbnz	tmp1, L(hit_limit)	/* No null in final Qword.  */

	/* We know there's a null in the final Qword.  The easiest thing
	   to do now is work out the length of the string and return
	   MIN (len, limit).  */

#ifdef __AARCH64EB__
	/* Byte-reverse each 64-bit half of the block.  */
	rev64	datav.16b, datav.16b
#endif
	/* Set each NUL byte to 0xff and the rest to 0x00, move the data
	   into a pair of scalars and then compute the length from the
	   earliest NUL byte.  */

	cmeq	datav.16b, datav.16b, #0
	/* data1 receives the half that is first in memory, data2 the
	   second; which vector lane that is depends on endianness.  */
#ifdef __AARCH64EB__
	mov	data1, datav.d[1]
	mov	data2, datav.d[0]
#else
	mov	data1, datav.d[0]
	mov	data2, datav.d[1]
#endif
	cmp	data1, 0		/* ne: the NUL is in the first half.  */
	csel	data1, data1, data2, ne	/* Keep the half with the first NUL.  */
	sub	len, src, srcin
	sub	len, len, #16		/* len = offset of this block.  */
	rev	data1, data1		/* Earliest byte to the MSB for clz.  */
	add	tmp2, len, 8		/* Offset of the second half.  */
	clz	tmp1, data1
	/* Flags are still those of the cmp above: nothing in between
	   sets them.  */
	csel	len, len, tmp2, ne
	add	len, len, tmp1, lsr 3	/* Bits to bytes.  */
	cmp	len, limit
	csel	len, len, limit, ls		/* Return the lower value.  */
	RET

L(misaligned):
	/* Deal with a partial first block.  On entry tmp1 = srcin & 15
	   (non-zero).  We're doing two things in parallel here;
	   1) Calculate the number of Qwords (but avoiding overflow if
	      limit is near ULONG_MAX) - to do this we need to work out
	      limit + tmp1 - 1 as a 65-bit value before shifting it, so
	      the quotient and the low four bits are handled separately:
	      limit_wd = ((limit - 1) >> 4)
			 + ((((limit - 1) & 15) + tmp1) >> 4);
	   2) Load and mask the initial data words - we force the bytes
	      before the ones we are interested in to 0xff - this ensures
	      early bytes will not hit any zero detection.  */
	sub	limit_wd, limit, #1
	neg	tmp4, tmp1
	cmp	tmp1, #8		/* Flags consumed by csinv/csel below.  */

	and	tmp3, limit_wd, #15
	lsr	limit_wd, limit_wd, #4
	mov	tmp2, #~0

	ldp	data1, data2, [src], #16
	lsl	tmp4, tmp4, #3		/* Bytes beyond alignment -> bits.  */
	add	tmp3, tmp3, tmp1

	/* The register shift uses only tmp4 & 63, so tmp2 ends up with
	   0xff in the (tmp1 & 7) earliest byte lanes, or in all eight
	   lanes when tmp1 == 8.  */
#ifdef __AARCH64EB__
	/* Big-endian.  Early bytes are at MSB.  */
	lsl	tmp2, tmp2, tmp4	/* Shift (tmp4 & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsr	tmp2, tmp2, tmp4	/* Shift (tmp4 & 63).  */
#endif
	add	limit_wd, limit_wd, tmp3, lsr #4

	orr	data1, data1, tmp2
	orr	data2a, data2, tmp2

	/* tmp1 <= 8: the string starts in data1, so use the masked data1
	   and the untouched data2.  tmp1 > 8: the string starts in
	   data2, so data1 becomes all ones and data2 is the masked
	   copy.  */
	csinv	data1, data1, xzr, le
	csel	data2, data2, data2a, le
	b	L(realigned)

	/* No NUL within the first limit bytes: the result is limit.  */
L(hit_limit):
	mov	len, limit
	RET
END (__strnlen)
/* strnlen is made a weak alias of __strnlen.  libc_hidden_def and
   weak_alias are defined outside this file.  */
libc_hidden_def (__strnlen)
weak_alias (__strnlen, strnlen)
libc_hidden_def (strnlen)
AArch64: Adding optimized strnlen implementation. 2013-01-23 16:32:33 +00:00			`/* strnlen - calculate the length of a string with limit.`

Update copyright dates with scripts/update-copyrights. 2020-01-01 00:14:33 +00:00			`Copyright (C) 2013-2020 Free Software Foundation, Inc.`
AArch64: Adding optimized strnlen implementation. 2013-01-23 16:32:33 +00:00
			`This file is part of the GNU C Library.`

			`The GNU C Library is free software; you can redistribute it and/or`
			`modify it under the terms of the GNU Lesser General Public`
			`License as published by the Free Software Foundation; either`
			`version 2.1 of the License, or (at your option) any later version.`

			`The GNU C Library is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`Lesser General Public License for more details.`

			`You should have received a copy of the GNU Lesser General Public`
			`License along with the GNU C Library. If not, see`
Prefer https to http for gnu.org and fsf.org URLs Also, change sources.redhat.com to sourceware.org. This patch was automatically generated by running the following shell script, which uses GNU sed, and which avoids modifying files imported from upstream: sed -ri ' s,(http\|ftp)(://(.\.)?(gnu\|fsf\|sourceware)\.org($\|[^.]\|\.[^a-z])),https\2,g s,(http\|ftp)(://(.\.)?)sources\.redhat\.com($\|[^.]\|\.[^a-z]),https\2sourceware.org\4,g ' \ $(find $(git ls-files) -prune -type f \ ! -name '.po' \ ! -name 'ChangeLog' \ ! -path COPYING ! -path COPYING.LIB \ ! -path manual/fdl-1.3.texi ! -path manual/lgpl-2.1.texi \ ! -path manual/texinfo.tex ! -path scripts/config.guess \ ! -path scripts/config.sub ! -path scripts/install-sh \ ! -path scripts/mkinstalldirs ! -path scripts/move-if-change \ ! -path INSTALL ! -path locale/programs/charmap-kw.h \ ! -path po/libc.pot ! -path sysdeps/gnu/errlist.c \ ! '(' -name configure \ -execdir test -f configure.ac -o -f configure.in ';' ')' \ ! '(' -name preconfigure \ -execdir test -f preconfigure.ac ';' ')' \ -print) and then by running 'make dist-prepare' to regenerate files built from the altered files, and then executing the following to cleanup: chmod a+x sysdeps/unix/sysv/linux/riscv/configure # Omit irrelevant whitespace and comment-only changes, # perhaps from a slightly-different Autoconf version. 
git checkout -f \ sysdeps/csky/configure \ sysdeps/hppa/configure \ sysdeps/riscv/configure \ sysdeps/unix/sysv/linux/csky/configure # Omit changes that caused a pre-commit check to fail like this: # remote: * error: sysdeps/powerpc/powerpc64/ppc-mcount.S: trailing lines git checkout -f \ sysdeps/powerpc/powerpc64/ppc-mcount.S \ sysdeps/unix/sysv/linux/s390/s390-64/syscall.S # Omit change that caused a pre-commit check to fail like this: # remote: * error: sysdeps/sparc/sparc64/multiarch/memcpy-ultra3.S: last line does not end in newline git checkout -f sysdeps/sparc/sparc64/multiarch/memcpy-ultra3.S 2019-09-07 05:40:42 +00:00			`<https://www.gnu.org/licenses/>. */`
AArch64: Adding optimized strnlen implementation. 2013-01-23 16:32:33 +00:00
			`#include <sysdep.h>`

			`/* Assumptions:`
			`*`
			`* ARMv8-a, AArch64`
			`*/`

			`/* Arguments and results. */`
			`#define srcin x0`
			`#define len x0`
			`#define limit x1`

			`/* Locals and temporaries. */`
			`#define src x2`
			`#define data1 x3`
			`#define data2 x4`
			`#define data2a x5`
			`#define has_nul1 x6`
			`#define has_nul2 x7`
			`#define tmp1 x8`
			`#define tmp2 x9`
			`#define tmp3 x10`
			`#define tmp4 x11`
			`#define zeroones x12`
			`#define pos x13`
			`#define limit_wd x14`

aarch64: Optimized implementation of strnlen Optimize the strlen implementation by using vector operations and loop unrolling in main loop. Compared to aarch64/strnlen.S, it reduces latency of cases in bench-strnlen by 11%~24% when the length of src is greater than 64 bytes, with gains throughout the benchmark. Checked on aarch64-linux-gnu. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2019-12-19 13:49:46 +00:00			`#define dataq q2`
			`#define datav v2`
			`#define datab2 b3`
			`#define dataq2 q3`
			`#define datav2 v3`
AArch64: Adding optimized strnlen implementation. 2013-01-23 16:32:33 +00:00			`#define REP8_01 0x0101010101010101`
			`#define REP8_7f 0x7f7f7f7f7f7f7f7f`
			`#define REP8_80 0x8080808080808080`

			`ENTRY_ALIGN_AND_PAD (__strnlen, 6, 9)`
Partial ILP32 support for aarch64. * sysdeps/aarch64/crti.S: Add include of sysdep.h. (call_weak_fn): Use PTR_REG to get correct reg name in ILP32. * sysdeps/aarch64/dl-irel.h: Add include of sysdep.h. (elf_irela): Use AARCH64_R macro to get correct relocation in ILP32. * sysdeps/aarch64/dl-machine.h: Add include of sysdep.h. (elf_machine_load_address, RTLD_START, RTLD_START_1, RTLD_START, elf_machine_type_class, ELF_MACHINE_JMP_SLOT, elf_machine_rela, elf_machine_lazy_rel): Add ifdef's for ILP32 support. * sysdeps/aarch64/dl-tlsdesc.S (_dl_tlsdesc_return, _dl_tlsdesc_return_lazy, _dl_tlsdesc_dynamic, _dl_tlsdesc_resolve_hold): Extend pointers in ILP32, use PTR_REG to get correct reg name for ILP32. * sysdeps/aarch64/dl-trampoline.S (ip01): New Macro. (RELA_SIZE): New Macro. (_dl_runtime_resolve, _dl_runtime_profile): Use new macros and PTR_REG to support ILP32. * sysdeps/aarch64/jmpbuf-unwind.h (_JMPBUF_CFA_UNWINDS_ADJ): Add cast for ILP32 mode. * sysdeps/aarch64/memcmp.S (memcmp): Extend arg pointers for ILP32 mode. * sysdeps/aarch64/memcpy.S (memmove, memcpy): Ditto. * sysdeps/aarch64/memset.S (__memset): Ditto. * sysdeps/aarch64/strchr.S (strchr): Ditto. * sysdeps/aarch64/strchrnul.S (__strchrnul): Ditto. * sysdeps/aarch64/strcmp.S (strcmp): Ditto. * sysdeps/aarch64/strcpy.S (strcpy): Ditto. * sysdeps/aarch64/strlen.S (__strlen): Ditto. * sysdeps/aarch64/strncmp.S (strncmp): Ditto. * sysdeps/aarch64/strnlen.S (strnlen): Ditto. * sysdeps/aarch64/strrchr.S (strrchr): Ditto. * sysdeps/unix/sysv/linux/aarch64/clone.S: Ditto. * sysdeps/unix/sysv/linux/aarch64/setcontext.S (__setcontext): Ditto. * sysdeps/unix/sysv/linux/aarch64/swapcontext.S (__swapcontext): Ditto. * sysdeps/aarch64/__longjmp.S (__longjmp): Extend pointers in ILP32, change PTR_MANGLE call to use register numbers instead of names. * sysdeps/unix/sysv/linux/aarch64/getcontext.S (__getcontext): Ditto. 
* sysdeps/aarch64/setjmp.S (__sigsetjmp): Extend arg pointers for ILP32 mode, change PTR_MANGLE calls to use register numbers. * sysdeps/aarch64/start.S (_start): Ditto. * sysdeps/aarch64/nptl/bits/pthreadtypes.h (__PTHREAD_RWLOCK_INT_FLAGS_SHARED): New define. (__SIZEOF_PTHREAD_ATTR_T, __SIZEOF_PTHREAD_MUTEX_T, __SIZEOF_PTHREAD_MUTEXATTR_T, __SIZEOF_PTHREAD_COND_T, __SIZEOF_PTHREAD_COND_COMPAT_T, __SIZEOF_PTHREAD_CONDATTR_T, __SIZEOF_PTHREAD_RWLOCK_T, __SIZEOF_PTHREAD_RWLOCKATTR_T, __SIZEOF_PTHREAD_BARRIER_T, __SIZEOF_PTHREAD_BARRIERATTR_T): Make defined values dependent on __ILP32__. * sysdeps/aarch64/nptl/bits/semaphore.h (__SIZEOF_SEM_T): Change define. (sem_t): Change __align type. * sysdeps/aarch64/sysdep.h (AARCH64_R, PTR_REG, PTR_LOG_SIZE, DELOUSE, PTR_SIZE): New Macros. (LDST_PCREL, LDST_GLOBAL) Update to use PTR_REG. * sysdeps/unix/sysv/linux/aarch64/bits/fcntl.h (O_LARGEFILE): Set when in ILP32 mode. (F_GETLK64, F_SETLK64, F_SETLKW64): Only set in LP64 mode. * sysdeps/unix/sysv/linux/aarch64/dl-cache.h (DL_CACHE_DEFAULT_ID): Set elf flags for ILP32. (add_system_dir): Set ILP32 library directories. * sysdeps/unix/sysv/linux/aarch64/init-first.c (_libc_vdso_platform_setup): Set minimum kernel version for ILP32. * sysdeps/unix/sysv/linux/aarch64/ldconfig.h (SYSDEP_KNOWN_INTERPRETER_NAMES): Add ILP32 names. * sysdeps/unix/sysv/linux/aarch64/sigcontextinfo.h (GET_PC, SET_PC): New Macros. * sysdeps/unix/sysv/linux/aarch64/sysdep.h: Handle ILP32 pointers. 2016-11-28 17:01:23 +00:00			`DELOUSE (0)`
			`DELOUSE (1)`
			`DELOUSE (2)`
AArch64: Adding optimized strnlen implementation. 2013-01-23 16:32:33 +00:00			`cbz limit, L(hit_limit)`
			`mov zeroones, #REP8_01`
			`bic src, srcin, #15`
			`ands tmp1, srcin, #15`
			`b.ne L(misaligned)`
			`/* Calculate the number of full and partial words -1. */`
			`sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */`
			`lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */`

			`/* NUL detection works on the principle that (X - 1) & (~X) & 0x80`
			`(=> (X - 1) & ~(X \| 0x7f)) is non-zero iff a byte is zero, and`
			`can be done in parallel across the entire word. */`
			`/* The inner loop deals with two Dwords at a time. This has a`
			`slightly higher start-up cost, but we should win quite quickly,`
			`especially on cores with a high number of issue slots per`
			`cycle, as we get much better parallelism out of the operations. */`

			`/* Start of critical section -- keep to one 64Byte cache line. */`
aarch64: Optimized implementation of strnlen Optimize the strlen implementation by using vector operations and loop unrolling in main loop. Compared to aarch64/strnlen.S, it reduces latency of cases in bench-strnlen by 11%~24% when the length of src is greater than 64 bytes, with gains throughout the benchmark. Checked on aarch64-linux-gnu. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2019-12-19 13:49:46 +00:00
AArch64: Adding optimized strnlen implementation. 2013-01-23 16:32:33 +00:00			`ldp data1, data2, [src], #16`
			`L(realigned):`
			`sub tmp1, data1, zeroones`
			`orr tmp2, data1, #REP8_7f`
			`sub tmp3, data2, zeroones`
			`orr tmp4, data2, #REP8_7f`
			`bic has_nul1, tmp1, tmp2`
			`bic has_nul2, tmp3, tmp4`
			`subs limit_wd, limit_wd, #1`
			`orr tmp1, has_nul1, has_nul2`
			`ccmp tmp1, #0, #0, pl /* NZCV = 0000 */`
			`b.eq L(loop)`
			`/* End of critical section -- keep to one 64Byte cache line. */`

			`orr tmp1, has_nul1, has_nul2`
			`cbz tmp1, L(hit_limit) /* No null in final Qword. */`

			`/* We know there's a null in the final Qword. The easiest thing`
			`to do now is work out the length of the string and return`
			`MIN (len, limit). */`

			`sub len, src, srcin`
			`cbz has_nul1, L(nul_in_data2)`
			`#ifdef __AARCH64EB__`
			`mov data2, data1`
			`#endif`
			`sub len, len, #8`
			`mov has_nul2, has_nul1`
			`L(nul_in_data2):`
			`#ifdef __AARCH64EB__`
			`/* For big-endian, carry propagation (if the final byte in the`
			`string is 0x01) means we cannot use has_nul directly. The`
			`easiest way to get the correct byte is to byte-swap the data`
			`and calculate the syndrome a second time. */`
			`rev data2, data2`
			`sub tmp1, data2, zeroones`
			`orr tmp2, data2, #REP8_7f`
			`bic has_nul2, tmp1, tmp2`
			`#endif`
			`sub len, len, #8`
			`rev has_nul2, has_nul2`
			`clz pos, has_nul2`
			`add len, len, pos, lsr #3 /* Bits to bytes. */`
			`cmp len, limit`
			`csel len, len, limit, ls /* Return the lower value. */`
			`RET`

aarch64: Optimized implementation of strnlen Optimize the strlen implementation by using vector operations and loop unrolling in main loop. Compared to aarch64/strnlen.S, it reduces latency of cases in bench-strnlen by 11%~24% when the length of src is greater than 64 bytes, with gains throughout the benchmark. Checked on aarch64-linux-gnu. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2019-12-19 13:49:46 +00:00			`L(loop):`
			`ldr dataq, [src], #16`
			`uminv datab2, datav.16b`
			`mov tmp1, datav2.d[0]`
			`subs limit_wd, limit_wd, #1`
			`ccmp tmp1, #0, #4, pl /* NZCV = 0000 */`
			`b.eq L(loop_end)`
			`ldr dataq, [src], #16`
			`uminv datab2, datav.16b`
			`mov tmp1, datav2.d[0]`
			`subs limit_wd, limit_wd, #1`
			`ccmp tmp1, #0, #4, pl /* NZCV = 0000 */`
			`b.ne L(loop)`
			`L(loop_end):`
			`/* End of critical section -- keep to one 64Byte cache line. */`

			`cbnz tmp1, L(hit_limit) /* No null in final Qword. */`

			`/* We know there's a null in the final Qword. The easiest thing`
			`to do now is work out the length of the string and return`
			`MIN (len, limit). */`

			`#ifdef __AARCH64EB__`
			`rev64 datav.16b, datav.16b`
			`#endif`
			`/* Set the NULL byte as 0xff and the rest as 0x00, move the data into a`
			`pair of scalars and then compute the length from the earliest NULL`
			`byte. */`

			`cmeq datav.16b, datav.16b, #0`
aarch64: fix strcpy and strnlen for big-endian [BZ #25824] This patch fixes the optimized implementation of strcpy and strnlen on a big-endian arm64 machine. The optimized method uses neon, which can process 128bit with one instruction. On a big-endian machine, the bit order should be reversed for the whole 128-bits double word. But with instruction rev64 datav.16b, datav.16b it reverses 64bits in the two halves rather than reversing 128bits. There is no such instruction as rev128 to reverse the 128bits, but we can fix this by loading the data registers accordingly. Fixes 0237b61526e7("aarch64: Optimized implementation of strcpy") and 2911cb68ed3d("aarch64: Optimized implementation of strnlen"). Signed-off-by: Lexi Shao <shaolexi@huawei.com> Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2020-05-15 10:48:59 +00:00			`#ifdef __AARCH64EB__`
			`mov data1, datav.d[1]`
			`mov data2, datav.d[0]`
			`#else`
aarch64: Optimized implementation of strnlen Optimize the strlen implementation by using vector operations and loop unrolling in main loop. Compared to aarch64/strnlen.S, it reduces latency of cases in bench-strnlen by 11%~24% when the length of src is greater than 64 bytes, with gains throughout the benchmark. Checked on aarch64-linux-gnu. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2019-12-19 13:49:46 +00:00			`mov data1, datav.d[0]`
			`mov data2, datav.d[1]`
aarch64: fix strcpy and strnlen for big-endian [BZ #25824] This patch fixes the optimized implementation of strcpy and strnlen on a big-endian arm64 machine. The optimized method uses neon, which can process 128bit with one instruction. On a big-endian machine, the bit order should be reversed for the whole 128-bits double word. But with instruction rev64 datav.16b, datav.16b it reverses 64bits in the two halves rather than reversing 128bits. There is no such instruction as rev128 to reverse the 128bits, but we can fix this by loading the data registers accordingly. Fixes 0237b61526e7("aarch64: Optimized implementation of strcpy") and 2911cb68ed3d("aarch64: Optimized implementation of strnlen"). Signed-off-by: Lexi Shao <shaolexi@huawei.com> Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> 2020-05-15 10:48:59 +00:00			`#endif`
aarch64: Optimized implementation of strnlen Optimize the strlen implementation by using vector operations and loop unrolling in main loop. Compared to aarch64/strnlen.S, it reduces latency of cases in bench-strnlen by 11%~24% when the length of src is greater than 64 bytes, with gains throughout the benchmark. Checked on aarch64-linux-gnu. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2019-12-19 13:49:46 +00:00			`cmp data1, 0`
			`csel data1, data1, data2, ne`
			`sub len, src, srcin`
			`sub len, len, #16`
			`rev data1, data1`
			`add tmp2, len, 8`
			`clz tmp1, data1`
			`csel len, len, tmp2, ne`
			`add len, len, tmp1, lsr 3`
			`cmp len, limit`
			`csel len, len, limit, ls /* Return the lower value. */`
			`RET`

AArch64: Adding optimized strnlen implementation. 2013-01-23 16:32:33 +00:00			`L(misaligned):`
			`/* Deal with a partial first word.`
			`We're doing two things in parallel here;`
			`1) Calculate the number of words (but avoiding overflow if`
			`limit is near ULONG_MAX) - to do this we need to work out`
			`limit + tmp1 - 1 as a 65-bit value before shifting it;`
			`2) Load and mask the initial data words - we force the bytes`
			`before the ones we are interested in to 0xff - this ensures`
			`early bytes will not hit any zero detection. */`
			`sub limit_wd, limit, #1`
			`neg tmp4, tmp1`
			`cmp tmp1, #8`

			`and tmp3, limit_wd, #15`
			`lsr limit_wd, limit_wd, #4`
			`mov tmp2, #~0`

			`ldp data1, data2, [src], #16`
			`lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */`
			`add tmp3, tmp3, tmp1`

			`#ifdef __AARCH64EB__`
			`/* Big-endian. Early bytes are at MSB. */`
			`lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */`
			`#else`
			`/* Little-endian. Early bytes are at LSB. */`
			`lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */`
			`#endif`
			`add limit_wd, limit_wd, tmp3, lsr #4`

			`orr data1, data1, tmp2`
			`orr data2a, data2, tmp2`

			`csinv data1, data1, xzr, le`
			`csel data2, data2, data2a, le`
			`b L(realigned)`

			`L(hit_limit):`
			`mov len, limit`
			`RET`
			`END (__strnlen)`
Use libc_hidden_proto / libc_hidden_def with __strnlen. Various code in glibc uses __strnlen instead of strnlen for namespace reasons. However, __strnlen does not use libc_hidden_proto / libc_hidden_def (as is normally done for any function defined and called within the same library, whether or not exported from the library and whatever namespace it is in), so the compiler does not know that those calls are to a function within libc. This patch uses libc_hidden_proto / libc_hidden_def with __strnlen. On x86_64, it makes no difference to the installed stripped shared libraries. On 32-bit x86, it causes __strnlen calls to go to the same place as strnlen calls (the fallback strnlen implementation), rather than through a PLT entry for the strnlen IFUNC; I'm not sure of the logic behind when calls from within libc should use IFUNCs versus when they should go direct to a particular function implementation, but clearly it doesn't make sense for strnlen and __strnlen to be handled differently in this regard. Tested for x86_64 and x86 (testsuite, and comparison of installed shared libraries as described above). * string/strnlen.c [!STRNLEN] (__strnlen): Use libc_hidden_def. * include/string.h (__strnlen): Use libc_hidden_proto. * sysdeps/aarch64/strnlen.S (__strnlen): Use libc_hidden_def. * sysdeps/i386/i686/multiarch/strnlen-c.c [SHARED] (libc_hidden_def): Define __GI___strnlen as well as __GI_strnlen. * sysdeps/powerpc/powerpc32/power4/multiarch/strnlen-power7.S (libc_hidden_def): Undefine and redefine. * sysdeps/powerpc/powerpc32/power4/multiarch/strnlen-ppc32.c [SHARED] (libc_hidden_def): Define __GI___strnlen as well as __GI_strnlen. * sysdeps/powerpc/powerpc32/power7/strnlen.S (__strnlen): Use libc_hidden_def. * sysdeps/tile/tilegx/strnlen.c (__strnlen): Likewise. 2015-06-02 20:24:25 +00:00			`libc_hidden_def (__strnlen)`
AArch64: Adding optimized strnlen implementation. 2013-01-23 16:32:33 +00:00			`weak_alias (__strnlen, strnlen)`
			`libc_hidden_def (strnlen)`