AArch64: Improve strrchr

Use shrn for narrowing the mask which simplifies code and speeds up small strings. Unroll the first search loop to improve performance on large strings. Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> (cherry picked from commit 55599d4804)
2025-01-08 10:20:15 +00:00 · 2023-01-11 13:53:19 +00:00 · 2023-01-11 13:53:19 +00:00 · 3972a306be
commit 3972a306be
parent 34bf1b598e
1 changed files with 34 additions and 26 deletions
--- a/sysdeps/aarch64/strrchr.S
+++ b/sysdeps/aarch64/strrchr.S
@ -22,19 +22,16 @@
 /* Assumptions:
 *
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, Advanced SIMD.
 * Neon Available.
 * MTE compatible.
 */
 /* Arguments and results.  */
 #define srcin		x0
 #define chrin		w1
 #define result		x0
 #define src		x2
 #define tmp		x3
 #define wtmp		w3
 #define synd		x3
 #define shift		x4
 #define src_match	x4
@ -46,7 +43,6 @@
 #define vhas_nul	v2
 #define vhas_chr	v3
 #define vrepmask	v4
 #define vrepmask2	v5
 #define vend		v5
 #define dend		d5
@ -58,59 +54,71 @@
   the relevant byte matched the requested character; bits 2-3 are set
   if the relevant byte matched the NUL end of string.  */
-ENTRY(strrchr)
+ENTRY (strrchr)
 	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0x3003
+	movi	vrepmask.16b, 0x33
-	dup	vrepmask.8h, wtmp
+	ld1	{vdata.16b}, [src]
 	tst	srcin, 15
 	beq	L(loop1)
 	ld1	{vdata.16b}, [src], 16
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	mov	wtmp, 0xf00f
 	dup	vrepmask2.8h, wtmp
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+	shrn	vend.8b, vhas_nul.8h, 4
 	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
 	lsl	shift, srcin, 2
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	lsl	synd, synd, shift
 	ands	nul_match, synd, 0xcccccccccccccccc
 	bne	L(tail)
-	cbnz	synd, L(loop2)
+	cbnz	synd, L(loop2_start)
-	.p2align 5
+	.p2align 4
 L(loop1):
-	ld1	{vdata.16b}, [src], 16
+	ldr	q1, [src, 16]
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
 	fmov	synd, dend
 	cbnz	synd, L(loop1_end)
 	ldr	q1, [src, 32]!
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
 	fmov	synd, dend
 	cbz	synd, L(loop1)
-
+	sub	src, src, 16
 L(loop1_end):
 	add	src, src, 16
 	cmeq	vhas_nul.16b, vdata.16b, 0
-	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+#ifdef __AARCH64EB__
-	bic	vhas_nul.8h, 0x0f, lsl 8
+	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4
 	fmov	synd, dend
 	rbit	synd, synd
 #else
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
 	shrn	vend.8b, vhas_nul.8h, 4
 	fmov	synd, dend
 #endif
 	ands	nul_match, synd, 0xcccccccccccccccc
-	beq	L(loop2)
+	beq	L(loop2_start)
 L(tail):
 	sub	nul_match, nul_match, 1
 	and	chr_match, synd, 0x3333333333333333
 	ands	chr_match, chr_match, nul_match
-	sub	result, src, 1
+	add	result, src, 15
 	clz	tmp, chr_match
 	sub	result, result, tmp, lsr 2
 	csel	result, result, xzr, ne
 	ret
 	.p2align 4
 	nop
 	nop
 L(loop2_start):
 	add	src, src, 16
 	bic	vrepmask.8h, 0xf0
 L(loop2):
 	cmp	synd, 0
 	csel	src_match, src, src_match, ne