x86: Optimize strchr-avx2.S

No bug. This commit optimizes strchr-avx2.S. The optimizations are all
small things such as saving an ALU operation in the alignment process,
saving a few instructions in the loop return, saving some bytes in the
main loop, and increasing the ILP in the return cases. test-strchr,
test-strchrnul, test-wcschr, and test-wcschrnul are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Author: Noah Goldstein
Date: 2021-04-23 15:56:24 -04:00
Committed-by: H.J. Lu
Parent: 24f261f27f
Commit: ccabe7971f
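
For reference, the routines touched here implement the standard strchr/strchrnul
lookups (and their wide-character counterparts wcschr/wcschrnul). The following
is a minimal scalar sketch in C of the behavior the AVX2 code must preserve and
that the listed tests exercise; it is only a reference model, not the glibc
implementation, and the ref_* names are made up for illustration.

    #include <stddef.h>

    /* Reference strchrnul: return a pointer to the first occurrence of c
       in s, or to the terminating null byte if c does not occur.  */
    static char *
    ref_strchrnul (const char *s, int c)
    {
      while (*s != (char) c && *s != '\0')
        s++;
      return (char *) s;
    }

    /* Reference strchr: same search, but return NULL when c is not found
       (searching for '\0' still returns the terminator).  */
    static char *
    ref_strchr (const char *s, int c)
    {
      char *r = ref_strchrnul (s, c);
      return *r == (char) c ? r : NULL;
    }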

--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -49,132 +49,144 @@
 	.section SECTION(.text),"ax",@progbits
 ENTRY (STRCHR)
-	movl %edi, %ecx
-# ifndef USE_AS_STRCHRNUL
-	xorl %edx, %edx
-# endif
 	/* Broadcast CHAR to YMM0.  */
 	vmovd %esi, %xmm0
+	movl %edi, %eax
+	andl $(PAGE_SIZE - 1), %eax
+	VPBROADCAST %xmm0, %ymm0
 	vpxor %xmm9, %xmm9, %xmm9
-	VPBROADCAST %xmm0, %ymm0
 	/* Check if we cross page boundary with one vector load.  */
-	andl $(PAGE_SIZE - 1), %ecx
-	cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
+	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
 	ja L(cross_page_boundary)
 	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
 	   null byte.  */
 	vmovdqu (%rdi), %ymm8
 	VPCMPEQ %ymm8, %ymm0, %ymm1
 	VPCMPEQ %ymm8, %ymm9, %ymm2
 	vpor %ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl %eax, %eax
-	jz L(more_vecs)
+	jz L(aligned_more)
 	tzcntl %eax, %eax
+# ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.  */
+	cmp (%rdi, %rax), %CHAR_REG
+	jne L(zero)
+# endif
 	addq %rdi, %rax
-# ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne %rdx, %rax
-# endif
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
-	.p2align 4
-L(more_vecs):
-	/* Align data for aligned loads in the loop.  */
-	andq $-VEC_SIZE, %rdi
-L(aligned_more):
-	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	vmovdqa VEC_SIZE(%rdi), %ymm8
-	addq $VEC_SIZE, %rdi
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor %ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl %eax, %eax
-	jnz L(first_vec_x0)
-	vmovdqa VEC_SIZE(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor %ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl %eax, %eax
-	jnz L(first_vec_x1)
-	vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor %ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl %eax, %eax
-	jnz L(first_vec_x2)
-	vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor %ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl %eax, %eax
-	jz L(prep_loop_4x)
-	tzcntl %eax, %eax
-	leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
-# ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne %rdx, %rax
-# endif
 	VZEROUPPER_RETURN
-	.p2align 4
-L(first_vec_x0):
+	/* .p2align 5 helps keep performance more consistent if ENTRY()
+	   alignment % 32 was either 16 or 0. As well this makes the
+	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
+	   easier.  */
+	.p2align 5
+L(first_vec_x4):
 	tzcntl %eax, %eax
-	/* Found CHAR or the null byte.  */
-	addq %rdi, %rax
+	addq $(VEC_SIZE * 3 + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne %rdx, %rax
+	/* Found CHAR or the null byte.  */
+	cmp (%rdi, %rax), %CHAR_REG
+	jne L(zero)
 # endif
+	addq %rdi, %rax
 	VZEROUPPER_RETURN
+# ifndef USE_AS_STRCHRNUL
+L(zero):
+	xorl %eax, %eax
+	VZEROUPPER_RETURN
+# endif
 	.p2align 4
 L(first_vec_x1):
 	tzcntl %eax, %eax
-	leaq VEC_SIZE(%rdi, %rax), %rax
+	incq %rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne %rdx, %rax
+	/* Found CHAR or the null byte.  */
+	cmp (%rdi, %rax), %CHAR_REG
+	jne L(zero)
 # endif
+	addq %rdi, %rax
 	VZEROUPPER_RETURN
 	.p2align 4
 L(first_vec_x2):
 	tzcntl %eax, %eax
-	/* Found CHAR or the null byte.  */
-	leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+	addq $(VEC_SIZE + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne %rdx, %rax
+	/* Found CHAR or the null byte.  */
+	cmp (%rdi, %rax), %CHAR_REG
+	jne L(zero)
 # endif
+	addq %rdi, %rax
 	VZEROUPPER_RETURN
-L(prep_loop_4x):
-	/* Align data to 4 * VEC_SIZE.  */
-	andq $-(VEC_SIZE * 4), %rdi
+	.p2align 4
+L(first_vec_x3):
+	tzcntl %eax, %eax
+	addq $(VEC_SIZE * 2 + 1), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.  */
+	cmp (%rdi, %rax), %CHAR_REG
+	jne L(zero)
+# endif
+	addq %rdi, %rax
+	VZEROUPPER_RETURN
+	.p2align 4
+L(aligned_more):
+	/* Align data to VEC_SIZE - 1. This is the same number of
+	   instructions as using andq -VEC_SIZE but saves 4 bytes of code
+	   on x4 check.  */
+	orq $(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  */
+	vmovdqa 1(%rdi), %ymm8
+	VPCMPEQ %ymm8, %ymm0, %ymm1
+	VPCMPEQ %ymm8, %ymm9, %ymm2
+	vpor %ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl %eax, %eax
+	jnz L(first_vec_x1)
+	vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8
+	VPCMPEQ %ymm8, %ymm0, %ymm1
+	VPCMPEQ %ymm8, %ymm9, %ymm2
+	vpor %ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl %eax, %eax
+	jnz L(first_vec_x2)
+	vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8
+	VPCMPEQ %ymm8, %ymm0, %ymm1
+	VPCMPEQ %ymm8, %ymm9, %ymm2
+	vpor %ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl %eax, %eax
+	jnz L(first_vec_x3)
+	vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8
+	VPCMPEQ %ymm8, %ymm0, %ymm1
+	VPCMPEQ %ymm8, %ymm9, %ymm2
+	vpor %ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl %eax, %eax
+	jnz L(first_vec_x4)
+	/* Align data to VEC_SIZE * 4 - 1.  */
+	addq $(VEC_SIZE * 4 + 1), %rdi
+	andq $-(VEC_SIZE * 4), %rdi
 	.p2align 4
 L(loop_4x_vec):
 	/* Compare 4 * VEC at a time forward.  */
-	vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
-	vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
-	vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
-	vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
+	vmovdqa (%rdi), %ymm5
+	vmovdqa (VEC_SIZE)(%rdi), %ymm6
+	vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
+	vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
 	/* Leaves only CHARS matching esi as 0.  */
 	vpxor %ymm5, %ymm0, %ymm1
@@ -190,62 +202,102 @@ L(loop_4x_vec):
 	VPMINU %ymm1, %ymm2, %ymm5
 	VPMINU %ymm3, %ymm4, %ymm6
-	VPMINU %ymm5, %ymm6, %ymm5
-	VPCMPEQ %ymm5, %ymm9, %ymm5
-	vpmovmskb %ymm5, %eax
-	addq $(VEC_SIZE * 4), %rdi
-	testl %eax, %eax
+	VPMINU %ymm5, %ymm6, %ymm6
+	VPCMPEQ %ymm6, %ymm9, %ymm6
+	vpmovmskb %ymm6, %ecx
+	subq $-(VEC_SIZE * 4), %rdi
+	testl %ecx, %ecx
 	jz L(loop_4x_vec)
 	VPCMPEQ %ymm1, %ymm9, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl %eax, %eax
-	jnz L(first_vec_x0)
-	VPCMPEQ %ymm2, %ymm9, %ymm2
+	jnz L(last_vec_x0)
+	VPCMPEQ %ymm5, %ymm9, %ymm2
 	vpmovmskb %ymm2, %eax
 	testl %eax, %eax
-	jnz L(first_vec_x1)
+	jnz L(last_vec_x1)
 	VPCMPEQ %ymm3, %ymm9, %ymm3
-	VPCMPEQ %ymm4, %ymm9, %ymm4
-	vpmovmskb %ymm3, %ecx
-	vpmovmskb %ymm4, %eax
-	salq $32, %rax
+	vpmovmskb %ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used
+	   if the first 3 other VEC all did not contain a match.  */
+	salq $32, %rcx
 	orq %rcx, %rax
 	tzcntq %rax, %rax
-	leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+	subq $(VEC_SIZE * 2), %rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne %rdx, %rax
+	/* Found CHAR or the null byte.  */
+	cmp (%rdi, %rax), %CHAR_REG
+	jne L(zero_end)
 # endif
+	addq %rdi, %rax
 	VZEROUPPER_RETURN
+	.p2align 4
+L(last_vec_x0):
+	tzcntl %eax, %eax
+	addq $-(VEC_SIZE * 4), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.  */
+	cmp (%rdi, %rax), %CHAR_REG
+	jne L(zero_end)
+# endif
+	addq %rdi, %rax
+	VZEROUPPER_RETURN
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+	xorl %eax, %eax
+	VZEROUPPER_RETURN
+# endif
+	.p2align 4
+L(last_vec_x1):
+	tzcntl %eax, %eax
+	subq $(VEC_SIZE * 3), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.  */
+	cmp (%rdi, %rax), %CHAR_REG
+	jne L(zero_end)
+# endif
+	addq %rdi, %rax
+	VZEROUPPER_RETURN
 	/* Cold case for crossing page with first load.  */
 	.p2align 4
 L(cross_page_boundary):
-	andq $-VEC_SIZE, %rdi
-	andl $(VEC_SIZE - 1), %ecx
-	vmovdqa (%rdi), %ymm8
+	movq %rdi, %rdx
+	/* Align rdi to VEC_SIZE - 1.  */
+	orq $(VEC_SIZE - 1), %rdi
+	vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8
 	VPCMPEQ %ymm8, %ymm0, %ymm1
 	VPCMPEQ %ymm8, %ymm9, %ymm2
 	vpor %ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
-	/* Remove the leading bits.  */
-	sarxl %ecx, %eax, %eax
+	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+	   so no need to manually mod edx.  */
+	sarxl %edx, %eax, %eax
 	testl %eax, %eax
-	jz L(aligned_more)
+	jz L(cross_page_continue)
 	tzcntl %eax, %eax
-	addq %rcx, %rdi
-	addq %rdi, %rax
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne %rdx, %rax
+	xorl %ecx, %ecx
+	/* Found CHAR or the null byte.  */
+	cmp (%rdx, %rax), %CHAR_REG
+	leaq (%rdx, %rax), %rax
+	cmovne %rcx, %rax
+# else
+	addq %rdx, %rax
 # endif
-	VZEROUPPER_RETURN
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 END (STRCHR)
 # endif
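
A note on the rewritten prologue above: the first VEC_SIZE-byte load is
unaligned, so the entry code checks whether that load would run off the
current page (page offset greater than PAGE_SIZE - VEC_SIZE) before touching
memory, and only then takes the cold L(cross_page_boundary) path. The
following is a hedged C sketch of that guard; the constants mirror the
assumptions in the .S file (4 KiB pages, 32-byte ymm vectors) and the helper
name is hypothetical.

    #include <stdbool.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096   /* assumption: 4 KiB pages, as in the .S file */
    #define VEC_SIZE  32     /* assumption: one 32-byte AVX2 (ymm) vector */

    /* Mirror of the prologue test: a VEC_SIZE-byte load at p stays within
       one page iff the page offset of p is at most PAGE_SIZE - VEC_SIZE.  */
    static bool
    first_load_crosses_page (const void *p)
    {
      uintptr_t off = (uintptr_t) p & (PAGE_SIZE - 1);  /* andl $(PAGE_SIZE - 1), %eax */
      return off > PAGE_SIZE - VEC_SIZE;                /* cmpl; ja L(cross_page_boundary) */
    }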