glibc/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S

/* strlen used for beginning of str{n}cat using EVEX 256/512.
   Copyright (C) 2011-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */


/* NOTE: This file is meant to be included by strcat-evex or
   strncat-evex and does not stand alone.  Before including it,
   %rdi must be saved in %rax.  */


/* Simple strlen implementation that ends at
   L(strcat_strlen_done).

   On entry %rdi points at the start of the string and the includer
   has already copied it to %rax (only the USE_AS_WCSCPY path reads
   %rax here).  On exit at L(strcat_strlen_done) %rdi holds the
   address of the first null character found.  %rax is not written.

   Scratch used: %r8, %rcx, %k0, %k1, %k3, %VMM(0)-%VMM(3), %VZERO
   and the flags.

   VPCMPEQ, VPMIN, VPTESTN, VMOVA, KMOV, KORTEST, VEC_SIZE and
   CHAR_SIZE are supplied by the including file.  */
	/* Zero the comparison register.  NOTE(review): assumes %VZERO is
	   the full-width view of %VZERO_128 -- confirm in the includer.  */
	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
	/* %r8 = %rdi rounded down to a VEC_SIZE boundary.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE * -1), %r8
	/* Compare the whole aligned vector that contains the start of the
	   string against zero.  The resulting mask also covers any
	   characters that lie before %rdi.  */
	VPCMPEQ	(%r8), %VZERO, %k0
	KMOV	%k0, %VRCX
#ifdef USE_AS_WCSCPY
	/* Turn %edi into the index of the first character within the
	   vector: byte offset from %r8 divided by 4.  */
	subl	%r8d, %edi
	shrl	$2, %edi
#endif
	/* Shift out the mask bits for characters before the string.  In
	   the non-wide build the shift count is the pointer itself, which
	   relies on shrx using only the low bits of the count.
	   NOTE(review): that requires the VRDI/VRCX operand width to equal
	   the number of characters per vector -- confirm.  */
	shrx	%VRDI, %VRCX, %VRCX
#ifdef USE_AS_WCSCPY
	/* Restore the string pointer that was overwritten above.  */
	movq	%rax, %rdi
#endif
	test	%VRCX, %VRCX
	/* %rdi is still the start of the string here and the shifted mask
	   is indexed from it.  */
	jnz	L(bsf_and_done_v0)


	/* Check the next four aligned vectors one at a time.  %rdi becomes
	   %r8 + VEC_SIZE and is the base that the bsf_and_done_v* labels
	   add to.  */
	VPCMPEQ	VEC_SIZE(%r8), %VZERO, %k0
	KMOV	%k0, %VRCX
	leaq	(VEC_SIZE)(%r8), %rdi
	test	%VRCX, %VRCX
	jnz	L(bsf_and_done_v0)

	VPCMPEQ	(VEC_SIZE * 2)(%r8), %VZERO, %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(bsf_and_done_v1)

	VPCMPEQ	(VEC_SIZE * 3)(%r8), %VZERO, %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(bsf_and_done_v2)

	VPCMPEQ	(VEC_SIZE * 4)(%r8), %VZERO, %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(bsf_and_done_v3)

	/* Round %rdi down to a VEC_SIZE * 4 boundary for the main loop.
	   The loop reads from VEC_SIZE * 4 past %rdi, so its first load
	   is at or before %r8 + VEC_SIZE * 5 and may re-scan vectors
	   already checked above, but never skips one.  */
	andq	$-(VEC_SIZE * 4), %rdi
	.p2align 4,, 8
	/* Main loop: four vectors per iteration, folded into two masks.  */
L(loop_2x_vec):
	/* Each VPMIN result has a zero element wherever either of its two
	   source vectors has one.  NOTE(review): assumes VPMIN is an
	   unsigned per-character minimum and VMOVA an aligned load --
	   confirm in the includer.  */
	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(0)
	VPMIN	(VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1)
	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(2)
	VPMIN	(VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3)
	/* %k1 / %k3 get a bit set for every zero element of the first /
	   second pair.  */
	VPTESTN	%VMM(1), %VMM(1), %k1
	VPTESTN	%VMM(3), %VMM(3), %k3
	/* Advance by VEC_SIZE * 4, so %rdi now addresses the vector held
	   in %VMM(0).  NOTE(review): presumably written as a subtraction
	   of a negative value so the immediate fits in a signed byte when
	   VEC_SIZE is 32.  */
	subq	$(VEC_SIZE * -4), %rdi
	/* Keep looping while neither mask has a bit set.  */
	KORTEST	%k1, %k3
	jz	L(loop_2x_vec)

	/* A null is somewhere in the four vectors just read.  Test them in
	   address order; %rdi is the address of the first one.  */
	VPTESTN	%VMM(0), %VMM(0), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(bsf_and_done_v0)

	/* The first vector has no null, so any bit in %k1 comes from the
	   second vector.  */
	KMOV	%k1, %VRCX
	test	%VRCX, %VRCX
	jnz	L(bsf_and_done_v1)

	VPTESTN	%VMM(2), %VMM(2), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(bsf_and_done_v2)

	/* Only the fourth vector is left, so %k3 must be non-zero; fall
	   through.  */
	KMOV	%k3, %VRCX
	/* Null is in the vector at %rdi + VEC_SIZE * 3: add one VEC_SIZE
	   and fall into the v2 case.  */
L(bsf_and_done_v3):
	addq	$VEC_SIZE, %rdi
	/* Null is in the vector at %rdi + VEC_SIZE * 2: add the vector
	   offset plus the index of the lowest set mask bit scaled by
	   CHAR_SIZE.  */
L(bsf_and_done_v2):
	bsf	%VRCX, %VRCX
	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi
	jmp	L(strcat_strlen_done)

	.p2align 4,, 4
	/* Null is in the vector at %rdi + VEC_SIZE: add one VEC_SIZE and
	   fall into the v0 case.  */
L(bsf_and_done_v1):
	addq	$VEC_SIZE, %rdi
	/* Null is in the vector at %rdi: add the index of the lowest set
	   mask bit, scaled by CHAR_SIZE in the wide-character build.  */
L(bsf_and_done_v0):
	bsf	%VRCX, %VRCX
#ifdef USE_AS_WCSCPY
	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
#else
	addq	%rcx, %rdi
#endif
	/* Exit point: %rdi addresses the null terminator and execution
	   continues in the including file.  */
L(strcat_strlen_done):
x86: Optimize and shrink st{r\|p}{n}{cat\|cpy}-evex functions Optimizations are: 1. Use more overlapping stores to avoid branches. 2. Reduce how unrolled the aligning copies are (this is more of a code-size save, it's a negative for some sizes in terms of perf). 3. Improve the loop a bit (similar to what we do in strlen with 2x vpminu + kortest instead of 3x vpminu + kmov + test). 4. For st{r\|p}n{cat\|cpy} re-order the branches to minimize the number that are taken. Performance Changes: Times are from N = 10 runs of the benchmark suite and are reported as geometric mean of all ratios of New Implementation / Old Implementation. stpcpy-evex -> 0.922 strcat-evex -> 0.985 strcpy-evex -> 0.880 strncpy-evex -> 0.831 stpncpy-evex -> 0.780 strncat-evex -> 0.958 Code Size Changes: function -> Bytes New / Bytes Old -> Ratio strcat-evex -> 819 / 1874 -> 0.437 strcpy-evex -> 700 / 1074 -> 0.652 stpcpy-evex -> 735 / 1094 -> 0.672 strncpy-evex -> 1397 / 2611 -> 0.535 stpncpy-evex -> 1489 / 2691 -> 0.553 strncat-evex -> 1184 / 2832 -> 0.418 Notes: 1. Because of the significant difference between the implementations they are split into three files. strcpy-evex.S -> strcpy, stpcpy, strcat strncpy-evex.S -> strncpy strncat-evex.S -> strncat I couldn't find a way to merge them without making the ifdefs incredibly difficult to follow. 2. All implementations can be made evex512 by including "x86-evex512-vecs.h" at the top. 3. All implementations have an optional define: `USE_EVEX_MASKED_STORE` Setting to one uses evex-masked stores for handling short strings. This saves code size and branches. It's disabled for all implementations at the moment as there are some serious drawbacks to masked stores in certain cases, but that may be fixed on future architectures. Full check passes on x86-64 and build succeeds for all ISA levels w/ and w/o multiarch. 2022-11-09 01:38:38 +00:00			`/* strlen used for beginning of str{n}cat using EVEX 256/512.`
Update copyright dates with scripts/update-copyrights 2023-01-06 21:08:04 +00:00			`Copyright (C) 2011-2023 Free Software Foundation, Inc.`
x86: Optimize and shrink st{r\|p}{n}{cat\|cpy}-evex functions Optimizations are: 1. Use more overlapping stores to avoid branches. 2. Reduce how unrolled the aligning copies are (this is more of a code-size save, it's a negative for some sizes in terms of perf). 3. Improve the loop a bit (similar to what we do in strlen with 2x vpminu + kortest instead of 3x vpminu + kmov + test). 4. For st{r\|p}n{cat\|cpy} re-order the branches to minimize the number that are taken. Performance Changes: Times are from N = 10 runs of the benchmark suite and are reported as geometric mean of all ratios of New Implementation / Old Implementation. stpcpy-evex -> 0.922 strcat-evex -> 0.985 strcpy-evex -> 0.880 strncpy-evex -> 0.831 stpncpy-evex -> 0.780 strncat-evex -> 0.958 Code Size Changes: function -> Bytes New / Bytes Old -> Ratio strcat-evex -> 819 / 1874 -> 0.437 strcpy-evex -> 700 / 1074 -> 0.652 stpcpy-evex -> 735 / 1094 -> 0.672 strncpy-evex -> 1397 / 2611 -> 0.535 stpncpy-evex -> 1489 / 2691 -> 0.553 strncat-evex -> 1184 / 2832 -> 0.418 Notes: 1. Because of the significant difference between the implementations they are split into three files. strcpy-evex.S -> strcpy, stpcpy, strcat strncpy-evex.S -> strncpy strncat-evex.S -> strncat I couldn't find a way to merge them without making the ifdefs incredibly difficult to follow. 2. All implementations can be made evex512 by including "x86-evex512-vecs.h" at the top. 3. All implementations have an optional define: `USE_EVEX_MASKED_STORE` Setting to one uses evex-masked stores for handling short strings. This saves code size and branches. It's disabled for all implementations at the moment as there are some serious drawbacks to masked stores in certain cases, but that may be fixed on future architectures. Full check passes on x86-64 and build succeeds for all ISA levels w/ and w/o multiarch. 2022-11-09 01:38:38 +00:00			`This file is part of the GNU C Library.`

			`The GNU C Library is free software; you can redistribute it and/or`
			`modify it under the terms of the GNU Lesser General Public`
			`License as published by the Free Software Foundation; either`
			`version 2.1 of the License, or (at your option) any later version.`

			`The GNU C Library is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`Lesser General Public License for more details.`

			`You should have received a copy of the GNU Lesser General Public`
			`License along with the GNU C Library; if not, see`
			`<https://www.gnu.org/licenses/>. */`


			`/* NOTE: This file is meant to be included by strcat-evex or`
			`strncat-evex and does not standalone. Before including %rdi`
			`must be saved in %rax. */`


			`/* Simple strlen implementation that ends at`
			`L(strcat_strlen_done). */`
			`vpxorq %VZERO_128, %VZERO_128, %VZERO_128`
			`movq %rdi, %r8`
			`andq $(VEC_SIZE * -1), %r8`
			`VPCMPEQ (%r8), %VZERO, %k0`
			`KMOV %k0, %VRCX`
			`#ifdef USE_AS_WCSCPY`
			`subl %r8d, %edi`
			`shrl $2, %edi`
			`#endif`
			`shrx %VRDI, %VRCX, %VRCX`
			`#ifdef USE_AS_WCSCPY`
			`movq %rax, %rdi`
			`#endif`
			`test %VRCX, %VRCX`
			`jnz L(bsf_and_done_v0)`


			`VPCMPEQ VEC_SIZE(%r8), %VZERO, %k0`
			`KMOV %k0, %VRCX`
			`leaq (VEC_SIZE)(%r8), %rdi`
			`test %VRCX, %VRCX`
			`jnz L(bsf_and_done_v0)`

			`VPCMPEQ (VEC_SIZE * 2)(%r8), %VZERO, %k0`
			`KMOV %k0, %VRCX`
			`test %VRCX, %VRCX`
			`jnz L(bsf_and_done_v1)`

			`VPCMPEQ (VEC_SIZE * 3)(%r8), %VZERO, %k0`
			`KMOV %k0, %VRCX`
			`test %VRCX, %VRCX`
			`jnz L(bsf_and_done_v2)`

			`VPCMPEQ (VEC_SIZE * 4)(%r8), %VZERO, %k0`
			`KMOV %k0, %VRCX`
			`test %VRCX, %VRCX`
			`jnz L(bsf_and_done_v3)`

			`andq $-(VEC_SIZE * 4), %rdi`
			`.p2align 4,, 8`
			`L(loop_2x_vec):`
			`VMOVA (VEC_SIZE * 4)(%rdi), %VMM(0)`
			`VPMIN (VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1)`
			`VMOVA (VEC_SIZE * 6)(%rdi), %VMM(2)`
			`VPMIN (VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3)`
			`VPTESTN %VMM(1), %VMM(1), %k1`
			`VPTESTN %VMM(3), %VMM(3), %k3`
			`subq $(VEC_SIZE * -4), %rdi`
			`KORTEST %k1, %k3`
			`jz L(loop_2x_vec)`

			`VPTESTN %VMM(0), %VMM(0), %k0`
			`KMOV %k0, %VRCX`
			`test %VRCX, %VRCX`
			`jnz L(bsf_and_done_v0)`

			`KMOV %k1, %VRCX`
			`test %VRCX, %VRCX`
			`jnz L(bsf_and_done_v1)`

			`VPTESTN %VMM(2), %VMM(2), %k0`
			`KMOV %k0, %VRCX`
			`test %VRCX, %VRCX`
			`jnz L(bsf_and_done_v2)`

			`KMOV %k3, %VRCX`
			`L(bsf_and_done_v3):`
			`addq $VEC_SIZE, %rdi`
			`L(bsf_and_done_v2):`
			`bsf %VRCX, %VRCX`
			`leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi`
			`jmp L(strcat_strlen_done)`

			`.p2align 4,, 4`
			`L(bsf_and_done_v1):`
			`addq $VEC_SIZE, %rdi`
			`L(bsf_and_done_v0):`
			`bsf %VRCX, %VRCX`
			`#ifdef USE_AS_WCSCPY`
			`leaq (%rdi, %rcx, CHAR_SIZE), %rdi`
			`#else`
			`addq %rcx, %rdi`
			`#endif`
			`L(strcat_strlen_done):`