glibc/sysdeps/x86_64/multiarch/strlen-sse2.S

/* strlen optimized with SSE2.
   Copyright (C) 2017-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

/* ISA level >= 2 for both strlen and wcslen.  wcslen uses `pminud`
   which is SSE4.1. strlen doesn't have an ISA level == 2
   implementation so the SSE2 implementation must be built with ISA
   level == 2.  */
# if ISA_SHOULD_BUILD (2)

# include <sysdep.h>

/* Name of the symbol defined by this file.  STRLEN can be defined
   before this point to build the same code under another name.  */
# ifndef STRLEN
#  define STRLEN	__strlen_sse2
# endif

/* Element-size dependent pieces.  With AS_WCSLEN the code works on
   4-byte elements: PMINU and PCMPEQ are the doubleword forms and
   SHIFT_RETURN turns the byte count in %rax into an element count
   (shift right by 2).  Otherwise the byte forms are used and
   SHIFT_RETURN expands to nothing.  */
# ifdef AS_WCSLEN
#  define PMINU		pminud
#  define PCMPEQ		pcmpeqd
#  define SHIFT_RETURN	shrq $2, %rax
# else
#  define PMINU		pminub
#  define PCMPEQ		pcmpeqb
#  define SHIFT_RETURN
# endif

/* SECTION (p) wraps the section name used below; by default the name
   is used unchanged.  */
# ifndef SECTION
#  define SECTION(p)	p
# endif

/* Long-lived registers in strlen (s) and strnlen (s, n):

	%xmm3 - zero
	%rdi  - s
	%r10  - (s+n) & (~(64-1))	(AS_STRNLEN only)
	%r11  - s+n			(AS_STRNLEN only)
*/


	.section SECTION(.text),"ax",@progbits
/* STRLEN: %rdi holds the string pointer s and, in AS_STRNLEN builds,
   %rsi holds the limit n.  The length is returned in %rax.  Besides
   %rax, the code below writes %rcx, %rdx, %rsi, %r8 and %xmm0-%xmm3
   in every configuration, %r10 and %r11 in AS_STRNLEN builds, and
   %xmm4 in builds without AS_STRNLEN.  */
ENTRY(STRLEN)

/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.

   Bit i of %rdx is set when byte i of the block belongs to an element
   that compared equal to zero (with the doubleword PCMPEQ all four
   bits of a matching element are set).

   As used in this file, %xmm0-%xmm3 are zero on entry (they are the
   values the memory is compared against) and %rax is at least 16-byte
   aligned, which the memory operands of PCMPEQ require.

   Clobbers %rsi, %rcx and %r8.  On exit %xmm0-%xmm3 hold the compare
   results, so each of them is still zero only if its 16 bytes
   contained no zero element.  */
# define FIND_ZERO	\
	PCMPEQ	(%rax), %xmm0;	\
	PCMPEQ	16(%rax), %xmm1;	\
	PCMPEQ	32(%rax), %xmm2;	\
	PCMPEQ	48(%rax), %xmm3;	\
	pmovmskb	%xmm0, %esi;	\
	pmovmskb	%xmm1, %edx;	\
	pmovmskb	%xmm2, %r8d;	\
	pmovmskb	%xmm3, %ecx;	\
	salq	$16, %rdx;	\
	salq	$16, %rcx;	\
	orq	%rsi, %rdx;	\
	orq	%r8, %rcx;	\
	salq	$32, %rcx;	\
	orq	%rcx, %rdx;

# ifdef AS_STRNLEN
/* Do not read anything when n==0.  */
	test	%RSI_LP, %RSI_LP
	jne	L(n_nonzero)
	xor	%rax, %rax
	ret
L(n_nonzero):
#  ifdef AS_WCSLEN
/* Check for overflow from maxlen * sizeof(wchar_t). If it would
   overflow the only way this program doesn't have undefined behavior
   is if there is a null terminator in valid memory so wcslen will
   suffice.  */
	mov	%RSI_LP, %R10_LP
	/* Non-zero when either of the two top bits of n is set, i.e. when
	   n * 4 does not fit.  */
	sar	$62, %R10_LP
	/* OVERFLOW_STRLEN is not defined in this file.
	   NOTE(review): presumably supplied by the file that includes this
	   one and names the unbounded wcslen routine -- confirm.  */
	jnz	OVERFLOW_STRLEN
	/* Turn the element count into a byte count.  */
	sal	$2, %RSI_LP
#  endif

/* Initialize long lived registers.  */
	/* %rsi = s + n, the end pointer.  */
	add	%RDI_LP, %RSI_LP
	/* %r10 = end pointer rounded down to a multiple of 64.  */
	mov	%RSI_LP, %R10_LP
	and	$-64, %R10_LP
	/* %r11 = end pointer; %rsi itself is overwritten by FIND_ZERO.  */
	mov	%RSI_LP, %R11_LP
# endif

	/* %xmm0-%xmm3 = 0, the values FIND_ZERO and the loops compare
	   against.  */
	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
	/* %rax = running pointer, %rcx = offset of s inside its 4096-byte
	   page.  */
	movq	%rdi, %rax
	movq	%rdi, %rcx
	andq	$4095, %rcx
/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
	cmpq	$4047, %rcx
/* We cannot unify this branching as it would be ~6 cycles slower.  */
	/* Taken for page offsets 4048-4095.  */
	ja	L(cross_page)

/* STRNLEN_PROLOG rounds %rax down to a multiple of 64 for the main
   loop.  In AS_STRNLEN builds it first computes %rsi = end - %rax
   (using %rax before the rounding) and, when that distance is below
   64, branches to L(strnlen_ret) with %rsi as the bit index of the end
   position in the mask held in %rdx.  */
# ifdef AS_STRNLEN
/* Test if end is among first 64 bytes.  */
#  define STRNLEN_PROLOG	\
	mov	%r11, %rsi;	\
	subq	%rax, %rsi;	\
	andq	$-64, %rax;	\
	testq	$-64, %rsi;	\
	je	L(strnlen_ret)
# else
#  define STRNLEN_PROLOG  andq $-64, %rax;
# endif

/* Ignore bits in mask that come before start of string.

   On entry %rax is s rounded down to the alignment that was scanned
   and %rdx is the mask for the block at %rax, so the xor leaves
   s - %rax in %cl.  After the shift, bit i of %rdx refers to byte
   s + i.  If no bit is left, control goes to L(lab) with %rax rounded
   down to a multiple of 64; otherwise the index of the lowest set bit
   is the result, adjusted by SHIFT_RETURN, and the routine returns.  */
# define PROLOG(lab)	\
	movq	%rdi, %rcx;	\
	xorq	%rax, %rcx;	\
	STRNLEN_PROLOG;	\
	sarq	%cl, %rdx;	\
	test	%rdx, %rdx;	\
	je	L(lab);	\
	bsfq	%rdx, %rax;	\
	SHIFT_RETURN;		\
	ret

/* Start of string when its page offset is at most 4047, so the reads
   below stay inside the page that contains s.  */
# ifdef AS_STRNLEN
	/* Scan the 64 bytes that start at s rounded down to 16.  */
	andq	$-16, %rax
	FIND_ZERO
# else
	/* Test first 16 bytes unaligned.  */
	movdqu	(%rax), %xmm4
	PCMPEQ	%xmm0, %xmm4
	pmovmskb	%xmm4, %edx
	test	%edx, %edx
	je 	L(next48_bytes)
	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
	SHIFT_RETURN
	ret

L(next48_bytes):
/* Same as FIND_ZERO except we do not check first 16 bytes.  */
	/* Bits 0-15 of %rdx therefore stay clear.  The part of that first
	   aligned 16-byte chunk that lies at or after s was already covered
	   by the unaligned test above.  */
	andq	$-16, %rax
	PCMPEQ 16(%rax), %xmm1
	PCMPEQ 32(%rax), %xmm2
	PCMPEQ 48(%rax), %xmm3
	pmovmskb	%xmm1, %edx
	pmovmskb	%xmm2, %r8d
	pmovmskb	%xmm3, %ecx
	salq	$16, %rdx
	salq	$16, %rcx
	orq	%r8, %rcx
	salq	$32, %rcx
	orq	%rcx, %rdx
# endif

	/* When no zero byte is found xmm1-3 are zero so we do not have to
	   zero them.  */
	/* Hence the continuation is L(loop) rather than L(loop_init).  */
	PROLOG(loop)

	/* s has a page offset above 4047.  Scan the 64-byte aligned block
	   that contains s instead; being aligned, it lies inside one page.
	   Bytes of that block before s may match as well: PROLOG shifts
	   their mask bits away, and L(loop_init) clears the compare results
	   they may have left in %xmm1-%xmm3.  */
	.p2align 4
L(cross_page):
	andq	$-64, %rax
	FIND_ZERO
	PROLOG(loop_init)

# ifdef AS_STRNLEN
/* Reached from STRNLEN_PROLOG when the end pointer is less than 64
   bytes past the scanned block's base.  %rsi is that distance, %rdx
   the zero mask and %cl the offset of s in the block.  bts marks the
   end position in the mask so that it acts as a terminator.  */
/* We must do this check to correctly handle strnlen (s, -1).  */
/* NOTE(review): presumably s + n can wrap around, so the bit set by
   bts may lie before s and be shifted out; in that case scanning
   continues at L(loop_init) -- confirm.  */
L(strnlen_ret):
	bts	%rsi, %rdx
	sarq	%cl, %rdx
	test	%rdx, %rdx
	je	L(loop_init)
	bsfq	%rdx, %rax
	SHIFT_RETURN
	ret
# endif
	/* Entry to the main loop for paths where FIND_ZERO may have left
	   non-zero compare results in %xmm1-%xmm3.  %xmm0 is not cleared
	   here: the loops load it and the exits clear it before
	   FIND_ZERO.  */
	.p2align 4
L(loop_init):
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
# ifdef AS_STRNLEN
	/* Bounded main loop: one 64-byte aligned block per iteration.
	   %rdx is zero whenever L(loop) is entered or repeated.  */
	.p2align 4
L(loop):

	addq	$64, %rax
	/* Stop before reading the block that contains the end pointer.  */
	cmpq	%rax, %r10
	je	L(exit_end)

	/* Fold the four 16-byte vectors with an unsigned minimum; an
	   element of the result is zero iff one of the four elements at
	   that position is zero.  %xmm3 is zero.  */
	movdqa	(%rax), %xmm0
	PMINU	16(%rax), %xmm0
	PMINU	32(%rax), %xmm0
	PMINU	48(%rax), %xmm0
	PCMPEQ	%xmm3, %xmm0
	pmovmskb	%xmm0, %edx
	testl	%edx, %edx
	jne	L(exit)
	jmp	L(loop)

	/* %rax == %r10: this is the block that contains the end pointer.
	   %rdx is zero here.  */
	.p2align 4
L(exit_end):
	cmp	%rax, %r11
	je	L(first) /* Do not read when end is at page boundary.  */
	pxor	%xmm0, %xmm0
	FIND_ZERO

	/* Mark the end position (%r11 modulo 64, i.e. its offset in this
	   block) in the mask, so the result is the first zero element or
	   the end, whichever comes first.  */
L(first):
	bts	%r11, %rdx
	bsfq	%rdx, %rdx
	addq	%rdx, %rax
	subq	%rdi, %rax
	SHIFT_RETURN
	ret

	/* The block at %rax contains a zero element.  %xmm0 holds the
	   loop's compare result and is cleared; %xmm1-%xmm3 are still
	   zero, so FIND_ZERO can locate the exact position.  */
	.p2align 4
L(exit):
	pxor	%xmm0, %xmm0
	FIND_ZERO

	/* Length = block address + offset of first zero - s.  */
	bsfq	%rdx, %rdx
	addq	%rdx, %rax
	subq	%rdi, %rax
	SHIFT_RETURN
	ret

# else

	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
	/* On entry %rax is 64-byte aligned and the block at %rax has
	   already been checked.  Each half folds four 16-byte vectors with
	   an unsigned minimum and compares the result with %xmm3, which is
	   zero.  */
	.p2align 4
L(loop):

	/* First half: the block at %rax + 64.  */
	movdqa	64(%rax), %xmm0
	PMINU	80(%rax), %xmm0
	PMINU	96(%rax), %xmm0
	PMINU	112(%rax), %xmm0
	PCMPEQ	%xmm3, %xmm0
	pmovmskb	%xmm0, %edx
	testl	%edx, %edx
	jne	L(exit64)

	/* Advance %rax by 128.
	   NOTE(review): presumably written as a subtraction because -128
	   fits a sign-extended 8-bit immediate while +128 does not --
	   confirm.  */
	subq	$-128, %rax

	/* Second half: the block at the new %rax.  */
	movdqa	(%rax), %xmm0
	PMINU	16(%rax), %xmm0
	PMINU	32(%rax), %xmm0
	PMINU	48(%rax), %xmm0
	PCMPEQ	%xmm3, %xmm0
	pmovmskb	%xmm0, %edx
	testl	%edx, %edx
	jne	L(exit0)
	jmp	L(loop)

	/* Zero found in the first half: make %rax point at that block.  */
	.p2align 4
L(exit64):
	addq	$64, %rax
	/* %rax points at the block with the zero element.  %xmm0 holds the
	   loop's compare result and is cleared; %xmm1-%xmm3 are still
	   zero, so FIND_ZERO can locate the exact position.  */
L(exit0):
	pxor	%xmm0, %xmm0
	FIND_ZERO

	/* Length = block address + offset of first zero - s.  */
	bsfq	%rdx, %rdx
	addq	%rdx, %rax
	subq	%rdi, %rax
	SHIFT_RETURN
	ret

# endif

END(STRLEN)
#endif
x86-64: Optimize strlen/strnlen/wcslen/wcsnlen with AVX2 Optimize strlen/strnlen/wcslen/wcsnlen with AVX2 to check 32 bytes with a single vector compare instruction. It is as fast as SSE2 versions for size <= 16 bytes and up to 1X faster for or size > 16 bytes on Haswell. Select AVX2 version on AVX2 machines where vzeroupper is preferred and AVX unaligned load is fast. NB: It uses TZCNT instead of BSF since TZCNT produces the same result as BSF for non-zero input. TZCNT is faster than BSF and is executed as BSF if machine doesn't support TZCNT. * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add strlen-sse2, strnlen-sse2, strlen-avx2, strnlen-avx2, wcslen-sse2, wcslen-avx2 and wcsnlen-avx2. * sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add tests for __strlen_avx2, __strlen_sse2, __strnlen_avx2, __strnlen_sse2, __wcslen_avx2, __wcslen_sse2 and __wcsnlen_avx2. * sysdeps/x86_64/multiarch/strlen-avx2.S: New file. * sysdeps/x86_64/multiarch/strlen-sse2.S: Likewise. * sysdeps/x86_64/multiarch/strlen.c: Likewise. * sysdeps/x86_64/multiarch/strnlen-avx2.S: Likewise. * sysdeps/x86_64/multiarch/strnlen-sse2.S: Likewise. * sysdeps/x86_64/multiarch/strnlen.c: Likewise. * sysdeps/x86_64/multiarch/wcslen-avx2.S: Likewise. * sysdeps/x86_64/multiarch/wcslen-sse2.S: Likewise. * sysdeps/x86_64/multiarch/wcslen.c: Likewise. * sysdeps/x86_64/multiarch/wcsnlen-avx2.S: Likewise. * sysdeps/x86_64/multiarch/wcsnlen.c (OPTIMIZE (avx2)): New. (IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX2 machines where vzeroupper is preferred and AVX unaligned load is fast. 2017-06-09 12:18:03 +00:00			`/* strlen optimized with SSE2.`
Update copyright dates with scripts/update-copyrights 2023-01-06 21:08:04 +00:00			`Copyright (C) 2017-2023 Free Software Foundation, Inc.`
x86-64: Optimize strlen/strnlen/wcslen/wcsnlen with AVX2 Optimize strlen/strnlen/wcslen/wcsnlen with AVX2 to check 32 bytes with a single vector compare instruction. It is as fast as SSE2 versions for size <= 16 bytes and up to 1X faster for or size > 16 bytes on Haswell. Select AVX2 version on AVX2 machines where vzeroupper is preferred and AVX unaligned load is fast. NB: It uses TZCNT instead of BSF since TZCNT produces the same result as BSF for non-zero input. TZCNT is faster than BSF and is executed as BSF if machine doesn't support TZCNT. * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add strlen-sse2, strnlen-sse2, strlen-avx2, strnlen-avx2, wcslen-sse2, wcslen-avx2 and wcsnlen-avx2. * sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add tests for __strlen_avx2, __strlen_sse2, __strnlen_avx2, __strnlen_sse2, __wcslen_avx2, __wcslen_sse2 and __wcsnlen_avx2. * sysdeps/x86_64/multiarch/strlen-avx2.S: New file. * sysdeps/x86_64/multiarch/strlen-sse2.S: Likewise. * sysdeps/x86_64/multiarch/strlen.c: Likewise. * sysdeps/x86_64/multiarch/strnlen-avx2.S: Likewise. * sysdeps/x86_64/multiarch/strnlen-sse2.S: Likewise. * sysdeps/x86_64/multiarch/strnlen.c: Likewise. * sysdeps/x86_64/multiarch/wcslen-avx2.S: Likewise. * sysdeps/x86_64/multiarch/wcslen-sse2.S: Likewise. * sysdeps/x86_64/multiarch/wcslen.c: Likewise. * sysdeps/x86_64/multiarch/wcsnlen-avx2.S: Likewise. * sysdeps/x86_64/multiarch/wcsnlen.c (OPTIMIZE (avx2)): New. (IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX2 machines where vzeroupper is preferred and AVX unaligned load is fast. 2017-06-09 12:18:03 +00:00			`This file is part of the GNU C Library.`

			`The GNU C Library is free software; you can redistribute it and/or`
			`modify it under the terms of the GNU Lesser General Public`
			`License as published by the Free Software Foundation; either`
			`version 2.1 of the License, or (at your option) any later version.`

			`The GNU C Library is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`Lesser General Public License for more details.`

			`You should have received a copy of the GNU Lesser General Public`
			`License along with the GNU C Library; if not, see`
Prefer https to http for gnu.org and fsf.org URLs Also, change sources.redhat.com to sourceware.org. This patch was automatically generated by running the following shell script, which uses GNU sed, and which avoids modifying files imported from upstream: sed -ri ' s,(http\|ftp)(://(.\.)?(gnu\|fsf\|sourceware)\.org($\|[^.]\|\.[^a-z])),https\2,g s,(http\|ftp)(://(.\.)?)sources\.redhat\.com($\|[^.]\|\.[^a-z]),https\2sourceware.org\4,g ' \ $(find $(git ls-files) -prune -type f \ ! -name '.po' \ ! -name 'ChangeLog' \ ! -path COPYING ! -path COPYING.LIB \ ! -path manual/fdl-1.3.texi ! -path manual/lgpl-2.1.texi \ ! -path manual/texinfo.tex ! -path scripts/config.guess \ ! -path scripts/config.sub ! -path scripts/install-sh \ ! -path scripts/mkinstalldirs ! -path scripts/move-if-change \ ! -path INSTALL ! -path locale/programs/charmap-kw.h \ ! -path po/libc.pot ! -path sysdeps/gnu/errlist.c \ ! '(' -name configure \ -execdir test -f configure.ac -o -f configure.in ';' ')' \ ! '(' -name preconfigure \ -execdir test -f preconfigure.ac ';' ')' \ -print) and then by running 'make dist-prepare' to regenerate files built from the altered files, and then executing the following to cleanup: chmod a+x sysdeps/unix/sysv/linux/riscv/configure # Omit irrelevant whitespace and comment-only changes, # perhaps from a slightly-different Autoconf version. 
git checkout -f \ sysdeps/csky/configure \ sysdeps/hppa/configure \ sysdeps/riscv/configure \ sysdeps/unix/sysv/linux/csky/configure # Omit changes that caused a pre-commit check to fail like this: # remote: * error: sysdeps/powerpc/powerpc64/ppc-mcount.S: trailing lines git checkout -f \ sysdeps/powerpc/powerpc64/ppc-mcount.S \ sysdeps/unix/sysv/linux/s390/s390-64/syscall.S # Omit change that caused a pre-commit check to fail like this: # remote: * error: sysdeps/sparc/sparc64/multiarch/memcpy-ultra3.S: last line does not end in newline git checkout -f sysdeps/sparc/sparc64/multiarch/memcpy-ultra3.S 2019-09-07 05:40:42 +00:00			`<https://www.gnu.org/licenses/>. */`
x86-64: Optimize strlen/strnlen/wcslen/wcsnlen with AVX2 Optimize strlen/strnlen/wcslen/wcsnlen with AVX2 to check 32 bytes with a single vector compare instruction. It is as fast as SSE2 versions for size <= 16 bytes and up to 1X faster for or size > 16 bytes on Haswell. Select AVX2 version on AVX2 machines where vzeroupper is preferred and AVX unaligned load is fast. NB: It uses TZCNT instead of BSF since TZCNT produces the same result as BSF for non-zero input. TZCNT is faster than BSF and is executed as BSF if machine doesn't support TZCNT. * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add strlen-sse2, strnlen-sse2, strlen-avx2, strnlen-avx2, wcslen-sse2, wcslen-avx2 and wcsnlen-avx2. * sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add tests for __strlen_avx2, __strlen_sse2, __strnlen_avx2, __strnlen_sse2, __wcslen_avx2, __wcslen_sse2 and __wcsnlen_avx2. * sysdeps/x86_64/multiarch/strlen-avx2.S: New file. * sysdeps/x86_64/multiarch/strlen-sse2.S: Likewise. * sysdeps/x86_64/multiarch/strlen.c: Likewise. * sysdeps/x86_64/multiarch/strnlen-avx2.S: Likewise. * sysdeps/x86_64/multiarch/strnlen-sse2.S: Likewise. * sysdeps/x86_64/multiarch/strnlen.c: Likewise. * sysdeps/x86_64/multiarch/wcslen-avx2.S: Likewise. * sysdeps/x86_64/multiarch/wcslen-sse2.S: Likewise. * sysdeps/x86_64/multiarch/wcslen.c: Likewise. * sysdeps/x86_64/multiarch/wcsnlen-avx2.S: Likewise. * sysdeps/x86_64/multiarch/wcsnlen.c (OPTIMIZE (avx2)): New. (IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX2 machines where vzeroupper is preferred and AVX unaligned load is fast. 2017-06-09 12:18:03 +00:00
x86: Add support to build strcmp/strlen/strchr with explicit ISA level 1. Add default ISA level selection in non-multiarch/rtld implementations. 2. Add ISA level build guards to different implementations. - I.e strcmp-avx2.S which is ISA level 3 will only build if compiled ISA level <= 3. Otherwise there is no reason to include it as we will always use one of the ISA level 4 implementations (strcmp-evex.S). 3. Refactor the ifunc selector and ifunc implementation list to use the ISA level aware wrapper macros that allow functions below the compiled ISA level (with a guranteed replacement) to be skipped. Tested with and without multiarch on x86_64 for ISA levels: {generic, x86-64-v2, x86-64-v3, x86-64-v4} And m32 with and without multiarch. 2022-07-13 23:32:59 +00:00			`#include <isa-level.h>`
x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S This commit doesn't affect libc.so.6, its just housekeeping to prepare for adding explicit ISA level support. Tested build on x86_64 and x86_32 with/without multiarch. 2022-07-12 19:29:01 +00:00
x86: Add support to build strcmp/strlen/strchr with explicit ISA level 1. Add default ISA level selection in non-multiarch/rtld implementations. 2. Add ISA level build guards to different implementations. - I.e strcmp-avx2.S which is ISA level 3 will only build if compiled ISA level <= 3. Otherwise there is no reason to include it as we will always use one of the ISA level 4 implementations (strcmp-evex.S). 3. Refactor the ifunc selector and ifunc implementation list to use the ISA level aware wrapper macros that allow functions below the compiled ISA level (with a guranteed replacement) to be skipped. Tested with and without multiarch on x86_64 for ISA levels: {generic, x86-64-v2, x86-64-v3, x86-64-v4} And m32 with and without multiarch. 2022-07-13 23:32:59 +00:00			/* ISA level >= 2 for both strlen and wcslen. wcslen uses `pminud`
			`which is SSE4.1. strlen doesn't have an ISA level == 2`
			`implementation so the SSE2 implementation must be built with ISA`
			`level == 2. */`
			`# if ISA_SHOULD_BUILD (2)`
x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S This commit doesn't affect libc.so.6, its just housekeeping to prepare for adding explicit ISA level support. Tested build on x86_64 and x86_32 with/without multiarch. 2022-07-12 19:29:01 +00:00
			`# include <sysdep.h>`

x86: Add support to build strcmp/strlen/strchr with explicit ISA level 1. Add default ISA level selection in non-multiarch/rtld implementations. 2. Add ISA level build guards to different implementations. - I.e strcmp-avx2.S which is ISA level 3 will only build if compiled ISA level <= 3. Otherwise there is no reason to include it as we will always use one of the ISA level 4 implementations (strcmp-evex.S). 3. Refactor the ifunc selector and ifunc implementation list to use the ISA level aware wrapper macros that allow functions below the compiled ISA level (with a guranteed replacement) to be skipped. Tested with and without multiarch on x86_64 for ISA levels: {generic, x86-64-v2, x86-64-v3, x86-64-v4} And m32 with and without multiarch. 2022-07-13 23:32:59 +00:00			`# ifndef STRLEN`
			`# define STRLEN __strlen_sse2`
			`# endif`

x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S This commit doesn't affect libc.so.6, its just housekeeping to prepare for adding explicit ISA level support. Tested build on x86_64 and x86_32 with/without multiarch. 2022-07-12 19:29:01 +00:00			`# ifdef AS_WCSLEN`
			`# define PMINU pminud`
			`# define PCMPEQ pcmpeqd`
			`# define SHIFT_RETURN shrq $2, %rax`
			`# else`
			`# define PMINU pminub`
			`# define PCMPEQ pcmpeqb`
			`# define SHIFT_RETURN`
			`# endif`

			`# ifndef SECTION`
			`# define SECTION(p) p`
			`# endif`

			`/* Long lived register in strlen(s), strnlen(s, n) are:`

			`%xmm3 - zero`
			`%rdi - s`
			`%r10 (s+n) & (~(64-1))`
			`%r11 s+n`
			`*/`


			`.section SECTION(.text),"ax",@progbits`
			`ENTRY(STRLEN)`

			`/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */`
			`# define FIND_ZERO \`
			`PCMPEQ (%rax), %xmm0; \`
			`PCMPEQ 16(%rax), %xmm1; \`
			`PCMPEQ 32(%rax), %xmm2; \`
			`PCMPEQ 48(%rax), %xmm3; \`
			`pmovmskb %xmm0, %esi; \`
			`pmovmskb %xmm1, %edx; \`
			`pmovmskb %xmm2, %r8d; \`
			`pmovmskb %xmm3, %ecx; \`
			`salq $16, %rdx; \`
			`salq $16, %rcx; \`
			`orq %rsi, %rdx; \`
			`orq %r8, %rcx; \`
			`salq $32, %rcx; \`
			`orq %rcx, %rdx;`

			`# ifdef AS_STRNLEN`
			`/* Do not read anything when n==0. */`
			`test %RSI_LP, %RSI_LP`
			`jne L(n_nonzero)`
			`xor %rax, %rax`
			`ret`
			`L(n_nonzero):`
			`# ifdef AS_WCSLEN`
			`/* Check for overflow from maxlen * sizeof(wchar_t). If it would`
			`overflow the only way this program doesn't have undefined behavior`
			`is if there is a null terminator in valid memory so wcslen will`
			`suffice. */`
			`mov %RSI_LP, %R10_LP`
			`sar $62, %R10_LP`
x86: Add support to build strcmp/strlen/strchr with explicit ISA level 1. Add default ISA level selection in non-multiarch/rtld implementations. 2. Add ISA level build guards to different implementations. - I.e strcmp-avx2.S which is ISA level 3 will only build if compiled ISA level <= 3. Otherwise there is no reason to include it as we will always use one of the ISA level 4 implementations (strcmp-evex.S). 3. Refactor the ifunc selector and ifunc implementation list to use the ISA level aware wrapper macros that allow functions below the compiled ISA level (with a guranteed replacement) to be skipped. Tested with and without multiarch on x86_64 for ISA levels: {generic, x86-64-v2, x86-64-v3, x86-64-v4} And m32 with and without multiarch. 2022-07-13 23:32:59 +00:00			`jnz OVERFLOW_STRLEN`
x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S This commit doesn't affect libc.so.6, its just housekeeping to prepare for adding explicit ISA level support. Tested build on x86_64 and x86_32 with/without multiarch. 2022-07-12 19:29:01 +00:00			`sal $2, %RSI_LP`
			`# endif`

			`/* Initialize long lived registers. */`
			`add %RDI_LP, %RSI_LP`
			`mov %RSI_LP, %R10_LP`
			`and $-64, %R10_LP`
			`mov %RSI_LP, %R11_LP`
			`# endif`

			`pxor %xmm0, %xmm0`
			`pxor %xmm1, %xmm1`
			`pxor %xmm2, %xmm2`
			`pxor %xmm3, %xmm3`
			`movq %rdi, %rax`
			`movq %rdi, %rcx`
			`andq $4095, %rcx`
			`/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */`
			`cmpq $4047, %rcx`
			`/* We cannot unify this branching as it would be ~6 cycles slower. */`
			`ja L(cross_page)`

			`# ifdef AS_STRNLEN`
			`/* Test if end is among first 64 bytes. */`
			`# define STRNLEN_PROLOG \`
			`mov %r11, %rsi; \`
			`subq %rax, %rsi; \`
			`andq $-64, %rax; \`
			`testq $-64, %rsi; \`
			`je L(strnlen_ret)`
			`# else`
			`# define STRNLEN_PROLOG andq $-64, %rax;`
			`# endif`

			`/* Ignore bits in mask that come before start of string. */`
			`# define PROLOG(lab) \`
			`movq %rdi, %rcx; \`
			`xorq %rax, %rcx; \`
			`STRNLEN_PROLOG; \`
			`sarq %cl, %rdx; \`
			`test %rdx, %rdx; \`
			`je L(lab); \`
			`bsfq %rdx, %rax; \`
			`SHIFT_RETURN; \`
			`ret`

			`# ifdef AS_STRNLEN`
			`andq $-16, %rax`
			`FIND_ZERO`
			`# else`
			`/* Test first 16 bytes unaligned. */`
			`movdqu (%rax), %xmm4`
			`PCMPEQ %xmm0, %xmm4`
			`pmovmskb %xmm4, %edx`
			`test %edx, %edx`
			`je L(next48_bytes)`
			`bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */`
			`SHIFT_RETURN`
			`ret`

			`L(next48_bytes):`
			`/* Same as FIND_ZERO except we do not check first 16 bytes. */`
			`andq $-16, %rax`
			`PCMPEQ 16(%rax), %xmm1`
			`PCMPEQ 32(%rax), %xmm2`
			`PCMPEQ 48(%rax), %xmm3`
			`pmovmskb %xmm1, %edx`
			`pmovmskb %xmm2, %r8d`
			`pmovmskb %xmm3, %ecx`
			`salq $16, %rdx`
			`salq $16, %rcx`
			`orq %r8, %rcx`
			`salq $32, %rcx`
			`orq %rcx, %rdx`
			`# endif`
x86-64: Optimize strlen/strnlen/wcslen/wcsnlen with AVX2 Optimize strlen/strnlen/wcslen/wcsnlen with AVX2 to check 32 bytes with a single vector compare instruction. It is as fast as SSE2 versions for size <= 16 bytes and up to 1X faster for or size > 16 bytes on Haswell. Select AVX2 version on AVX2 machines where vzeroupper is preferred and AVX unaligned load is fast. NB: It uses TZCNT instead of BSF since TZCNT produces the same result as BSF for non-zero input. TZCNT is faster than BSF and is executed as BSF if machine doesn't support TZCNT. * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add strlen-sse2, strnlen-sse2, strlen-avx2, strnlen-avx2, wcslen-sse2, wcslen-avx2 and wcsnlen-avx2. * sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add tests for __strlen_avx2, __strlen_sse2, __strnlen_avx2, __strnlen_sse2, __wcslen_avx2, __wcslen_sse2 and __wcsnlen_avx2. * sysdeps/x86_64/multiarch/strlen-avx2.S: New file. * sysdeps/x86_64/multiarch/strlen-sse2.S: Likewise. * sysdeps/x86_64/multiarch/strlen.c: Likewise. * sysdeps/x86_64/multiarch/strnlen-avx2.S: Likewise. * sysdeps/x86_64/multiarch/strnlen-sse2.S: Likewise. * sysdeps/x86_64/multiarch/strnlen.c: Likewise. * sysdeps/x86_64/multiarch/wcslen-avx2.S: Likewise. * sysdeps/x86_64/multiarch/wcslen-sse2.S: Likewise. * sysdeps/x86_64/multiarch/wcslen.c: Likewise. * sysdeps/x86_64/multiarch/wcsnlen-avx2.S: Likewise. * sysdeps/x86_64/multiarch/wcsnlen.c (OPTIMIZE (avx2)): New. (IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX2 machines where vzeroupper is preferred and AVX unaligned load is fast. 2017-06-09 12:18:03 +00:00
x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S This commit doesn't affect libc.so.6, its just housekeeping to prepare for adding explicit ISA level support. Tested build on x86_64 and x86_32 with/without multiarch. 2022-07-12 19:29:01 +00:00			`/* When no zero byte is found xmm1-3 are zero so we do not have to`
			`zero them. */`
			`PROLOG(loop)`

			`.p2align 4`
			`L(cross_page):`
			`andq $-64, %rax`
			`FIND_ZERO`
			`PROLOG(loop_init)`

			`# ifdef AS_STRNLEN`
			`/* We must do this check to correctly handle strnlen (s, -1). */`
			`L(strnlen_ret):`
			`bts %rsi, %rdx`
			`sarq %cl, %rdx`
			`test %rdx, %rdx`
			`je L(loop_init)`
			`bsfq %rdx, %rax`
			`SHIFT_RETURN`
			`ret`
			`# endif`
			`.p2align 4`
			`L(loop_init):`
			`pxor %xmm1, %xmm1`
			`pxor %xmm2, %xmm2`
			`pxor %xmm3, %xmm3`
			`# ifdef AS_STRNLEN`
			`.p2align 4`
			`L(loop):`

			`addq $64, %rax`
			`cmpq %rax, %r10`
			`je L(exit_end)`

			`movdqa (%rax), %xmm0`
			`PMINU 16(%rax), %xmm0`
			`PMINU 32(%rax), %xmm0`
			`PMINU 48(%rax), %xmm0`
			`PCMPEQ %xmm3, %xmm0`
			`pmovmskb %xmm0, %edx`
			`testl %edx, %edx`
			`jne L(exit)`
			`jmp L(loop)`

			`.p2align 4`
			`L(exit_end):`
			`cmp %rax, %r11`
			`je L(first) /* Do not read when end is at page boundary. */`
			`pxor %xmm0, %xmm0`
			`FIND_ZERO`

			`L(first):`
			`bts %r11, %rdx`
			`bsfq %rdx, %rdx`
			`addq %rdx, %rax`
			`subq %rdi, %rax`
			`SHIFT_RETURN`
			`ret`

			`.p2align 4`
			`L(exit):`
			`pxor %xmm0, %xmm0`
			`FIND_ZERO`

			`bsfq %rdx, %rdx`
			`addq %rdx, %rax`
			`subq %rdi, %rax`
			`SHIFT_RETURN`
			`ret`

			`# else`

			`/* Main loop. Unrolled twice to improve L2 cache performance on core2. */`
			`.p2align 4`
			`L(loop):`

			`movdqa 64(%rax), %xmm0`
			`PMINU 80(%rax), %xmm0`
			`PMINU 96(%rax), %xmm0`
			`PMINU 112(%rax), %xmm0`
			`PCMPEQ %xmm3, %xmm0`
			`pmovmskb %xmm0, %edx`
			`testl %edx, %edx`
			`jne L(exit64)`

			`subq $-128, %rax`

			`movdqa (%rax), %xmm0`
			`PMINU 16(%rax), %xmm0`
			`PMINU 32(%rax), %xmm0`
			`PMINU 48(%rax), %xmm0`
			`PCMPEQ %xmm3, %xmm0`
			`pmovmskb %xmm0, %edx`
			`testl %edx, %edx`
			`jne L(exit0)`
			`jmp L(loop)`

			`.p2align 4`
			`L(exit64):`
			`addq $64, %rax`
			`L(exit0):`
			`pxor %xmm0, %xmm0`
			`FIND_ZERO`

			`bsfq %rdx, %rdx`
			`addq %rdx, %rax`
			`subq %rdi, %rax`
			`SHIFT_RETURN`
			`ret`

			`# endif`

			`END(STRLEN)`
			`#endif`