glibc/sysdeps/x86_64/multiarch/wcslen-sse2.S


/* wcslen optimized with SSE2.
Copyright (C) 2017-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <isa-level.h>
#if ISA_SHOULD_BUILD (1)
# include <sysdep.h>
# ifndef WCSLEN
#  define WCSLEN	__wcslen_sse2
# endif
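
/* Overview: the terminator is found in three phases.  As a rough C
   sketch of what this routine computes (standard wcslen semantics;
   the function name below is ours, and wchar_t is 4 bytes on this
   ABI):

	#include <stddef.h>

	static size_t
	wcslen_ref (const wchar_t *s)
	{
	  size_t i = 0;
	  while (s[i] != L'\0')
	    i++;
	  return i;
	}

   Phase 1 tests the first 8 wide chars one dword at a time.  Phase 2
   tests twelve 16-byte blocks with PCMPEQD.  Phase 3 loops over
   64-byte blocks, folding four vectors with PMINUB so one compare
   covers 16 wide chars per test.  */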
.text
ENTRY (WCSLEN)
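/* Phase 1: check the first 8 wide chars individually.  A wide char
   is 4 bytes, so a zero dword marks the terminator.  */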
cmpl $0, (%rdi)
jz L(exit_tail0)
cmpl $0, 4(%rdi)
jz L(exit_tail1)
cmpl $0, 8(%rdi)
jz L(exit_tail2)
cmpl $0, 12(%rdi)
jz L(exit_tail3)
cmpl $0, 16(%rdi)
jz L(exit_tail4)
cmpl $0, 20(%rdi)
jz L(exit_tail5)
cmpl $0, 24(%rdi)
jz L(exit_tail6)
cmpl $0, 28(%rdi)
jz L(exit_tail7)
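
/* No terminator in the first 8 wide chars.  Zero %xmm0 for the null
   compares and align %rax down to 16 bytes from %rdi + 32 (this may
   re-test a few chars already checked, which is harmless).  %rdi is
   biased by 16 to cancel the 16 added to %rax after each compare, so
   L(exit) sees %rax - %rdi equal to the byte offset of the matching
   block from the start of the string.  */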
pxor %xmm0, %xmm0
lea 32(%rdi), %rax
addq $16, %rdi
and $-16, %rax
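
/* Phase 2: twelve unrolled 16-byte checks.  PCMPEQD of a block
   against an all-zero register writes the result back into that
   register, and a miss (no zero dword) writes all-zeros, so
   %xmm0-%xmm3 remain zero across misses and need no re-zeroing
   between checks.  */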
pcmpeqd (%rax), %xmm0
pmovmskb %xmm0, %edx
pxor %xmm1, %xmm1
addq $16, %rax
test %edx, %edx
jnz L(exit)
pcmpeqd (%rax), %xmm1
pmovmskb %xmm1, %edx
pxor %xmm2, %xmm2
addq $16, %rax
test %edx, %edx
jnz L(exit)
pcmpeqd (%rax), %xmm2
pmovmskb %xmm2, %edx
pxor %xmm3, %xmm3
addq $16, %rax
test %edx, %edx
jnz L(exit)
pcmpeqd (%rax), %xmm3
pmovmskb %xmm3, %edx
addq $16, %rax
test %edx, %edx
jnz L(exit)
pcmpeqd (%rax), %xmm0
pmovmskb %xmm0, %edx
addq $16, %rax
test %edx, %edx
jnz L(exit)
pcmpeqd (%rax), %xmm1
pmovmskb %xmm1, %edx
addq $16, %rax
test %edx, %edx
jnz L(exit)
pcmpeqd (%rax), %xmm2
pmovmskb %xmm2, %edx
addq $16, %rax
test %edx, %edx
jnz L(exit)
pcmpeqd (%rax), %xmm3
pmovmskb %xmm3, %edx
addq $16, %rax
test %edx, %edx
jnz L(exit)
pcmpeqd (%rax), %xmm0
pmovmskb %xmm0, %edx
addq $16, %rax
test %edx, %edx
jnz L(exit)
pcmpeqd (%rax), %xmm1
pmovmskb %xmm1, %edx
addq $16, %rax
test %edx, %edx
jnz L(exit)
pcmpeqd (%rax), %xmm2
pmovmskb %xmm2, %edx
addq $16, %rax
test %edx, %edx
jnz L(exit)
pcmpeqd (%rax), %xmm3
pmovmskb %xmm3, %edx
addq $16, %rax
test %edx, %edx
jnz L(exit)
and $-0x40, %rax
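
/* Phase 3 main loop: 64 bytes (16 wide chars) per iteration, with
   %rax aligned down to 64.  PMINUB folds the four blocks byte-wise:
   a null wide char in any block forces a zero dword into %xmm2,
   which the PCMPEQD against the all-zero %xmm3 detects.  Zero bytes
   from different blocks can combine into a false positive; the code
   after the loop re-checks each block and falls back into the loop
   if all four miss.  */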
.p2align 4
L(aligned_64_loop):
movaps (%rax), %xmm0
movaps 16(%rax), %xmm1
movaps 32(%rax), %xmm2
movaps 48(%rax), %xmm6
pminub %xmm1, %xmm0
pminub %xmm6, %xmm2
pminub %xmm0, %xmm2
pcmpeqd %xmm3, %xmm2
pmovmskb %xmm2, %edx
addq $64, %rax
test %edx, %edx
jz L(aligned_64_loop)
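
/* Possible hit: re-check the four blocks of this iteration one by
   one.  %rax has already advanced by 64, so the blocks are at -64,
   -48, -32 and -16(%rax); blocks 2 and 4 are still live in %xmm1
   and %xmm6 and are compared in-register.  %rdi is adjusted before
   each test so that L(exit) computes the offset of whichever block
   matched.  */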
pcmpeqd -64(%rax), %xmm3
pmovmskb %xmm3, %edx
addq $48, %rdi
test %edx, %edx
jnz L(exit)
pcmpeqd %xmm1, %xmm3
pmovmskb %xmm3, %edx
addq $-16, %rdi
test %edx, %edx
jnz L(exit)
pcmpeqd -32(%rax), %xmm3
pmovmskb %xmm3, %edx
addq $-16, %rdi
test %edx, %edx
jnz L(exit)
pcmpeqd %xmm6, %xmm3
pmovmskb %xmm3, %edx
addq $-16, %rdi
test %edx, %edx
jz L(aligned_64_loop)
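
/* %rax - %rdi is the byte offset of the matching 16-byte block from
   the start of the string; SHR $2 converts it to a wide-char count.
   %edx holds the PMOVMSKB byte mask of the dword compare: bits 0-3
   mean dword 0, bits 4-7 dword 1, and so on, selecting the final
   0-3 adjustment.  */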
.p2align 4
L(exit):
sub %rdi, %rax
shr $2, %rax
test %dl, %dl
jz L(exit_high)
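/* Low half of the mask: bits 0-3 set means dword 0 (no adjustment),
   otherwise dword 1.  */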
andl $15, %edx
jz L(exit_1)
ret
/* No align here.  Naturally aligned % 16 == 1.  */
L(exit_high):
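/* High half of the mask: bits 8-11 set means dword 2, otherwise
   dword 3.  */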
andl $(15 << 8), %edx
jz L(exit_3)
add $2, %rax
ret
.p2align 3
L(exit_1):
add $1, %rax
ret
.p2align 3
L(exit_3):
add $3, %rax
ret
.p2align 3
L(exit_tail0):
xorl %eax, %eax
ret
.p2align 3
L(exit_tail1):
movl $1, %eax
ret
.p2align 3
L(exit_tail2):
movl $2, %eax
ret
.p2align 3
L(exit_tail3):
movl $3, %eax
ret
.p2align 3
L(exit_tail4):
movl $4, %eax
ret
.p2align 3
L(exit_tail5):
movl $5, %eax
ret
.p2align 3
L(exit_tail6):
movl $6, %eax
ret
.p2align 3
L(exit_tail7):
movl $7, %eax
ret
END (WCSLEN)
#endif