x86-64: Optimize strlen/strnlen/wcslen/wcsnlen with AVX2
Optimize strlen/strnlen/wcslen/wcsnlen with AVX2 to check 32 bytes with
a single vector compare instruction.  It is as fast as the SSE2 versions
for size <= 16 bytes and up to 1X faster for size > 16 bytes on Haswell.
Select the AVX2 version on AVX2 machines where vzeroupper is preferred
and AVX unaligned load is fast.

NB: It uses TZCNT instead of BSF since TZCNT produces the same result
as BSF for non-zero input.  TZCNT is faster than BSF and is executed
as BSF if the machine doesn't support TZCNT.
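
For reference, the core 32-byte check is equivalent to this C
intrinsics sketch (illustrative only; the first_zero_index32 helper is
not part of the patch):

  #include <immintrin.h>

  /* Return the byte index of the first zero byte in the 32 bytes at P
     (32-byte aligned), or -1 if there is none.  This mirrors the
     VPCMPEQB/VPMOVMSKB/TZCNT sequence used by the new assembly.  */
  static int
  first_zero_index32 (const char *p)
  {
    __m256i v = _mm256_load_si256 ((const __m256i *) p);
    __m256i cmp = _mm256_cmpeq_epi8 (v, _mm256_setzero_si256 ());
    unsigned int mask = _mm256_movemask_epi8 (cmp);
    return mask == 0 ? -1 : __builtin_ctz (mask);
  }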
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strlen-sse2, strnlen-sse2, strlen-avx2, strnlen-avx2,
wcslen-sse2, wcslen-avx2 and wcsnlen-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add tests for __strlen_avx2,
__strlen_sse2, __strnlen_avx2, __strnlen_sse2, __wcslen_avx2,
__wcslen_sse2 and __wcsnlen_avx2.
* sysdeps/x86_64/multiarch/strlen-avx2.S: New file.
* sysdeps/x86_64/multiarch/strlen-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/strlen.c: Likewise.
* sysdeps/x86_64/multiarch/strnlen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/strnlen-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/strnlen.c: Likewise.
* sysdeps/x86_64/multiarch/wcslen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcslen-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/wcslen.c: Likewise.
* sysdeps/x86_64/multiarch/wcsnlen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcsnlen.c (OPTIMIZE (avx2)): New.
(IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX2 machines where
vzeroupper is preferred and AVX unaligned load is fast.
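
The new IFUNC selectors share one shape, roughly the sketch below
(illustrative only; the exact glibc-internal feature-check macros
should be read from the tree, not from this note):

  static inline void *
  IFUNC_SELECTOR (void)
  {
    const struct cpu_features *cpu_features = __get_cpu_features ();

    /* Pick AVX2 only where vzeroupper is preferred and AVX unaligned
       loads are fast; otherwise fall back to SSE2.  */
    if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
        && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
      return OPTIMIZE (avx2);

    return OPTIMIZE (sse2);
  }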
/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
   Copyright (C) 2017-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_avx2
# endif

# ifdef USE_AS_WCSLEN
#  define VPCMPEQ	vpcmpeqd
#  define VPMINU	vpminud
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPMINU	vpminub
# endif
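
/* NB: for the wide-character variants the compare and minimum operate
   on 32-bit elements, but vpmovmskb still yields a byte-granular mask,
   so byte counts are converted to wchar_t counts with "shrq $2" before
   returning.  */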

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# define VEC_SIZE 32

	.section .text.avx,"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check for zero length.  */
	testq	%rsi, %rsi
	jz	L(zero)
#  ifdef USE_AS_WCSLEN
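	/* NB: the maximum length is supplied in wchar_t units;
	   convert it to bytes for the byte-based checks below.  */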
	shl	$2, %rsi
#  endif
	movq	%rsi, %r8
# endif
	movl	%edi, %ecx
	movq	%rdi, %rdx
	vpxor	%xmm0, %xmm0, %xmm0

	/* Check if we may cross page boundary with one vector load.  */
	andl	$(2 * VEC_SIZE - 1), %ecx
	cmpl	$VEC_SIZE, %ecx
	ja	L(cross_page_boundary)
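	/* NB: a VEC_SIZE load from %rdi can cross a page only if
	   %rdi mod 4096 > 4096 - VEC_SIZE, which implies
	   %rdi mod (2 * VEC_SIZE) > VEC_SIZE, so the test above is a
	   cheap, conservative superset of the real page-cross case.  */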

	/* Check the first VEC_SIZE bytes.  */
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax

# ifdef USE_AS_STRNLEN
	jnz	L(first_vec_x0_check)
	/* Adjust length and check the end of data.  */
	subq	$VEC_SIZE, %rsi
	jbe	L(max)
# else
	jnz	L(first_vec_x0)
# endif

	/* Align data for aligned loads in the loop.  */
	addq	$VEC_SIZE, %rdi
	andl	$(VEC_SIZE - 1), %ecx
	andq	$-VEC_SIZE, %rdi

# ifdef USE_AS_STRNLEN
	/* Adjust length.  */
	addq	%rcx, %rsi

	subq	$(VEC_SIZE * 4), %rsi
	jbe	L(last_4x_vec_or_less)
# endif
	jmp	L(more_4x_vec)

	.p2align 4
L(cross_page_boundary):
	andl	$(VEC_SIZE - 1), %ecx
	andq	$-VEC_SIZE, %rdi
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	/* Remove the leading bytes.  */
	sarl	%cl, %eax
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifdef USE_AS_STRNLEN
	/* Check the end of data.  */
	cmpq	%rax, %rsi
	jbe	L(max)
# endif
	addq	%rdi, %rax
	addq	%rcx, %rax
	subq	%rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(aligned_more):
# ifdef USE_AS_STRNLEN
	/* "rcx" is less than VEC_SIZE.  Calculate "rsi + rcx - VEC_SIZE"
	    with "rsi - (VEC_SIZE - rcx)" instead of "(rsi + rcx) - VEC_SIZE"
	    to avoid possible addition overflow.  */
	negq	%rcx
	addq	$VEC_SIZE, %rcx

	/* Check the end of data.  */
	subq	%rcx, %rsi
	jbe	L(max)
# endif

	addq	$VEC_SIZE, %rdi

# ifdef USE_AS_STRNLEN
	subq	$(VEC_SIZE * 4), %rsi
	jbe	L(last_4x_vec_or_less)
# endif

L(more_4x_vec):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)

	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	addq	$(VEC_SIZE * 4), %rdi

# ifdef USE_AS_STRNLEN
	subq	$(VEC_SIZE * 4), %rsi
	jbe	L(last_4x_vec_or_less)
# endif

	/* Align data to 4 * VEC_SIZE.  */
	movq	%rdi, %rcx
	andl	$(4 * VEC_SIZE - 1), %ecx
	andq	$-(4 * VEC_SIZE), %rdi

# ifdef USE_AS_STRNLEN
	/* Adjust length.  */
	addq	%rcx, %rsi
# endif

	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward.  */
	vmovdqa	(%rdi), %ymm1
	vmovdqa	VEC_SIZE(%rdi), %ymm2
	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
	VPMINU	%ymm1, %ymm2, %ymm5
	VPMINU	%ymm3, %ymm4, %ymm6
	VPMINU	%ymm5, %ymm6, %ymm5

	VPCMPEQ	%ymm5, %ymm0, %ymm5
	vpmovmskb %ymm5, %eax
	testl	%eax, %eax
	jnz	L(4x_vec_end)
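	/* NB: the elements are unsigned, so the combined minimum in
	   ymm5 has a zero element exactly where one of ymm1-ymm4 does;
	   a single compare against the zero vector therefore tests all
	   4 * VEC_SIZE bytes, and L(4x_vec_end) rescans the four
	   vectors to locate the first null.  */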

	addq	$(VEC_SIZE * 4), %rdi

# ifndef USE_AS_STRNLEN
	jmp	L(loop_4x_vec)
# else
	subq	$(VEC_SIZE * 4), %rsi
	ja	L(loop_4x_vec)

L(last_4x_vec_or_less):
	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
	addl	$(VEC_SIZE * 2), %esi
	jle	L(last_2x_vec)
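	/* NB: the loop drove %rsi to zero or below; adding back
	   2 * VEC_SIZE leaves it positive iff more than 2 * VEC_SIZE
	   bytes remain before the limit, i.e. up to four vectors must
	   still be scanned rather than two.  */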

	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)

	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax

	jnz	L(first_vec_x2_check)
	subl	$VEC_SIZE, %esi
	jle	L(max)

	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax

	jnz	L(first_vec_x3_check)
	movq	%r8, %rax
#  ifdef USE_AS_WCSLEN
	shrq	$2, %rax
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(last_2x_vec):
	addl	$(VEC_SIZE * 2), %esi
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax

	jnz	L(first_vec_x0_check)
	subl	$VEC_SIZE, %esi
	jle	L(max)

	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)
	movq	%r8, %rax
#  ifdef USE_AS_WCSLEN
	shrq	$2, %rax
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x0_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rsi
	jbe	L(max)
	addq	%rdi, %rax
	subq	%rdx, %rax
#  ifdef USE_AS_WCSLEN
	shrq	$2, %rax
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x1_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rsi
	jbe	L(max)
	addq	$VEC_SIZE, %rax
	addq	%rdi, %rax
	subq	%rdx, %rax
#  ifdef USE_AS_WCSLEN
	shrq	$2, %rax
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x2_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rsi
	jbe	L(max)
	addq	$(VEC_SIZE * 2), %rax
	addq	%rdi, %rax
	subq	%rdx, %rax
#  ifdef USE_AS_WCSLEN
	shrq	$2, %rax
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x3_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rsi
	jbe	L(max)
	addq	$(VEC_SIZE * 3), %rax
	addq	%rdi, %rax
	subq	%rdx, %rax
#  ifdef USE_AS_WCSLEN
	shrq	$2, %rax
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(max):
	movq	%r8, %rax
#  ifdef USE_AS_WCSLEN
	shrq	$2, %rax
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(zero):
	xorl	%eax, %eax
	ret
# endif

	.p2align 4
L(first_vec_x0):
	tzcntl	%eax, %eax
	addq	%rdi, %rax
	subq	%rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	addq	$VEC_SIZE, %rax
	addq	%rdi, %rax
	subq	%rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 2), %rax
	addq	%rdi, %rax
	subq	%rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(4x_vec_end):
	VPCMPEQ	%ymm1, %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)
	VPCMPEQ	%ymm2, %ymm0, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)
	VPCMPEQ	%ymm3, %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)
	VPCMPEQ	%ymm4, %ymm0, %ymm4
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
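	/* NB: no branch is needed after the last test: the loop only
	   reaches L(4x_vec_end) when one of the four vectors holds a
	   null, so if ymm1-ymm3 did not, ymm4 must, and execution
	   falls through to L(first_vec_x3).  */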
L(first_vec_x3):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 3), %rax
	addq	%rdi, %rax
	subq	%rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

END (STRLEN)
#endif