x86-64: Optimize strlen/strnlen/wcslen/wcsnlen with AVX2
Optimize strlen/strnlen/wcslen/wcsnlen with AVX2 to check 32 bytes with
a single vector compare instruction. It is as fast as SSE2 versions for
size <= 16 bytes and up to 1X faster for size > 16 bytes on Haswell.
Select AVX2 version on AVX2 machines where vzeroupper is preferred and
AVX unaligned load is fast.
NB: It uses TZCNT instead of BSF since TZCNT produces the same result
as BSF for non-zero input. TZCNT is faster than BSF and is executed
as BSF if machine doesn't support TZCNT.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strlen-sse2, strnlen-sse2, strlen-avx2, strnlen-avx2,
wcslen-sse2, wcslen-avx2 and wcsnlen-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add tests for __strlen_avx2,
__strlen_sse2, __strnlen_avx2, __strnlen_sse2, __wcslen_avx2,
__wcslen_sse2 and __wcsnlen_avx2.
* sysdeps/x86_64/multiarch/strlen-avx2.S: New file.
* sysdeps/x86_64/multiarch/strlen-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/strlen.c: Likewise.
* sysdeps/x86_64/multiarch/strnlen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/strnlen-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/strnlen.c: Likewise.
* sysdeps/x86_64/multiarch/wcslen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcslen-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/wcslen.c: Likewise.
* sysdeps/x86_64/multiarch/wcsnlen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcsnlen.c (OPTIMIZE (avx2)): New.
(IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX2 machines where
vzeroupper is preferred and AVX unaligned load is fast.
2017-06-09 12:18:03 +00:00
|
|
|
/* strlen optimized with SSE2.
|
2023-01-06 21:08:04 +00:00
|
|
|
Copyright (C) 2017-2023 Free Software Foundation, Inc.
|
x86-64: Optimize strlen/strnlen/wcslen/wcsnlen with AVX2
Optimize strlen/strnlen/wcslen/wcsnlen with AVX2 to check 32 bytes with
a single vector compare instruction. It is as fast as SSE2 versions for
size <= 16 bytes and up to 1X faster for size > 16 bytes on Haswell.
Select AVX2 version on AVX2 machines where vzeroupper is preferred and
AVX unaligned load is fast.
NB: It uses TZCNT instead of BSF since TZCNT produces the same result
as BSF for non-zero input. TZCNT is faster than BSF and is executed
as BSF if machine doesn't support TZCNT.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strlen-sse2, strnlen-sse2, strlen-avx2, strnlen-avx2,
wcslen-sse2, wcslen-avx2 and wcsnlen-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add tests for __strlen_avx2,
__strlen_sse2, __strnlen_avx2, __strnlen_sse2, __wcslen_avx2,
__wcslen_sse2 and __wcsnlen_avx2.
* sysdeps/x86_64/multiarch/strlen-avx2.S: New file.
* sysdeps/x86_64/multiarch/strlen-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/strlen.c: Likewise.
* sysdeps/x86_64/multiarch/strnlen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/strnlen-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/strnlen.c: Likewise.
* sysdeps/x86_64/multiarch/wcslen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcslen-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/wcslen.c: Likewise.
* sysdeps/x86_64/multiarch/wcsnlen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcsnlen.c (OPTIMIZE (avx2)): New.
(IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX2 machines where
vzeroupper is preferred and AVX unaligned load is fast.
2017-06-09 12:18:03 +00:00
|
|
|
This file is part of the GNU C Library.
|
|
|
|
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
|
|
modify it under the terms of the GNU Lesser General Public
|
|
|
|
License as published by the Free Software Foundation; either
|
|
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
Lesser General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
|
|
License along with the GNU C Library; if not, see
|
Prefer https to http for gnu.org and fsf.org URLs
Also, change sources.redhat.com to sourceware.org.
This patch was automatically generated by running the following shell
script, which uses GNU sed, and which avoids modifying files imported
from upstream:
sed -ri '
s,(http|ftp)(://(.*\.)?(gnu|fsf|sourceware)\.org($|[^.]|\.[^a-z])),https\2,g
s,(http|ftp)(://(.*\.)?)sources\.redhat\.com($|[^.]|\.[^a-z]),https\2sourceware.org\4,g
' \
$(find $(git ls-files) -prune -type f \
! -name '*.po' \
! -name 'ChangeLog*' \
! -path COPYING ! -path COPYING.LIB \
! -path manual/fdl-1.3.texi ! -path manual/lgpl-2.1.texi \
! -path manual/texinfo.tex ! -path scripts/config.guess \
! -path scripts/config.sub ! -path scripts/install-sh \
! -path scripts/mkinstalldirs ! -path scripts/move-if-change \
! -path INSTALL ! -path locale/programs/charmap-kw.h \
! -path po/libc.pot ! -path sysdeps/gnu/errlist.c \
! '(' -name configure \
-execdir test -f configure.ac -o -f configure.in ';' ')' \
! '(' -name preconfigure \
-execdir test -f preconfigure.ac ';' ')' \
-print)
and then by running 'make dist-prepare' to regenerate files built
from the altered files, and then executing the following to cleanup:
chmod a+x sysdeps/unix/sysv/linux/riscv/configure
# Omit irrelevant whitespace and comment-only changes,
# perhaps from a slightly-different Autoconf version.
git checkout -f \
sysdeps/csky/configure \
sysdeps/hppa/configure \
sysdeps/riscv/configure \
sysdeps/unix/sysv/linux/csky/configure
# Omit changes that caused a pre-commit check to fail like this:
# remote: *** error: sysdeps/powerpc/powerpc64/ppc-mcount.S: trailing lines
git checkout -f \
sysdeps/powerpc/powerpc64/ppc-mcount.S \
sysdeps/unix/sysv/linux/s390/s390-64/syscall.S
# Omit change that caused a pre-commit check to fail like this:
# remote: *** error: sysdeps/sparc/sparc64/multiarch/memcpy-ultra3.S: last line does not end in newline
git checkout -f sysdeps/sparc/sparc64/multiarch/memcpy-ultra3.S
2019-09-07 05:40:42 +00:00
|
|
|
<https://www.gnu.org/licenses/>. */
|
x86-64: Optimize strlen/strnlen/wcslen/wcsnlen with AVX2
Optimize strlen/strnlen/wcslen/wcsnlen with AVX2 to check 32 bytes with
a single vector compare instruction. It is as fast as SSE2 versions for
size <= 16 bytes and up to 1X faster for size > 16 bytes on Haswell.
Select AVX2 version on AVX2 machines where vzeroupper is preferred and
AVX unaligned load is fast.
NB: It uses TZCNT instead of BSF since TZCNT produces the same result
as BSF for non-zero input. TZCNT is faster than BSF and is executed
as BSF if machine doesn't support TZCNT.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strlen-sse2, strnlen-sse2, strlen-avx2, strnlen-avx2,
wcslen-sse2, wcslen-avx2 and wcsnlen-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add tests for __strlen_avx2,
__strlen_sse2, __strnlen_avx2, __strnlen_sse2, __wcslen_avx2,
__wcslen_sse2 and __wcsnlen_avx2.
* sysdeps/x86_64/multiarch/strlen-avx2.S: New file.
* sysdeps/x86_64/multiarch/strlen-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/strlen.c: Likewise.
* sysdeps/x86_64/multiarch/strnlen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/strnlen-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/strnlen.c: Likewise.
* sysdeps/x86_64/multiarch/wcslen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcslen-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/wcslen.c: Likewise.
* sysdeps/x86_64/multiarch/wcsnlen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcsnlen.c (OPTIMIZE (avx2)): New.
(IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX2 machines where
vzeroupper is preferred and AVX unaligned load is fast.
2017-06-09 12:18:03 +00:00
|
|
|
|
2022-07-13 23:32:59 +00:00
|
|
|
#include <isa-level.h>
|
2022-07-12 19:29:01 +00:00
|
|
|
|
2022-07-13 23:32:59 +00:00
|
|
|
/* ISA level >= 2 for both strlen and wcslen. wcslen uses `pminud`
|
|
|
|
which is SSE4.1. strlen doesn't have an ISA level == 2
|
|
|
|
implementation so the SSE2 implementation must be built with ISA
|
|
|
|
level == 2. */
|
|
|
|
# if ISA_SHOULD_BUILD (2)
|
2022-07-12 19:29:01 +00:00
|
|
|
|
|
|
|
# include <sysdep.h>
|
|
|
|
|
2022-07-13 23:32:59 +00:00
|
|
|
# ifndef STRLEN
|
|
|
|
# define STRLEN __strlen_sse2
|
|
|
|
# endif
|
|
|
|
|
2022-07-12 19:29:01 +00:00
|
|
|
# ifdef AS_WCSLEN
|
|
|
|
# define PMINU pminud
|
|
|
|
# define PCMPEQ pcmpeqd
|
|
|
|
# define SHIFT_RETURN shrq $2, %rax
|
|
|
|
# else
|
|
|
|
# define PMINU pminub
|
|
|
|
# define PCMPEQ pcmpeqb
|
|
|
|
# define SHIFT_RETURN
|
|
|
|
# endif
|
|
|
|
|
|
|
|
# ifndef SECTION
|
|
|
|
# define SECTION(p) p
|
|
|
|
# endif
|
|
|
|
|
|
|
|
/* Long-lived registers in strlen (s) and strnlen (s, n) are:
|
|
|
|
|
|
|
|
%xmm3 - zero
|
|
|
|
%rdi - s
|
|
|
|
%r10 (s+n) & (~(64-1))
|
|
|
|
%r11 s+n
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
.section SECTION(.text),"ax",@progbits
|
|
|
|
ENTRY(STRLEN)
|
|
|
|
|
|
|
|
/* FIND_ZERO: test the 64 bytes at %rax for a zero element and leave the
   result as a 64-bit mask in %rdx.  Bit i of the mask corresponds to byte
   %rax + i and is set when that byte belongs to an element that compared
   equal.  PCMPEQ compares memory against the current contents of
   %xmm0-%xmm3, so those registers have to hold zero on entry for the mask
   to mean "zero found"; they are overwritten with the compare results.
   Also clobbers %rsi, %rcx and %r8.  */
|
|
|
|
# define FIND_ZERO \
|
|
|
|
PCMPEQ (%rax), %xmm0; \
|
|
|
|
PCMPEQ 16(%rax), %xmm1; \
|
|
|
|
PCMPEQ 32(%rax), %xmm2; \
|
|
|
|
PCMPEQ 48(%rax), %xmm3; \
|
|
|
|
pmovmskb %xmm0, %esi; \
|
|
|
|
pmovmskb %xmm1, %edx; \
|
|
|
|
pmovmskb %xmm2, %r8d; \
|
|
|
|
pmovmskb %xmm3, %ecx; \
|
|
|
|
salq $16, %rdx; \
|
|
|
|
salq $16, %rcx; \
|
|
|
|
orq %rsi, %rdx; \
|
|
|
|
orq %r8, %rcx; \
|
|
|
|
salq $32, %rcx; \
|
|
|
|
orq %rcx, %rdx;
|
|
|
|
|
|
|
|
# ifdef AS_STRNLEN
|
|
|
|
/* Length-limited builds only: the limit n arrives in %rsi.  Return 0
   straight away when n == 0 so that no memory is read at all.  */
|
|
|
|
test %RSI_LP, %RSI_LP
|
|
|
|
jne L(n_nonzero)
|
|
|
|
xor %rax, %rax
|
|
|
|
ret
|
|
|
|
L(n_nonzero):
|
|
|
|
# ifdef AS_WCSLEN
|
|
|
|
/* Check for overflow from maxlen * sizeof(wchar_t). If it would
|
|
|
|
overflow the only way this program doesn't have undefined behavior
|
|
|
|
is if there is a null terminator in valid memory so wcslen will
|
|
|
|
suffice. */
|
|
|
|
/* The arithmetic shift leaves a nonzero value when any of the top bits
   that the `sal $2' below would shift out are set.  */
mov %RSI_LP, %R10_LP
|
|
|
|
sar $62, %R10_LP
|
2022-07-13 23:32:59 +00:00
|
|
|
/* NOTE(review): OVERFLOW_STRLEN is defined outside this view; per the
   comment above it is presumably the unbounded wcslen entry point.  */
jnz OVERFLOW_STRLEN
|
2022-07-12 19:29:01 +00:00
|
|
|
/* Convert the limit from wchar_t units to bytes.  */
sal $2, %RSI_LP
|
|
|
|
# endif
|
|
|
|
|
|
|
|
|
|
/* Initialize the long-lived registers: %rsi and %r11 = s + n (the end
   pointer), %r10 = the end pointer rounded down to a 64-byte boundary.  */
|
|
|
|
add %RDI_LP, %RSI_LP
|
|
|
|
mov %RSI_LP, %R10_LP
|
|
|
|
and $-64, %R10_LP
|
|
|
|
mov %RSI_LP, %R11_LP
|
|
|
|
# endif
|
|
|
|
|
|
|
|
/* Zero %xmm0-%xmm3; the PCMPEQ instructions below compare against them
   to detect zero elements.  */
pxor %xmm0, %xmm0
|
|
|
|
pxor %xmm1, %xmm1
|
|
|
|
pxor %xmm2, %xmm2
|
|
|
|
pxor %xmm3, %xmm3
|
|
|
|
/* %rax = s (working pointer); %rcx = offset of s within its 4096-byte
   page.  */
movq %rdi, %rax
|
|
|
|
movq %rdi, %rcx
|
|
|
|
andq $4095, %rcx
|
|
|
|
/* A start offset of at most 4047 rounds down (to 16 bytes) to at most
   4032, so the 64 bytes read from there stay inside the page.  Larger
   offsets take the cross-page path.  */
|
|
|
|
cmpq $4047, %rcx
|
|
|
|
/* We cannot unify this branching as it would be ~6 cycles slower. */
|
|
|
|
ja L(cross_page)
|
|
|
|
|
|
|
|
# ifdef AS_STRNLEN
|
|
|
|
/* STRNLEN_PROLOG (length-limited builds): %rsi = %r11 - %rax, i.e. the
   distance from the current pointer to the end pointer s + n, then align
   %rax down to 64 bytes.  When that distance is below 64 the end may lie
   inside the block that was just scanned, so branch to L(strnlen_ret).  */
|
|
|
|
# define STRNLEN_PROLOG \
|
|
|
|
mov %r11, %rsi; \
|
|
|
|
subq %rax, %rsi; \
|
|
|
|
andq $-64, %rax; \
|
|
|
|
testq $-64, %rsi; \
|
|
|
|
je L(strnlen_ret)
|
|
|
|
# else
|
|
|
|
/* Unbounded builds: only align %rax down to 64 bytes.  */
|
|
|
|
# define STRNLEN_PROLOG andq $-64, %rax;
|
|
|
|
# endif
|
|
|
|
|
|
|
|
/* PROLOG(lab): finish the check of the first block.  On entry %rdx holds
   the zero mask for the aligned block at %rax and %rdi is the start of the
   string.  %rcx = %rdi ^ %rax supplies the shift count (sarq consumes only
   %cl) used to drop the mask bits for bytes that come before the start of
   the string.  If no bit remains, jump to L(lab); otherwise return the
   index of the lowest set bit, adjusted by SHIFT_RETURN.  STRNLEN_PROLOG,
   expanded in between, aligns %rax down to 64 bytes and, in length-limited
   builds, may branch to L(strnlen_ret).  */
|
|
|
|
# define PROLOG(lab) \
|
|
|
|
movq %rdi, %rcx; \
|
|
|
|
xorq %rax, %rcx; \
|
|
|
|
STRNLEN_PROLOG; \
|
|
|
|
sarq %cl, %rdx; \
|
|
|
|
test %rdx, %rdx; \
|
|
|
|
je L(lab); \
|
|
|
|
bsfq %rdx, %rax; \
|
|
|
|
SHIFT_RETURN; \
|
|
|
|
ret
|
|
|
|
|
|
|
|
# ifdef AS_STRNLEN
|
|
|
|
/* Length-limited builds: scan the 64 bytes that start at s rounded down
   to a 16-byte boundary.  */
andq $-16, %rax
|
|
|
|
FIND_ZERO
|
|
|
|
# else
|
|
|
|
/* Test the first 16 bytes with an unaligned load and return at once when
   they contain the terminator.  */
|
|
|
|
movdqu (%rax), %xmm4
|
|
|
|
PCMPEQ %xmm0, %xmm4
|
|
|
|
pmovmskb %xmm4, %edx
|
|
|
|
test %edx, %edx
|
|
|
|
je L(next48_bytes)
|
|
|
|
bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
|
|
|
|
SHIFT_RETURN
|
|
|
|
ret
|
|
|
|
|
|
|
|
|
|
L(next48_bytes):
|
|
|
|
/* Same as FIND_ZERO except that the first 16 bytes are not checked: the
   low 16 bits of the mask in %rdx stay clear.  Clobbers %rcx and %r8.  */
|
|
|
|
andq $-16, %rax
|
|
|
|
PCMPEQ 16(%rax), %xmm1
|
|
|
|
PCMPEQ 32(%rax), %xmm2
|
|
|
|
PCMPEQ 48(%rax), %xmm3
|
|
|
|
pmovmskb %xmm1, %edx
|
|
|
|
pmovmskb %xmm2, %r8d
|
|
|
|
pmovmskb %xmm3, %ecx
|
|
|
|
salq $16, %rdx
|
|
|
|
salq $16, %rcx
|
|
|
|
orq %r8, %rcx
|
|
|
|
salq $32, %rcx
|
|
|
|
orq %rcx, %rdx
|
|
|
|
# endif
|
x86-64: Optimize strlen/strnlen/wcslen/wcsnlen with AVX2
Optimize strlen/strnlen/wcslen/wcsnlen with AVX2 to check 32 bytes with
a single vector compare instruction. It is as fast as SSE2 versions for
size <= 16 bytes and up to 1X faster for size > 16 bytes on Haswell.
Select AVX2 version on AVX2 machines where vzeroupper is preferred and
AVX unaligned load is fast.
NB: It uses TZCNT instead of BSF since TZCNT produces the same result
as BSF for non-zero input. TZCNT is faster than BSF and is executed
as BSF if machine doesn't support TZCNT.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strlen-sse2, strnlen-sse2, strlen-avx2, strnlen-avx2,
wcslen-sse2, wcslen-avx2 and wcsnlen-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add tests for __strlen_avx2,
__strlen_sse2, __strnlen_avx2, __strnlen_sse2, __wcslen_avx2,
__wcslen_sse2 and __wcsnlen_avx2.
* sysdeps/x86_64/multiarch/strlen-avx2.S: New file.
* sysdeps/x86_64/multiarch/strlen-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/strlen.c: Likewise.
* sysdeps/x86_64/multiarch/strnlen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/strnlen-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/strnlen.c: Likewise.
* sysdeps/x86_64/multiarch/wcslen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcslen-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/wcslen.c: Likewise.
* sysdeps/x86_64/multiarch/wcsnlen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcsnlen.c (OPTIMIZE (avx2)): New.
(IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX2 machines where
vzeroupper is preferred and AVX unaligned load is fast.
2017-06-09 12:18:03 +00:00
|
|
|
|
2022-07-12 19:29:01 +00:00
|
|
|
/* When no zero byte is found xmm1-3 are zero so we do not have to
|
|
|
|
zero them. */
|
|
|
|
/* Hence this path enters the main loop directly at L(loop) and skips
   L(loop_init).  */
PROLOG(loop)
|
|
|
|
|
|
|
|
|
|
.p2align 4
|
|
|
|
/* The start lies in the last 48 bytes of a page.  Scan the whole
   64-byte-aligned block containing s instead; PROLOG then discards the
   mask bits for bytes before s.  The compare results left in %xmm1-%xmm3
   may be nonzero here, so the no-match case continues at L(loop_init),
   which clears them again.  */
L(cross_page):
|
|
|
|
andq $-64, %rax
|
|
|
|
FIND_ZERO
|
|
|
|
PROLOG(loop_init)
|
|
|
|
|
|
|
|
# ifdef AS_STRNLEN
|
|
|
|
/* Reached from STRNLEN_PROLOG with %rsi = distance (below 64) from the
   scanned block to the end pointer s + n.  Setting that bit in the mask
   makes the end act as a terminator, so the bsfq below returns whichever
   comes first.  If no bit survives the shift, continue at L(loop_init);
   this check is needed to handle strnlen (s, -1) correctly.  */
|
|
|
|
L(strnlen_ret):
|
|
|
|
bts %rsi, %rdx
|
|
|
|
sarq %cl, %rdx
|
|
|
|
test %rdx, %rdx
|
|
|
|
je L(loop_init)
|
|
|
|
bsfq %rdx, %rax
|
|
|
|
SHIFT_RETURN
|
|
|
|
ret
|
|
|
|
# endif
|
|
|
|
.p2align 4
|
|
|
|
/* Clear %xmm1-%xmm3 again (an earlier FIND_ZERO may have left compare
   results in them), then fall through into L(loop).  */
L(loop_init):
|
|
|
|
pxor %xmm1, %xmm1
|
|
|
|
pxor %xmm2, %xmm2
|
|
|
|
pxor %xmm3, %xmm3
|
|
|
|
# ifdef AS_STRNLEN
|
|
|
|
.p2align 4
|
|
|
|
/* Length-limited main loop, 64 bytes per iteration.  %rax is the current
   64-byte-aligned block, %r10 the aligned block containing the end
   pointer, %xmm3 is zero.  */
L(loop):
|
|
|
|
|
|
|
|
|
|
/* Advance first; stop before reading once the block holding the end
   pointer is reached.  */
addq $64, %rax
|
|
|
|
cmpq %rax, %r10
|
|
|
|
je L(exit_end)
|
|
|
|
|
|
|
|
|
|
/* Fold the four 16-byte chunks with PMINU: a zero element in any chunk
   leaves a zero element in %xmm0, which the compare against %xmm3
   detects.  */
movdqa (%rax), %xmm0
|
|
|
|
PMINU 16(%rax), %xmm0
|
|
|
|
PMINU 32(%rax), %xmm0
|
|
|
|
PMINU 48(%rax), %xmm0
|
|
|
|
PCMPEQ %xmm3, %xmm0
|
|
|
|
pmovmskb %xmm0, %edx
|
|
|
|
testl %edx, %edx
|
|
|
|
jne L(exit)
|
|
|
|
jmp L(loop)
|
|
|
|
|
|
|
|
.p2align 4
|
|
|
|
/* %rax has reached %r10, the 64-byte block containing the end pointer
   (%r11).  If the end pointer equals %rax, nothing in this block is in
   range, so skip the read and go to L(first) with the mask in %rdx as
   left by the caller.  Otherwise build the zero mask for the block.  */
L(exit_end):
|
|
|
|
cmp %rax, %r11
|
|
|
|
je L(first) /* Do not read when end is at page boundary. */
|
|
|
|
pxor %xmm0, %xmm0
|
|
|
|
FIND_ZERO
|
|
|
|
|
|
|
|
|
|
/* Mark the end pointer in the mask.  With a register destination, bts
   takes the bit index modulo 64, which is the end pointer's offset within
   this block.  The lowest set bit is then either the terminator or the
   end; return its distance from s, adjusted by SHIFT_RETURN.  */
L(first):
|
|
|
|
bts %r11, %rdx
|
|
|
|
bsfq %rdx, %rdx
|
|
|
|
addq %rdx, %rax
|
|
|
|
subq %rdi, %rax
|
|
|
|
SHIFT_RETURN
|
|
|
|
ret
|
|
|
|
|
|
|
|
.p2align 4
|
|
|
|
/* The loop found a zero element somewhere in the 64 bytes at %rax.
   Re-zero %xmm0 (the loop used it as scratch), build the per-byte mask and
   return (%rax + index of the lowest set bit) - s.  SHIFT_RETURN converts
   the byte count to wide characters in AS_WCSLEN builds and is empty
   otherwise.  */
L(exit):
|
|
|
|
pxor %xmm0, %xmm0
|
|
|
|
FIND_ZERO
|
|
|
|
|
|
|
|
|
|
bsfq %rdx, %rdx
|
|
|
|
addq %rdx, %rax
|
|
|
|
subq %rdi, %rax
|
|
|
|
SHIFT_RETURN
|
|
|
|
ret
|
|
|
|
|
|
|
|
# else
|
|
|
|
|
|
|
|
/* Main loop for the unbounded builds, 128 bytes per iteration.  Unrolled
   twice to improve L2 cache performance on core2.  %rax is 64-byte
   aligned and %xmm3 is zero throughout.  */
|
|
|
|
.p2align 4
|
|
|
|
L(loop):
|
|
|
|
|
|
|
|
|
|
/* First half: the 64 bytes at %rax + 64.  PMINU folds the four chunks so
   that a zero element in any of them leaves a zero element in %xmm0.  */
movdqa 64(%rax), %xmm0
|
|
|
|
PMINU 80(%rax), %xmm0
|
|
|
|
PMINU 96(%rax), %xmm0
|
|
|
|
PMINU 112(%rax), %xmm0
|
|
|
|
PCMPEQ %xmm3, %xmm0
|
|
|
|
pmovmskb %xmm0, %edx
|
|
|
|
testl %edx, %edx
|
|
|
|
jne L(exit64)
|
|
|
|
|
|
|
|
|
|
/* Advance by 128 bytes; subtracting -128 is used because -128 fits in a
   one-byte immediate while +128 does not.  */
subq $-128, %rax
|
|
|
|
|
|
|
|
|
|
/* Second half: the 64 bytes at the new %rax.  */
movdqa (%rax), %xmm0
|
|
|
|
PMINU 16(%rax), %xmm0
|
|
|
|
PMINU 32(%rax), %xmm0
|
|
|
|
PMINU 48(%rax), %xmm0
|
|
|
|
PCMPEQ %xmm3, %xmm0
|
|
|
|
pmovmskb %xmm0, %edx
|
|
|
|
testl %edx, %edx
|
|
|
|
jne L(exit0)
|
|
|
|
jmp L(loop)
|
|
|
|
|
|
|
|
.p2align 4
|
|
|
|
/* Match in the first half of an iteration: %rax has not been advanced
   yet, so point it at the matching 64-byte block first.  */
L(exit64):
|
|
|
|
addq $64, %rax
|
|
|
|
/* A zero element is somewhere in the 64 bytes at %rax.  Re-zero %xmm0
   (the loop used it as scratch), build the per-byte mask and return
   (%rax + index of the lowest set bit) - s, adjusted by SHIFT_RETURN.  */
L(exit0):
|
|
|
|
pxor %xmm0, %xmm0
|
|
|
|
FIND_ZERO
|
|
|
|
|
|
|
|
|
|
bsfq %rdx, %rdx
|
|
|
|
addq %rdx, %rax
|
|
|
|
subq %rdi, %rax
|
|
|
|
SHIFT_RETURN
|
|
|
|
ret
|
|
|
|
|
|
|
|
# endif
|
|
|
|
|
|
|
|
END(STRLEN)
|
|
|
|
#endif
|