Add x86 SSE strlen.

This commit is contained in:
Ulrich Drepper 2009-08-04 18:15:02 -07:00
parent 02cea47161
commit 2c709c6f05
4 changed files with 166 additions and 6 deletions

View File

@ -1,5 +1,11 @@
2009-08-03 H.J. Lu <hongjiu.lu@intel.com>
* sysdeps/i386/i686/multiarch/strcspn.S: Add comments for no
hidden IFUNC functions.
* sysdeps/i386/i686/multiarch/strspn.S: Likewise.
* sysdeps/i386/i686/multiarch/strlen.S: New file.
* sysdeps/i386/i686/multiarch/Makefile [subdir=string]
(sysdep_routines): Add strcspn-c, strpbrk-c, strspn-c, strstr-c, and
strcasestr-c.

View File

@ -83,9 +83,9 @@ END(STRCSPN)
# define END(name) \
cfi_endproc; .size STRCSPN_IA32, .-STRCSPN_IA32
# undef libc_hidden_builtin_def
/* It doesn't make sense to send libc-internal strcspn calls through a PLT.
The speedup we get from using SSE4.2 instruction is likely eaten away
by the indirect call in the PLT. */
/* IFUNC doesn't work with hidden functions in a shared library, since
they are called without first setting up EBX, which the PLT used by
IFUNC requires. */
# define libc_hidden_builtin_def(name) \
.globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_IA32
#endif

View File

@ -0,0 +1,154 @@
/* Multiple versions of strlen
Copyright (C) 2009 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <sysdep.h>
#include <ifunc-defines.h>
/* Define multiple versions only for the definition in libc and for the
DSO. In static binaries, we need strlen before the initialization
happened. */
#if defined SHARED && !defined NOT_IN_libc
/* PC thunk used by the position-independent code below.  It copies the
   word at the top of the stack -- the return address pushed by the
   `call' that reached it -- into %ebx and returns.  The symbol is global
   but hidden and lives in its own .gnu.linkonce section (presumably so
   that identical copies from several objects collapse into one at link
   time).  */
.section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
.globl __i686.get_pc_thunk.bx
.hidden __i686.get_pc_thunk.bx
.p2align 4
.type __i686.get_pc_thunk.bx,@function
__i686.get_pc_thunk.bx:
movl (%esp), %ebx	/* %ebx = caller's return address.  */
ret
/* Back to the ordinary text section for the code that follows.  */
.text
/* IFUNC resolver for strlen (the symbol has type gnu_indirect_function).

   Out:  %eax = address of the implementation to bind strlen to:
	 __strlen_sse2 if bit 26 of the tested __cpu_features word is set,
	 __strlen_ia32 otherwise.
   %ebx is used as the GOT pointer and is saved/restored around the body.

   Going by the offset names, the tested word is the EDX result of CPUID
   leaf 1 cached in __cpu_features; bit 26 of that register is the SSE2
   feature flag.  */
ENTRY(strlen)
.type strlen, @gnu_indirect_function
pushl %ebx
cfi_adjust_cfa_offset (4)
cfi_rel_offset (ebx, 0)
/* Materialize the GOT address in %ebx: the thunk yields the address of
   the addl, and the addl turns that into the GOT base.  */
call __i686.get_pc_thunk.bx
addl $_GLOBAL_OFFSET_TABLE_, %ebx
/* A zero word at KIND_OFFSET triggers a call to __init_cpu_features;
   presumably zero means "not yet initialized" -- confirm against the
   definition of __cpu_features.  */
cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
jne 1f
call __init_cpu_features
/* Default to the generic version, then upgrade if the feature bit is set.  */
1: leal __strlen_ia32@GOTOFF(%ebx), %eax
testl $(1<<26), CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET+__cpu_features@GOTOFF(%ebx)
jz 2f
leal __strlen_sse2@GOTOFF(%ebx), %eax
2: popl %ebx
cfi_adjust_cfa_offset (-4);
cfi_restore (ebx)
ret
END(strlen)
/* CFI annotations matching a `popl REG': the CFA offset shrinks by the
   4 bytes just popped and REG is marked as restored.  */
#define CFI_POP(REG) \
cfi_adjust_cfa_offset (-4); \
cfi_restore (REG)
/* Function epilogue for code whose only saved register is %esi:
   pop it, emit the matching CFI, and return.  */
#define RETURN popl %esi; CFI_POP (esi); ret
/* size_t __strlen_sse2 (const char *s)

   SSE2 strlen for i386 (AT&T syntax).  The string is examined sixteen
   bytes at a time with pcmpeqb against an all-zero register; pmovmskb
   turns the per-byte result into a bit mask whose lowest set bit marks
   the terminating null.

   In:    4(%esp) on entry = s
   Out:   %eax = number of bytes before the terminating null
   Uses:  %ecx, %edx, %xmm0, flags; %esi is saved and restored.  */
	.text
ENTRY (__strlen_sse2)
	pushl	%esi
	cfi_adjust_cfa_offset (4)
	cfi_rel_offset (esi, 0)

	movl	8(%esp), %eax		/* %eax = s (above saved %esi and return address).  */
	movl	%eax, %ecx
	pxor	%xmm0, %xmm0		/* %xmm0 = sixteen zero bytes.  */
	movl	%eax, %esi		/* %esi = scan pointer.  */
	andl	$15, %ecx		/* %ecx = s mod 16.  */
	jz	.Lsse2_aligned		/* Already on a 16-byte boundary.  */

	/* Misaligned start.  Probe the aligned 16-byte chunk that contains
	   s, then shift the mask right by the misalignment so that matches
	   located before s are discarded and bit 0 corresponds to s[0].  */
	andl	$-16, %esi
	pcmpeqb	(%esi), %xmm0
	leal	16(%eax), %esi		/* Makes the final -16 adjustment yield offset 0 from s.  */
	pmovmskb %xmm0, %edx
	shrl	%cl, %edx
	testl	%edx, %edx
	jnz	.Lsse2_found
	subl	%ecx, %esi		/* No null yet: %esi = next 16-byte boundary after s.  */
	pxor	%xmm0, %xmm0		/* Nulls in front of s may have left 0xff bytes behind.  */

	/* Main loop, unrolled four times.  %esi is 16-byte aligned here.
	   A probe that finds nothing leaves %xmm0 all zero again, so the
	   register needs no re-clearing between probes.  On every exit to
	   .Lsse2_found, %esi is 16 past the chunk that held the null.  */
	.p2align 4
.Lsse2_aligned:
	.rept	3
	pcmpeqb	(%esi), %xmm0		/* 0xff in each byte lane that holds a null.  */
	pmovmskb %xmm0, %edx		/* One mask bit per byte lane.  */
	addl	$16, %esi
	testl	%edx, %edx
	jnz	.Lsse2_found
	.endr
	pcmpeqb	(%esi), %xmm0
	pmovmskb %xmm0, %edx
	addl	$16, %esi
	testl	%edx, %edx
	jz	.Lsse2_aligned

.Lsse2_found:
	negl	%eax
	leal	-16(%eax,%esi), %eax	/* %eax = (chunk start) - s.  */
	bsfl	%edx, %ecx		/* Lowest set bit = index of the null within the chunk.  */
	addl	%ecx, %eax
	RETURN
END (__strlen_sse2)
/* Redirect the entry/exit macros so that the i586 strlen source included
   after this block is assembled under the name __strlen_ia32 instead of
   strlen.  The `name' argument of each macro is deliberately ignored.  */
# undef ENTRY
/* Every line of the definition must end in a continuation.  Without one
   after `.p2align 4' the macro stops there, and the label together with
   cfi_startproc and CALL_MCOUNT is emitted right here, ahead of the
   alignment padding, instead of at the point where ENTRY is used.  */
# define ENTRY(name) \
	.type __strlen_ia32, @function; \
	.globl __strlen_ia32; \
	.p2align 4; \
	__strlen_ia32: cfi_startproc; \
	CALL_MCOUNT
# undef END
# define END(name) \
	cfi_endproc; .size __strlen_ia32, .-__strlen_ia32
# undef libc_hidden_builtin_def
/* IFUNC doesn't work with hidden functions in a shared library, since
   they are called without first setting up EBX, which the PLT used by
   IFUNC requires.  The hidden alias __GI_strlen is therefore bound
   directly to __strlen_ia32.  */
# define libc_hidden_builtin_def(name) \
	.globl __GI_strlen; __GI_strlen = __strlen_ia32
#endif
#include "../../i586/strlen.S"

View File

@ -68,9 +68,9 @@ END(strspn)
# define END(name) \
cfi_endproc; .size __strspn_ia32, .-__strspn_ia32
# undef libc_hidden_builtin_def
/* It doesn't make sense to send libc-internal strspn calls through a PLT.
The speedup we get from using SSE4.2 instruction is likely eaten away
by the indirect call in the PLT. */
/* IFUNC doesn't work with hidden functions in a shared library, since
they are called without first setting up EBX, which the PLT used by
IFUNC requires. */
# define libc_hidden_builtin_def(name) \
.globl __GI_strspn; __GI_strspn = __strspn_ia32
#endif