Optimized wcschr and wcscpy for x86-64 and x86-32

This commit is contained in:
Ulrich Drepper 2011-12-17 14:39:23 -05:00
parent a2d18b64ed
commit 1d3e4b618a
20 changed files with 2466 additions and 13 deletions

View File

@ -1,3 +1,28 @@
2011-11-14 Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
* sysdeps/x86_64/multiarch/Makefile [subdir=wcsmbs] (sysdep_routines):
Add wcscpy-ssse3 wcscpy-c.
* sysdeps/x86_64/multiarch/wcscpy-ssse3.S: New file.
* sysdeps/x86_64/multiarch/wcscpy-c.c: New file.
* sysdeps/x86_64/multiarch/wcscpy.S: New file.
* sysdeps/x86_64/wcschr.S: New file.
* sysdeps/x86_64/wcsrchr.S: New file.
* string/test-strcmp.c: Remove checking of wcscmp function for
wrong alignments.
* sysdeps/i386/i686/multiarch/Makefile [subdir=wcsmbs]
(sysdep_routines): Add wcscpy-ssse3 wcscpy-c wcschr-sse2 wcschr-c
wcsrchr-sse2 wcsrchr-c.
* sysdeps/i386/i686/multiarch/wcschr.S: New file.
* sysdeps/i386/i686/multiarch/wcschr-c.c: New file.
* sysdeps/i386/i686/multiarch/wcschr-sse2.S: New file.
* sysdeps/i386/i686/multiarch/wcsrchr.S: New file.
* sysdeps/i386/i686/multiarch/wcsrchr-c.c: New file.
* sysdeps/i386/i686/multiarch/wcsrchr-sse2.S: New file.
* sysdeps/i386/i686/multiarch/wcscpy.S: New file.
* sysdeps/i386/i686/multiarch/wcscpy-c.c: New file.
* sysdeps/i386/i686/multiarch/wcscpy-ssse3.S: New file.
* wcsmbc/wcschr.c (WCSCHR): New macro.
2011-11-17 Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
* wcsmbs/Makefile (strop-tests): Add wcsrchr wcscpy.

3
NEWS
View File

@ -34,7 +34,8 @@ Version 2.15
* Optimized strchr and strrchr for SSE on x86-32.
Contributed by Liubov Dmitrieva.
* Optimized memchr, memrchr, rawmemchr, memcmp, wmemcmp for x86-64 and x86-32.
* Optimized memchr, memrchr, rawmemchr, memcmp, wmemcmp, wcschr, wcscpy
for x86-64 and x86-32.
Contributed by Liubov Dmitrieva.
* New interfaces: scandirat, scandirat64

View File

@ -221,14 +221,14 @@ do_test (size_t align1, size_t align2, size_t len, int max_char,
static void
do_random_tests (void)
{
for (size_t a = 0; a < CHARBYTES; a += CHARALIGN)
for (size_t b = 0; b < CHARBYTES; b += CHARALIGN)
{
UCHAR *p1 = (UCHAR *) (buf1 + page_size - 512 * CHARBYTES - a);
UCHAR *p2 = (UCHAR *) (buf2 + page_size - 512 * CHARBYTES - b);
UCHAR *p1 = (UCHAR *) (buf1 + page_size - 512 * CHARBYTES);
UCHAR *p2 = (UCHAR *) (buf2 + page_size - 512 * CHARBYTES);
for (size_t n = 0; n < ITERATIONS; n++)
{
/* for wcscmp case align1 and align2 mean here alignment in wchar_t symbols, it
equal 4*k alignment in bytes, we don't check other alignments like for example p1 = (wchar_t *)(buf1 + 1)
because it's wrong using of wchar_t type. */
size_t align1 = random () & 31;
size_t align2;
if (random () & 1)
@ -274,7 +274,7 @@ do_random_tests (void)
}
int result = 0;
MEMCPY ((CHAR *) (p2 + align2), (CHAR *) (p1 + align1), pos);
MEMCPY (p2 + align2, p1 + align1, pos);
if (pos < len1)
{
if (p2[align2 + pos] == p1[align1 + pos])
@ -302,13 +302,12 @@ do_random_tests (void)
|| (r < 0 && result >= 0)
|| (r > 0 && result <= 0))
{
error (0, 0, "Iteration %zd - wrong result in function %s (%zd, %zd, %zd, %zd, %zd) %d != %d, p1 %p p2 %p",
error (0, 0, "Iteration %zd - wrong result in function %s (align in bytes: %zd, align in bytes: %zd, len1: %zd, len2: %zd, pos: %zd) %d != %d, p1 %p p2 %p",
n, impl->name, (size_t) (p1 + align1) & 63, (size_t) (p1 + align2) & 63, len1, len2, pos, r, result, p1, p2);
ret = 1;
}
}
}
}
}
}
static void

View File

@ -37,7 +37,8 @@ endif
ifeq ($(subdir),wcsmbs)
sysdep_routines += wcscmp-sse2 wcscmp-c wcslen-sse2 wcslen-c \
wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcschr-sse2 \
wcschr-c wcsrchr-sse2 wcsrchr-c wcscpy-ssse3 wcscpy-c
endif
ifeq (mathyes,$(subdir)$(config-cflags-avx))

View File

@ -0,0 +1,8 @@
#ifndef NOT_IN_libc
# undef libc_hidden_def
# define libc_hidden_def(name) \
__hidden_ver1 (__wcschr_ia32, __GI_wcschr, __wcschr_ia32);
# define WCSCHR __wcschr_ia32
#endif
#include "wcsmbs/wcschr.c"

View File

@ -0,0 +1,220 @@
/* wcschr with SSE2, without using bsf instructions
Copyright (C) 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#ifndef NOT_IN_libc
# include <sysdep.h>
# define CFI_PUSH(REG) \
cfi_adjust_cfa_offset (4); \
cfi_rel_offset (REG, 0)
# define CFI_POP(REG) \
cfi_adjust_cfa_offset (-4); \
cfi_restore (REG)
# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)
# define PARMS 4
# define STR1 PARMS
# define STR2 STR1+4
atom_text_section
ENTRY (__wcschr_sse2)
mov STR1(%esp), %ecx
movd STR2(%esp), %xmm1
mov %ecx, %eax
punpckldq %xmm1, %xmm1
pxor %xmm2, %xmm2
punpckldq %xmm1, %xmm1
and $63, %eax
cmp $48, %eax
ja L(cross_cache)
movdqu (%ecx), %xmm0
pcmpeqd %xmm0, %xmm2
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %edx
pmovmskb %xmm0, %eax
or %eax, %edx
jnz L(matches)
and $-16, %ecx
jmp L(loop)
.p2align 4
L(cross_cache):
PUSH (%edi)
mov %ecx, %edi
mov %eax, %ecx
and $-16, %edi
and $15, %ecx
movdqa (%edi), %xmm0
pcmpeqd %xmm0, %xmm2
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %edx
pmovmskb %xmm0, %eax
sarl %cl, %edx
sarl %cl, %eax
test %eax, %eax
jz L(unaligned_no_match)
add %edi, %ecx
POP (%edi)
test %edx, %edx
jz L(match_case1)
test %al, %al
jz L(match_higth_case2)
test $15, %al
jnz L(match_case2_4)
test $15, %dl
jnz L(return_null)
lea 4(%ecx), %eax
ret
CFI_PUSH (%edi)
.p2align 4
L(unaligned_no_match):
mov %edi, %ecx
POP (%edi)
test %edx, %edx
jnz L(return_null)
pxor %xmm2, %xmm2
/* Loop start on aligned string. */
.p2align 4
L(loop):
add $16, %ecx
movdqa (%ecx), %xmm0
pcmpeqd %xmm0, %xmm2
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %edx
pmovmskb %xmm0, %eax
or %eax, %edx
jnz L(matches)
add $16, %ecx
movdqa (%ecx), %xmm0
pcmpeqd %xmm0, %xmm2
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %edx
pmovmskb %xmm0, %eax
or %eax, %edx
jnz L(matches)
add $16, %ecx
movdqa (%ecx), %xmm0
pcmpeqd %xmm0, %xmm2
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %edx
pmovmskb %xmm0, %eax
or %eax, %edx
jnz L(matches)
add $16, %ecx
movdqa (%ecx), %xmm0
pcmpeqd %xmm0, %xmm2
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %edx
pmovmskb %xmm0, %eax
or %eax, %edx
jz L(loop)
.p2align 4
L(matches):
pmovmskb %xmm2, %edx
test %eax, %eax
jz L(return_null)
test %edx, %edx
jz L(match_case1)
.p2align 4
L(match_case2):
test %al, %al
jz L(match_higth_case2)
test $15, %al
jnz L(match_case2_4)
test $15, %dl
jnz L(return_null)
lea 4(%ecx), %eax
ret
.p2align 4
L(match_case2_4):
mov %ecx, %eax
ret
.p2align 4
L(match_higth_case2):
test %dl, %dl
jnz L(return_null)
test $15, %ah
jnz L(match_case2_12)
test $15, %dh
jnz L(return_null)
lea 12(%ecx), %eax
ret
.p2align 4
L(match_case2_12):
lea 8(%ecx), %eax
ret
.p2align 4
L(match_case1):
test %al, %al
jz L(match_higth_case1)
test $0x01, %al
jnz L(exit0)
lea 4(%ecx), %eax
ret
.p2align 4
L(match_higth_case1):
test $0x01, %ah
jnz L(exit3)
lea 12(%ecx), %eax
ret
.p2align 4
L(exit0):
mov %ecx, %eax
ret
.p2align 4
L(exit3):
lea 8(%ecx), %eax
ret
.p2align 4
L(return_null):
xor %eax, %eax
ret
END (__wcschr_sse2)
#endif

View File

@ -0,0 +1,54 @@
/* Multiple versions of wcschr
Copyright (C) 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <sysdep.h>
#include <init-arch.h>
#ifndef NOT_IN_libc
.section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
.globl __i686.get_pc_thunk.bx
.hidden __i686.get_pc_thunk.bx
.p2align 4
.type __i686.get_pc_thunk.bx,@function
__i686.get_pc_thunk.bx:
movl (%esp), %ebx
ret
.text
ENTRY(wcschr)
.type wcschr, @gnu_indirect_function
pushl %ebx
cfi_adjust_cfa_offset (4)
cfi_rel_offset (ebx, 0)
call __i686.get_pc_thunk.bx
addl $_GLOBAL_OFFSET_TABLE_, %ebx
cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
jne 1f
call __init_cpu_features
1: leal __wcschr_ia32@GOTOFF(%ebx), %eax
testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
jz 2f
leal __wcschr_sse2@GOTOFF(%ebx), %eax
2: popl %ebx
cfi_adjust_cfa_offset (-4);
cfi_restore (ebx)
ret
END(wcschr)
#endif

View File

@ -0,0 +1,5 @@
#ifndef NOT_IN_libc
# define wcscpy __wcscpy_ia32
#endif
#include "wcsmbs/wcscpy.c"

View File

@ -0,0 +1,621 @@
/* wcscpy with SSSE3
Copyright (C) 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#ifndef NOT_IN_libc
# include <sysdep.h>
# define CFI_PUSH(REG) \
cfi_adjust_cfa_offset (4); \
cfi_rel_offset (REG, 0)
# define CFI_POP(REG) \
cfi_adjust_cfa_offset (-4); \
cfi_restore (REG)
# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)
# define PARMS 4
# define RETURN POP (%edi); ret; CFI_PUSH (%edi)
# define STR1 PARMS
# define STR2 STR1+4
# define LEN STR2+4
atom_text_section
ENTRY (__wcscpy_ssse3)
mov STR1(%esp), %edx
mov STR2(%esp), %ecx
cmp $0, (%ecx)
jz L(ExitTail4)
cmp $0, 4(%ecx)
jz L(ExitTail8)
cmp $0, 8(%ecx)
jz L(ExitTail12)
cmp $0, 12(%ecx)
jz L(ExitTail16)
PUSH (%edi)
mov %edx, %edi
PUSH (%esi)
lea 16(%ecx), %esi
and $-16, %esi
pxor %xmm0, %xmm0
pcmpeqd (%esi), %xmm0
movdqu (%ecx), %xmm1
movdqu %xmm1, (%edx)
pmovmskb %xmm0, %eax
sub %ecx, %esi
test %eax, %eax
jnz L(CopyFrom1To16Bytes)
mov %edx, %eax
lea 16(%edx), %edx
and $-16, %edx
sub %edx, %eax
sub %eax, %ecx
mov %ecx, %eax
and $0xf, %eax
mov $0, %esi
jz L(Align16Both)
cmp $4, %eax
je L(Shl4)
cmp $8, %eax
je L(Shl8)
jmp L(Shl12)
L(Align16Both):
movaps (%ecx), %xmm1
movaps 16(%ecx), %xmm2
movaps %xmm1, (%edx)
pcmpeqd %xmm2, %xmm0
pmovmskb %xmm0, %eax
lea 16(%esi), %esi
test %eax, %eax
jnz L(CopyFrom1To16Bytes)
movaps 16(%ecx, %esi), %xmm3
movaps %xmm2, (%edx, %esi)
pcmpeqd %xmm3, %xmm0
pmovmskb %xmm0, %eax
lea 16(%esi), %esi
test %eax, %eax
jnz L(CopyFrom1To16Bytes)
movaps 16(%ecx, %esi), %xmm4
movaps %xmm3, (%edx, %esi)
pcmpeqd %xmm4, %xmm0
pmovmskb %xmm0, %eax
lea 16(%esi), %esi
test %eax, %eax
jnz L(CopyFrom1To16Bytes)
movaps 16(%ecx, %esi), %xmm1
movaps %xmm4, (%edx, %esi)
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm0, %eax
lea 16(%esi), %esi
test %eax, %eax
jnz L(CopyFrom1To16Bytes)
movaps 16(%ecx, %esi), %xmm2
movaps %xmm1, (%edx, %esi)
pcmpeqd %xmm2, %xmm0
pmovmskb %xmm0, %eax
lea 16(%esi), %esi
test %eax, %eax
jnz L(CopyFrom1To16Bytes)
movaps 16(%ecx, %esi), %xmm3
movaps %xmm2, (%edx, %esi)
pcmpeqd %xmm3, %xmm0
pmovmskb %xmm0, %eax
lea 16(%esi), %esi
test %eax, %eax
jnz L(CopyFrom1To16Bytes)
movaps %xmm3, (%edx, %esi)
mov %ecx, %eax
lea 16(%ecx, %esi), %ecx
and $-0x40, %ecx
sub %ecx, %eax
sub %eax, %edx
mov $-0x40, %esi
L(Aligned64Loop):
movaps (%ecx), %xmm2
movaps 32(%ecx), %xmm3
movaps %xmm2, %xmm4
movaps 16(%ecx), %xmm5
movaps %xmm3, %xmm6
movaps 48(%ecx), %xmm7
pminub %xmm5, %xmm2
pminub %xmm7, %xmm3
pminub %xmm2, %xmm3
lea 64(%edx), %edx
pcmpeqd %xmm0, %xmm3
lea 64(%ecx), %ecx
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(Aligned64Leave)
movaps %xmm4, -64(%edx)
movaps %xmm5, -48(%edx)
movaps %xmm6, -32(%edx)
movaps %xmm7, -16(%edx)
jmp L(Aligned64Loop)
L(Aligned64Leave):
pcmpeqd %xmm4, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(CopyFrom1To16Bytes)
pcmpeqd %xmm5, %xmm0
pmovmskb %xmm0, %eax
movaps %xmm4, -64(%edx)
test %eax, %eax
lea 16(%esi), %esi
jnz L(CopyFrom1To16Bytes)
pcmpeqd %xmm6, %xmm0
pmovmskb %xmm0, %eax
movaps %xmm5, -48(%edx)
test %eax, %eax
lea 16(%esi), %esi
jnz L(CopyFrom1To16Bytes)
movaps %xmm6, -32(%edx)
pcmpeqd %xmm7, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
lea 16(%esi), %esi
jnz L(CopyFrom1To16Bytes)
mov $-0x40, %esi
movaps %xmm7, -16(%edx)
jmp L(Aligned64Loop)
.p2align 4
L(Shl4):
movaps -4(%ecx), %xmm1
movaps 12(%ecx), %xmm2
L(Shl4Start):
pcmpeqd %xmm2, %xmm0
pmovmskb %xmm0, %eax
movaps %xmm2, %xmm3
test %eax, %eax
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
movaps 28(%ecx), %xmm2
pcmpeqd %xmm2, %xmm0
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
movaps %xmm2, %xmm3
test %eax, %eax
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
movaps %xmm2, (%edx)
movaps 28(%ecx), %xmm2
movaps %xmm3, %xmm1
pcmpeqd %xmm2, %xmm0
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
movaps %xmm2, %xmm3
test %eax, %eax
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
movaps 28(%ecx), %xmm2
pcmpeqd %xmm2, %xmm0
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
movaps %xmm2, %xmm3
test %eax, %eax
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
lea 28(%ecx), %ecx
lea 16(%edx), %edx
mov %ecx, %eax
and $-0x40, %ecx
sub %ecx, %eax
lea -12(%ecx), %ecx
sub %eax, %edx
movaps -4(%ecx), %xmm1
L(Shl4LoopStart):
movaps 12(%ecx), %xmm2
movaps 28(%ecx), %xmm3
movaps %xmm3, %xmm6
movaps 44(%ecx), %xmm4
movaps %xmm4, %xmm7
movaps 60(%ecx), %xmm5
pminub %xmm2, %xmm6
pminub %xmm5, %xmm7
pminub %xmm6, %xmm7
pcmpeqd %xmm0, %xmm7
pmovmskb %xmm7, %eax
movaps %xmm5, %xmm7
palignr $4, %xmm4, %xmm5
test %eax, %eax
palignr $4, %xmm3, %xmm4
jnz L(Shl4Start)
palignr $4, %xmm2, %xmm3
lea 64(%ecx), %ecx
palignr $4, %xmm1, %xmm2
movaps %xmm7, %xmm1
movaps %xmm5, 48(%edx)
movaps %xmm4, 32(%edx)
movaps %xmm3, 16(%edx)
movaps %xmm2, (%edx)
lea 64(%edx), %edx
jmp L(Shl4LoopStart)
L(Shl4LoopExit):
movaps (%edx), %xmm6
psrldq $12, %xmm6
palignr $4, %xmm1, %xmm6
movaps %xmm6, (%edx)
add $12, %edx
add $12, %ecx
POP (%esi)
test %al, %al
jz L(ExitHigh)
test $0x01, %al
jnz L(Exit4)
movlpd (%ecx), %xmm0
movlpd %xmm0, (%edx)
movl %edi, %eax
RETURN
CFI_PUSH (%esi)
.p2align 4
L(Shl8):
movaps -8(%ecx), %xmm1
movaps 8(%ecx), %xmm2
L(Shl8Start):
pcmpeqd %xmm2, %xmm0
pmovmskb %xmm0, %eax
movaps %xmm2, %xmm3
test %eax, %eax
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
movaps 24(%ecx), %xmm2
pcmpeqd %xmm2, %xmm0
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
movaps %xmm2, %xmm3
test %eax, %eax
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
movaps %xmm2, (%edx)
movaps 24(%ecx), %xmm2
movaps %xmm3, %xmm1
pcmpeqd %xmm2, %xmm0
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
movaps %xmm2, %xmm3
test %eax, %eax
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
movaps 24(%ecx), %xmm2
pcmpeqd %xmm2, %xmm0
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
movaps %xmm2, %xmm3
test %eax, %eax
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
lea 24(%ecx), %ecx
lea 16(%edx), %edx
mov %ecx, %eax
and $-0x40, %ecx
sub %ecx, %eax
lea -8(%ecx), %ecx
sub %eax, %edx
movaps -8(%ecx), %xmm1
L(Shl8LoopStart):
movaps 8(%ecx), %xmm2
movaps 24(%ecx), %xmm3
movaps %xmm3, %xmm6
movaps 40(%ecx), %xmm4
movaps %xmm4, %xmm7
movaps 56(%ecx), %xmm5
pminub %xmm2, %xmm6
pminub %xmm5, %xmm7
pminub %xmm6, %xmm7
pcmpeqd %xmm0, %xmm7
pmovmskb %xmm7, %eax
movaps %xmm5, %xmm7
palignr $8, %xmm4, %xmm5
test %eax, %eax
palignr $8, %xmm3, %xmm4
jnz L(Shl8Start)
palignr $8, %xmm2, %xmm3
lea 64(%ecx), %ecx
palignr $8, %xmm1, %xmm2
movaps %xmm7, %xmm1
movaps %xmm5, 48(%edx)
movaps %xmm4, 32(%edx)
movaps %xmm3, 16(%edx)
movaps %xmm2, (%edx)
lea 64(%edx), %edx
jmp L(Shl8LoopStart)
L(Shl8LoopExit):
movaps (%edx), %xmm6
psrldq $8, %xmm6
palignr $8, %xmm1, %xmm6
movaps %xmm6, (%edx)
add $8, %edx
add $8, %ecx
POP (%esi)
test %al, %al
jz L(ExitHigh)
test $0x01, %al
jnz L(Exit4)
movlpd (%ecx), %xmm0
movlpd %xmm0, (%edx)
movl %edi, %eax
RETURN
CFI_PUSH (%esi)
.p2align 4
L(Shl12):
movaps -12(%ecx), %xmm1
movaps 4(%ecx), %xmm2
L(Shl12Start):
pcmpeqd %xmm2, %xmm0
pmovmskb %xmm0, %eax
movaps %xmm2, %xmm3
test %eax, %eax
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
movaps 20(%ecx), %xmm2
pcmpeqd %xmm2, %xmm0
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
movaps %xmm2, %xmm3
test %eax, %eax
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
movaps %xmm2, (%edx)
movaps 20(%ecx), %xmm2
movaps %xmm3, %xmm1
pcmpeqd %xmm2, %xmm0
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
movaps %xmm2, %xmm3
test %eax, %eax
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
movaps 20(%ecx), %xmm2
pcmpeqd %xmm2, %xmm0
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
movaps %xmm2, %xmm3
test %eax, %eax
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
lea 20(%ecx), %ecx
lea 16(%edx), %edx
mov %ecx, %eax
and $-0x40, %ecx
sub %ecx, %eax
lea -4(%ecx), %ecx
sub %eax, %edx
movaps -12(%ecx), %xmm1
L(Shl12LoopStart):
movaps 4(%ecx), %xmm2
movaps 20(%ecx), %xmm3
movaps %xmm3, %xmm6
movaps 36(%ecx), %xmm4
movaps %xmm4, %xmm7
movaps 52(%ecx), %xmm5
pminub %xmm2, %xmm6
pminub %xmm5, %xmm7
pminub %xmm6, %xmm7
pcmpeqd %xmm0, %xmm7
pmovmskb %xmm7, %eax
movaps %xmm5, %xmm7
palignr $12, %xmm4, %xmm5
test %eax, %eax
palignr $12, %xmm3, %xmm4
jnz L(Shl12Start)
palignr $12, %xmm2, %xmm3
lea 64(%ecx), %ecx
palignr $12, %xmm1, %xmm2
movaps %xmm7, %xmm1
movaps %xmm5, 48(%edx)
movaps %xmm4, 32(%edx)
movaps %xmm3, 16(%edx)
movaps %xmm2, (%edx)
lea 64(%edx), %edx
jmp L(Shl12LoopStart)
L(Shl12LoopExit):
movaps (%edx), %xmm6
psrldq $4, %xmm6
mov $4, %esi
palignr $12, %xmm1, %xmm6
movaps %xmm6, (%edx)
.p2align 4
L(CopyFrom1To16Bytes):
add %esi, %edx
add %esi, %ecx
POP (%esi)
test %al, %al
jz L(ExitHigh)
test $0x01, %al
jnz L(Exit4)
movlpd (%ecx), %xmm0
movlpd %xmm0, (%edx)
movl %edi, %eax
RETURN
.p2align 4
L(ExitHigh):
test $0x01, %ah
jnz L(Exit12)
movdqu (%ecx), %xmm0
movdqu %xmm0, (%edx)
movl %edi, %eax
RETURN
.p2align 4
L(Exit4):
movl (%ecx), %eax
movl %eax, (%edx)
movl %edi, %eax
RETURN
.p2align 4
L(Exit12):
movlpd (%ecx), %xmm0
movlpd %xmm0, (%edx)
movl 8(%ecx), %eax
movl %eax, 8(%edx)
movl %edi, %eax
RETURN
CFI_POP (%edi)
.p2align 4
L(ExitTail4):
movl (%ecx), %eax
movl %eax, (%edx)
movl %edx, %eax
ret
.p2align 4
L(ExitTail8):
movlpd (%ecx), %xmm0
movlpd %xmm0, (%edx)
movl %edx, %eax
ret
.p2align 4
L(ExitTail12):
movlpd (%ecx), %xmm0
movlpd %xmm0, (%edx)
movl 8(%ecx), %eax
movl %eax, 8(%edx)
movl %edx, %eax
ret
.p2align 4
L(ExitTail16):
movdqu (%ecx), %xmm0
movdqu %xmm0, (%edx)
movl %edx, %eax
ret
END (__wcscpy_ssse3)
#endif

View File

@ -0,0 +1,46 @@
/* Multiple versions of wcscpy
Copyright (C) 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <sysdep.h>
#include <init-arch.h>
/* Define multiple versions only for the definition in libc. */
#ifndef NOT_IN_libc
.text
ENTRY(wcscpy)
.type wcscpy, @gnu_indirect_function
pushl %ebx
cfi_adjust_cfa_offset (4)
cfi_rel_offset (ebx, 0)
call __i686.get_pc_thunk.bx
addl $_GLOBAL_OFFSET_TABLE_, %ebx
cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
jne 1f
call __init_cpu_features
1: leal __wcscpy_ia32@GOTOFF(%ebx), %eax
testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
jz 2f
leal __wcscpy_ssse3@GOTOFF(%ebx), %eax
2: popl %ebx
cfi_adjust_cfa_offset (-4)
cfi_restore (ebx)
ret
END(wcscpy)
#endif

View File

@ -0,0 +1,5 @@
#ifndef NOT_IN_libc
# define wcsrchr __wcsrchr_ia32
#endif
#include "wcsmbs/wcsrchr.c"

View File

@ -0,0 +1,355 @@
/* wcsrchr with SSE2, without using bsf instructions.
Copyright (C) 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#ifndef NOT_IN_libc
# include <sysdep.h>
# define CFI_PUSH(REG) \
cfi_adjust_cfa_offset (4); \
cfi_rel_offset (REG, 0)
# define CFI_POP(REG) \
cfi_adjust_cfa_offset (-4); \
cfi_restore (REG)
# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)
# define PARMS 8
# define ENTRANCE PUSH (%edi);
# define RETURN POP (%edi); ret; CFI_PUSH (%edi);
# define STR1 PARMS
# define STR2 STR1+4
atom_text_section
ENTRY (__wcsrchr_sse2)
ENTRANCE
mov STR1(%esp), %ecx
movd STR2(%esp), %xmm1
mov %ecx, %edi
punpckldq %xmm1, %xmm1
pxor %xmm2, %xmm2
punpckldq %xmm1, %xmm1
/* ECX has OFFSET. */
and $63, %ecx
cmp $48, %ecx
ja L(crosscache)
/* unaligned string. */
movdqu (%edi), %xmm0
pcmpeqd %xmm0, %xmm2
pcmpeqd %xmm1, %xmm0
/* Find where NULL is. */
pmovmskb %xmm2, %ecx
/* Check if there is a match. */
pmovmskb %xmm0, %eax
add $16, %edi
test %eax, %eax
jnz L(unaligned_match1)
test %ecx, %ecx
jnz L(return_null)
and $-16, %edi
PUSH (%esi)
xor %edx, %edx
jmp L(loop)
CFI_POP (%esi)
.p2align 4
L(unaligned_match1):
test %ecx, %ecx
jnz L(prolog_find_zero_1)
PUSH (%esi)
/* Save current match */
mov %eax, %edx
mov %edi, %esi
and $-16, %edi
jmp L(loop)
CFI_POP (%esi)
.p2align 4
L(crosscache):
/* Hancle unaligned string. */
and $15, %ecx
and $-16, %edi
pxor %xmm3, %xmm3
movdqa (%edi), %xmm0
pcmpeqd %xmm0, %xmm3
pcmpeqd %xmm1, %xmm0
/* Find where NULL is. */
pmovmskb %xmm3, %edx
/* Check if there is a match. */
pmovmskb %xmm0, %eax
/* Remove the leading bytes. */
shr %cl, %edx
shr %cl, %eax
add $16, %edi
test %eax, %eax
jnz L(unaligned_match)
test %edx, %edx
jnz L(return_null)
PUSH (%esi)
xor %edx, %edx
jmp L(loop)
CFI_POP (%esi)
.p2align 4
L(unaligned_match):
test %edx, %edx
jnz L(prolog_find_zero)
PUSH (%esi)
mov %eax, %edx
lea (%edi, %ecx), %esi
/* Loop start on aligned string. */
.p2align 4
L(loop):
movdqa (%edi), %xmm0
pcmpeqd %xmm0, %xmm2
add $16, %edi
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %ecx
pmovmskb %xmm0, %eax
or %eax, %ecx
jnz L(matches)
movdqa (%edi), %xmm3
pcmpeqd %xmm3, %xmm2
add $16, %edi
pcmpeqd %xmm1, %xmm3
pmovmskb %xmm2, %ecx
pmovmskb %xmm3, %eax
or %eax, %ecx
jnz L(matches)
movdqa (%edi), %xmm4
pcmpeqd %xmm4, %xmm2
add $16, %edi
pcmpeqd %xmm1, %xmm4
pmovmskb %xmm2, %ecx
pmovmskb %xmm4, %eax
or %eax, %ecx
jnz L(matches)
movdqa (%edi), %xmm5
pcmpeqd %xmm5, %xmm2
add $16, %edi
pcmpeqd %xmm1, %xmm5
pmovmskb %xmm2, %ecx
pmovmskb %xmm5, %eax
or %eax, %ecx
jz L(loop)
.p2align 4
L(matches):
test %eax, %eax
jnz L(match)
L(return_value):
test %edx, %edx
jz L(return_null_1)
mov %edx, %eax
mov %esi, %edi
POP (%esi)
test %ah, %ah
jnz L(match_third_or_fourth_wchar)
test $15 << 4, %al
jnz L(match_second_wchar)
lea -16(%edi), %eax
RETURN
CFI_PUSH (%esi)
.p2align 4
L(return_null_1):
POP (%esi)
xor %eax, %eax
RETURN
CFI_PUSH (%esi)
.p2align 4
L(match):
pmovmskb %xmm2, %ecx
test %ecx, %ecx
jnz L(find_zero)
/* save match info */
mov %eax, %edx
mov %edi, %esi
jmp L(loop)
.p2align 4
L(find_zero):
test %cl, %cl
jz L(find_zero_in_third_or_fourth_wchar)
test $15, %cl
jz L(find_zero_in_second_wchar)
and $1, %eax
jz L(return_value)
POP (%esi)
lea -16(%edi), %eax
RETURN
CFI_PUSH (%esi)
.p2align 4
L(find_zero_in_second_wchar):
and $1 << 5 - 1, %eax
jz L(return_value)
POP (%esi)
test $15 << 4, %al
jnz L(match_second_wchar)
lea -16(%edi), %eax
RETURN
CFI_PUSH (%esi)
.p2align 4
L(find_zero_in_third_or_fourth_wchar):
test $15, %ch
jz L(find_zero_in_fourth_wchar)
and $1 << 9 - 1, %eax
jz L(return_value)
POP (%esi)
test %ah, %ah
jnz L(match_third_wchar)
test $15 << 4, %al
jnz L(match_second_wchar)
lea -16(%edi), %eax
RETURN
CFI_PUSH (%esi)
.p2align 4
L(find_zero_in_fourth_wchar):
POP (%esi)
test %ah, %ah
jnz L(match_third_or_fourth_wchar)
test $15 << 4, %al
jnz L(match_second_wchar)
lea -16(%edi), %eax
RETURN
CFI_PUSH (%esi)
.p2align 4
L(match_second_wchar):
lea -12(%edi), %eax
RETURN
.p2align 4
L(match_third_or_fourth_wchar):
test $15 << 4, %ah
jnz L(match_fourth_wchar)
lea -8(%edi), %eax
RETURN
.p2align 4
L(match_third_wchar):
lea -8(%edi), %eax
RETURN
.p2align 4
L(match_fourth_wchar):
lea -4(%edi), %eax
RETURN
.p2align 4
L(return_null):
xor %eax, %eax
RETURN
.p2align 4
L(prolog_find_zero):
add %ecx, %edi
mov %edx, %ecx
L(prolog_find_zero_1):
test %cl, %cl
jz L(prolog_find_zero_in_third_or_fourth_wchar)
test $15, %cl
jz L(prolog_find_zero_in_second_wchar)
and $1, %eax
jz L(return_null)
lea -16(%edi), %eax
RETURN
.p2align 4
L(prolog_find_zero_in_second_wchar):
and $1 << 5 - 1, %eax
jz L(return_null)
test $15 << 4, %al
jnz L(match_second_wchar)
lea -16(%edi), %eax
RETURN
.p2align 4
L(prolog_find_zero_in_third_or_fourth_wchar):
test $15, %ch
jz L(prolog_find_zero_in_fourth_wchar)
and $1 << 9 - 1, %eax
jz L(return_null)
test %ah, %ah
jnz L(match_third_wchar)
test $15 << 4, %al
jnz L(match_second_wchar)
lea -16(%edi), %eax
RETURN
.p2align 4
L(prolog_find_zero_in_fourth_wchar):
test %ah, %ah
jnz L(match_third_or_fourth_wchar)
test $15 << 4, %al
jnz L(match_second_wchar)
lea -16(%edi), %eax
RETURN
END (__wcsrchr_sse2)
#endif

View File

@ -0,0 +1,54 @@
/* Multiple versions of wcsrchr
Copyright (C) 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <sysdep.h>
#include <init-arch.h>
#ifndef NOT_IN_libc
.section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
.globl __i686.get_pc_thunk.bx
.hidden __i686.get_pc_thunk.bx
.p2align 4
.type __i686.get_pc_thunk.bx,@function
__i686.get_pc_thunk.bx:
movl (%esp), %ebx
ret
.text
ENTRY(wcsrchr)
.type wcsrchr, @gnu_indirect_function
pushl %ebx
cfi_adjust_cfa_offset (4)
cfi_rel_offset (ebx, 0)
call __i686.get_pc_thunk.bx
addl $_GLOBAL_OFFSET_TABLE_, %ebx
cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
jne 1f
call __init_cpu_features
1: leal __wcsrchr_ia32@GOTOFF(%ebx), %eax
testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
jz 2f
leal __wcsrchr_sse2@GOTOFF(%ebx), %eax
2: popl %ebx
cfi_adjust_cfa_offset (-4);
cfi_restore (ebx)
ret
END(wcsrchr)
#endif

View File

@ -16,7 +16,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strcat-sse2-unaligned strncat-sse2-unaligned \
strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
memcmp-ssse3
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
CFLAGS-varshift.c += -msse4
@ -28,3 +28,7 @@ CFLAGS-strcasestr.c += -msse4
CFLAGS-strcasestr-nonascii.c += -msse4
endif
endif
ifeq ($(subdir),wcsmbs)
sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
endif

View File

@ -0,0 +1,5 @@
#ifndef NOT_IN_libc
# define wcscpy __wcscpy_sse2
#endif
#include "wcsmbs/wcscpy.c"

View File

@ -0,0 +1,566 @@
/* wcscpy with SSSE3
Copyright (C) 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#ifndef NOT_IN_libc
# include <sysdep.h>
.text
ENTRY (__wcscpy_ssse3)
mov %rsi, %rcx
mov %rdi, %rdx
cmpl $0, (%rcx)
jz L(Exit4)
cmpl $0, 4(%rcx)
jz L(Exit8)
cmpl $0, 8(%rcx)
jz L(Exit12)
cmpl $0, 12(%rcx)
jz L(Exit16)
lea 16(%rcx), %rsi
and $-16, %rsi
pxor %xmm0, %xmm0
mov (%rcx), %r9
mov %r9, (%rdx)
pcmpeqd (%rsi), %xmm0
mov 8(%rcx), %r9
mov %r9, 8(%rdx)
pmovmskb %xmm0, %rax
sub %rcx, %rsi
test %rax, %rax
jnz L(CopyFrom1To16Bytes)
mov %rdx, %rax
lea 16(%rdx), %rdx
and $-16, %rdx
sub %rdx, %rax
sub %rax, %rcx
mov %rcx, %rax
and $0xf, %rax
mov $0, %rsi
/* case: rcx_offset == rdx_offset */
jz L(Align16Both)
cmp $4, %rax
je L(Shl4)
cmp $8, %rax
je L(Shl8)
jmp L(Shl12)
L(Align16Both):
movaps (%rcx), %xmm1
movaps 16(%rcx), %xmm2
movaps %xmm1, (%rdx)
pcmpeqd %xmm2, %xmm0
pmovmskb %xmm0, %rax
lea 16(%rsi), %rsi
test %rax, %rax
jnz L(CopyFrom1To16Bytes)
movaps 16(%rcx, %rsi), %xmm3
movaps %xmm2, (%rdx, %rsi)
pcmpeqd %xmm3, %xmm0
pmovmskb %xmm0, %rax
lea 16(%rsi), %rsi
test %rax, %rax
jnz L(CopyFrom1To16Bytes)
movaps 16(%rcx, %rsi), %xmm4
movaps %xmm3, (%rdx, %rsi)
pcmpeqd %xmm4, %xmm0
pmovmskb %xmm0, %rax
lea 16(%rsi), %rsi
test %rax, %rax
jnz L(CopyFrom1To16Bytes)
movaps 16(%rcx, %rsi), %xmm1
movaps %xmm4, (%rdx, %rsi)
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm0, %rax
lea 16(%rsi), %rsi
test %rax, %rax
jnz L(CopyFrom1To16Bytes)
movaps 16(%rcx, %rsi), %xmm2
movaps %xmm1, (%rdx, %rsi)
pcmpeqd %xmm2, %xmm0
pmovmskb %xmm0, %rax
lea 16(%rsi), %rsi
test %rax, %rax
jnz L(CopyFrom1To16Bytes)
movaps 16(%rcx, %rsi), %xmm3
movaps %xmm2, (%rdx, %rsi)
pcmpeqd %xmm3, %xmm0
pmovmskb %xmm0, %rax
lea 16(%rsi), %rsi
test %rax, %rax
jnz L(CopyFrom1To16Bytes)
movaps %xmm3, (%rdx, %rsi)
mov %rcx, %rax
lea 16(%rcx, %rsi), %rcx
and $-0x40, %rcx
sub %rcx, %rax
sub %rax, %rdx
mov $-0x40, %rsi
L(Aligned64Loop):
movaps (%rcx), %xmm2
movaps %xmm2, %xmm4
movaps 16(%rcx), %xmm5
movaps 32(%rcx), %xmm3
movaps %xmm3, %xmm6
movaps 48(%rcx), %xmm7
pminub %xmm5, %xmm2
pminub %xmm7, %xmm3
pminub %xmm2, %xmm3
pcmpeqd %xmm0, %xmm3
pmovmskb %xmm3, %rax
lea 64(%rdx), %rdx
lea 64(%rcx), %rcx
test %rax, %rax
jnz L(Aligned64Leave)
movaps %xmm4, -64(%rdx)
movaps %xmm5, -48(%rdx)
movaps %xmm6, -32(%rdx)
movaps %xmm7, -16(%rdx)
jmp L(Aligned64Loop)
L(Aligned64Leave):
pcmpeqd %xmm4, %xmm0
pmovmskb %xmm0, %rax
test %rax, %rax
jnz L(CopyFrom1To16Bytes)
pcmpeqd %xmm5, %xmm0
pmovmskb %xmm0, %rax
movaps %xmm4, -64(%rdx)
test %rax, %rax
lea 16(%rsi), %rsi
jnz L(CopyFrom1To16Bytes)
pcmpeqd %xmm6, %xmm0
pmovmskb %xmm0, %rax
movaps %xmm5, -48(%rdx)
test %rax, %rax
lea 16(%rsi), %rsi
jnz L(CopyFrom1To16Bytes)
movaps %xmm6, -32(%rdx)
pcmpeqd %xmm7, %xmm0
pmovmskb %xmm0, %rax
lea 16(%rsi), %rsi
test %rax, %rax
jnz L(CopyFrom1To16Bytes)
mov $-0x40, %rsi
movaps %xmm7, -16(%rdx)
jmp L(Aligned64Loop)
.p2align 4
L(Shl4):
movaps -4(%rcx), %xmm1
movaps 12(%rcx), %xmm2
L(Shl4Start):
pcmpeqd %xmm2, %xmm0
pmovmskb %xmm0, %rax
movaps %xmm2, %xmm3
test %rax, %rax
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
pcmpeqd %xmm2, %xmm0
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
movaps %xmm2, %xmm3
test %rax, %rax
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
movaps %xmm3, %xmm1
pcmpeqd %xmm2, %xmm0
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
movaps %xmm2, %xmm3
test %rax, %rax
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
pcmpeqd %xmm2, %xmm0
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
test %rax, %rax
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
lea 28(%rcx), %rcx
lea 16(%rdx), %rdx
mov %rcx, %rax
and $-0x40, %rcx
sub %rcx, %rax
lea -12(%rcx), %rcx
sub %rax, %rdx
movaps -4(%rcx), %xmm1
L(Shl4LoopStart):
movaps 12(%rcx), %xmm2
movaps 28(%rcx), %xmm3
movaps %xmm3, %xmm6
movaps 44(%rcx), %xmm4
movaps %xmm4, %xmm7
movaps 60(%rcx), %xmm5
pminub %xmm2, %xmm6
pminub %xmm5, %xmm7
pminub %xmm6, %xmm7
pcmpeqd %xmm0, %xmm7
pmovmskb %xmm7, %rax
movaps %xmm5, %xmm7
palignr $4, %xmm4, %xmm5
test %rax, %rax
palignr $4, %xmm3, %xmm4
jnz L(Shl4Start)
palignr $4, %xmm2, %xmm3
lea 64(%rcx), %rcx
palignr $4, %xmm1, %xmm2
movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
movaps %xmm4, 32(%rdx)
movaps %xmm3, 16(%rdx)
movaps %xmm2, (%rdx)
lea 64(%rdx), %rdx
jmp L(Shl4LoopStart)
L(Shl4LoopExit):
movaps (%rdx), %xmm6
psrldq $12, %xmm6
mov $12, %rsi
palignr $4, %xmm1, %xmm6
movaps %xmm6, (%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
L(Shl8):
movaps -8(%rcx), %xmm1
movaps 8(%rcx), %xmm2
L(Shl8Start):
pcmpeqd %xmm2, %xmm0
pmovmskb %xmm0, %rax
movaps %xmm2, %xmm3
test %rax, %rax
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
pcmpeqd %xmm2, %xmm0
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
movaps %xmm2, %xmm3
test %rax, %rax
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
movaps %xmm3, %xmm1
pcmpeqd %xmm2, %xmm0
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
movaps %xmm2, %xmm3
test %rax, %rax
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
pcmpeqd %xmm2, %xmm0
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
movaps %xmm2, %xmm3
test %rax, %rax
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
lea 24(%rcx), %rcx
lea 16(%rdx), %rdx
mov %rcx, %rax
and $-0x40, %rcx
sub %rcx, %rax
lea -8(%rcx), %rcx
sub %rax, %rdx
movaps -8(%rcx), %xmm1
L(Shl8LoopStart):
movaps 8(%rcx), %xmm2
movaps 24(%rcx), %xmm3
movaps %xmm3, %xmm6
movaps 40(%rcx), %xmm4
movaps %xmm4, %xmm7
movaps 56(%rcx), %xmm5
pminub %xmm2, %xmm6
pminub %xmm5, %xmm7
pminub %xmm6, %xmm7
pcmpeqd %xmm0, %xmm7
pmovmskb %xmm7, %rax
movaps %xmm5, %xmm7
palignr $8, %xmm4, %xmm5
test %rax, %rax
palignr $8, %xmm3, %xmm4
jnz L(Shl8Start)
palignr $8, %xmm2, %xmm3
lea 64(%rcx), %rcx
palignr $8, %xmm1, %xmm2
movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
movaps %xmm4, 32(%rdx)
movaps %xmm3, 16(%rdx)
movaps %xmm2, (%rdx)
lea 64(%rdx), %rdx
jmp L(Shl8LoopStart)
L(Shl8LoopExit):
movaps (%rdx), %xmm6
psrldq $8, %xmm6
mov $8, %rsi
palignr $8, %xmm1, %xmm6
movaps %xmm6, (%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
L(Shl12):
movaps -12(%rcx), %xmm1
movaps 4(%rcx), %xmm2
L(Shl12Start):
pcmpeqd %xmm2, %xmm0
pmovmskb %xmm0, %rax
movaps %xmm2, %xmm3
test %rax, %rax
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
pcmpeqd %xmm2, %xmm0
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
movaps %xmm2, %xmm3
test %rax, %rax
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
movaps %xmm3, %xmm1
pcmpeqd %xmm2, %xmm0
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
movaps %xmm2, %xmm3
test %rax, %rax
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
pcmpeqd %xmm2, %xmm0
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
movaps %xmm2, %xmm3
test %rax, %rax
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
lea 20(%rcx), %rcx
lea 16(%rdx), %rdx
mov %rcx, %rax
and $-0x40, %rcx
sub %rcx, %rax
lea -4(%rcx), %rcx
sub %rax, %rdx
movaps -12(%rcx), %xmm1
L(Shl12LoopStart):
movaps 4(%rcx), %xmm2
movaps 20(%rcx), %xmm3
movaps %xmm3, %xmm6
movaps 36(%rcx), %xmm4
movaps %xmm4, %xmm7
movaps 52(%rcx), %xmm5
pminub %xmm2, %xmm6
pminub %xmm5, %xmm7
pminub %xmm6, %xmm7
pcmpeqd %xmm0, %xmm7
pmovmskb %xmm7, %rax
movaps %xmm5, %xmm7
palignr $12, %xmm4, %xmm5
test %rax, %rax
palignr $12, %xmm3, %xmm4
jnz L(Shl12Start)
palignr $12, %xmm2, %xmm3
lea 64(%rcx), %rcx
palignr $12, %xmm1, %xmm2
movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
movaps %xmm4, 32(%rdx)
movaps %xmm3, 16(%rdx)
movaps %xmm2, (%rdx)
lea 64(%rdx), %rdx
jmp L(Shl12LoopStart)
L(Shl12LoopExit):
movaps (%rdx), %xmm6
psrldq $4, %xmm6
mov $4, %rsi
palignr $12, %xmm1, %xmm6
movaps %xmm6, (%rdx)
.p2align 4
L(CopyFrom1To16Bytes):
add %rsi, %rdx
add %rsi, %rcx
test %al, %al
jz L(ExitHigh)
test $0x01, %al
jnz L(Exit4)
mov (%rcx), %rax
mov %rax, (%rdx)
mov %rdi, %rax
ret
.p2align 4
L(ExitHigh):
test $0x01, %ah
jnz L(Exit12)
mov (%rcx), %rax
mov %rax, (%rdx)
mov 8(%rcx), %rax
mov %rax, 8(%rdx)
mov %rdi, %rax
ret
.p2align 4
L(Exit4):
movl (%rcx), %eax
movl %eax, (%rdx)
mov %rdi, %rax
ret
.p2align 4
L(Exit8):
mov (%rcx), %rax
mov %rax, (%rdx)
mov %rdi, %rax
ret
.p2align 4
L(Exit12):
mov (%rcx), %rax
mov %rax, (%rdx)
mov 8(%rcx), %eax
mov %eax, 8(%rdx)
mov %rdi, %rax
ret
.p2align 4
L(Exit16):
movdqu (%rcx), %xmm0
movdqu %xmm0, (%rdx)
mov %rdi, %rax
ret
END(__wcscpy_ssse3)
#endif

View File

@ -0,0 +1,43 @@
/* Multiple versions of wcscpy
Copyright (C) 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <sysdep.h>
#include <init-arch.h>
/* Define multiple versions only for the definition in libc. */
#ifndef NOT_IN_libc
.text
ENTRY(wcscpy)
.type wcscpy, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
jne 1f
call __init_cpu_features
1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
jnz 2f
leaq __wcscpy_sse2(%rip), %rax
ret
2: leaq __wcscpy_ssse3(%rip), %rax
ret
END(wcscpy)
#endif

155
sysdeps/x86_64/wcschr.S Normal file
View File

@ -0,0 +1,155 @@
/* wcschr with SSSE3
Copyright (C) 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <sysdep.h>
.text
ENTRY (wcschr)
movd %rsi, %xmm1
pxor %xmm2, %xmm2
mov %rdi, %rcx
punpckldq %xmm1, %xmm1
punpckldq %xmm1, %xmm1
and $63, %rcx
cmp $48, %rcx
ja L(cross_cache)
movdqu (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rdx
pmovmskb %xmm0, %rax
or %rax, %rdx
jnz L(matches)
and $-16, %rdi
movdqa (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rdx
pmovmskb %xmm0, %rax
or %rax, %rdx
jnz L(matches)
jmp L(loop)
L(cross_cache):
and $15, %rcx
and $-16, %rdi
movdqa (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rdx
pmovmskb %xmm0, %rax
sar %cl, %rdx
sar %cl, %rax
test %rax, %rax
je L(unaligned_no_match)
bsf %rax, %rax
test %rdx, %rdx
je L(unaligned_match)
bsf %rdx, %rdx
cmp %rdx, %rax
ja L(return_null)
L(unaligned_match):
add %rdi, %rax
add %rcx, %rax
ret
.p2align 4
L(unaligned_no_match):
test %rdx, %rdx
jne L(return_null)
pxor %xmm2, %xmm2
add $16, %rdi
.p2align 4
/* Loop start on aligned string. */
L(loop):
movdqa (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rdx
pmovmskb %xmm0, %rax
or %rax, %rdx
jnz L(matches)
movdqa (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rdx
pmovmskb %xmm0, %rax
or %rax, %rdx
jnz L(matches)
movdqa (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rdx
pmovmskb %xmm0, %rax
or %rax, %rdx
jnz L(matches)
movdqa (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rdx
pmovmskb %xmm0, %rax
or %rax, %rdx
jnz L(matches)
jmp L(loop)
.p2align 4
L(matches):
pmovmskb %xmm2, %rdx
test %rax, %rax
jz L(return_null)
bsf %rax, %rax
test %rdx, %rdx
je L(match)
bsf %rdx, %rcx
cmp %rcx, %rax
ja L(return_null)
L(match):
sub $16, %rdi
add %rdi, %rax
ret
.p2align 4
L(return_null):
xor %rax, %rax
ret
END (wcschr)
libc_hidden_def(wcschr)

283
sysdeps/x86_64/wcsrchr.S Normal file
View File

@ -0,0 +1,283 @@
/* wcsrchr with SSSE3
Copyright (C) 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <sysdep.h>
.text
ENTRY (wcsrchr)
movd %rsi, %xmm1
mov %rdi, %rcx
punpckldq %xmm1, %xmm1
pxor %xmm2, %xmm2
punpckldq %xmm1, %xmm1
and $63, %rcx
cmp $48, %rcx
ja L(crosscache)
movdqu (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rcx
pmovmskb %xmm0, %rax
add $16, %rdi
test %rax, %rax
jnz L(unaligned_match1)
test %rcx, %rcx
jnz L(return_null)
and $-16, %rdi
xor %r8, %r8
jmp L(loop)
.p2align 4
L(unaligned_match1):
test %rcx, %rcx
jnz L(prolog_find_zero_1)
mov %rax, %r8
mov %rdi, %rsi
and $-16, %rdi
jmp L(loop)
.p2align 4
L(crosscache):
and $15, %rcx
and $-16, %rdi
pxor %xmm3, %xmm3
movdqa (%rdi), %xmm0
pcmpeqd %xmm0, %xmm3
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm3, %rdx
pmovmskb %xmm0, %rax
shr %cl, %rdx
shr %cl, %rax
add $16, %rdi
test %rax, %rax
jnz L(unaligned_match)
test %rdx, %rdx
jnz L(return_null)
xor %r8, %r8
jmp L(loop)
.p2align 4
L(unaligned_match):
test %rdx, %rdx
jnz L(prolog_find_zero)
mov %rax, %r8
lea (%rdi, %rcx), %rsi
/* Loop start on aligned string. */
.p2align 4
L(loop):
movdqa (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rcx
pmovmskb %xmm0, %rax
or %rax, %rcx
jnz L(matches)
movdqa (%rdi), %xmm3
pcmpeqd %xmm3, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm3
pmovmskb %xmm2, %rcx
pmovmskb %xmm3, %rax
or %rax, %rcx
jnz L(matches)
movdqa (%rdi), %xmm4
pcmpeqd %xmm4, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm4
pmovmskb %xmm2, %rcx
pmovmskb %xmm4, %rax
or %rax, %rcx
jnz L(matches)
movdqa (%rdi), %xmm5
pcmpeqd %xmm5, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm5
pmovmskb %xmm2, %rcx
pmovmskb %xmm5, %rax
or %rax, %rcx
jz L(loop)
.p2align 4
L(matches):
test %rax, %rax
jnz L(match)
L(return_value):
test %r8, %r8
jz L(return_null)
mov %r8, %rax
mov %rsi, %rdi
test $15 << 4, %ah
jnz L(match_fourth_wchar)
test %ah, %ah
jnz L(match_third_wchar)
test $15 << 4, %al
jnz L(match_second_wchar)
lea -16(%rdi), %rax
ret
.p2align 4
L(match):
pmovmskb %xmm2, %rcx
test %rcx, %rcx
jnz L(find_zero)
mov %rax, %r8
mov %rdi, %rsi
jmp L(loop)
.p2align 4
L(find_zero):
test $15, %cl
jnz L(find_zero_in_first_wchar)
test %cl, %cl
jnz L(find_zero_in_second_wchar)
test $15, %ch
jnz L(find_zero_in_third_wchar)
and $1 << 13 - 1, %rax
jz L(return_value)
test $15 << 4, %ah
jnz L(match_fourth_wchar)
test %ah, %ah
jnz L(match_third_wchar)
test $15 << 4, %al
jnz L(match_second_wchar)
lea -16(%rdi), %rax
ret
.p2align 4
L(find_zero_in_first_wchar):
test $1, %rax
jz L(return_value)
lea -16(%rdi), %rax
ret
.p2align 4
L(find_zero_in_second_wchar):
and $1 << 5 - 1, %rax
jz L(return_value)
test $15 << 4, %al
jnz L(match_second_wchar)
lea -16(%rdi), %rax
ret
.p2align 4
L(find_zero_in_third_wchar):
and $1 << 9 - 1, %rax
jz L(return_value)
test %ah, %ah
jnz L(match_third_wchar)
test $15 << 4, %al
jnz L(match_second_wchar)
lea -16(%rdi), %rax
ret
.p2align 4
L(prolog_find_zero):
add %rcx, %rdi
mov %rdx, %rcx
L(prolog_find_zero_1):
test $15, %cl
jnz L(prolog_find_zero_in_first_wchar)
test %cl, %cl
jnz L(prolog_find_zero_in_second_wchar)
test $15, %ch
jnz L(prolog_find_zero_in_third_wchar)
and $1 << 13 - 1, %rax
jz L(return_null)
test $15 << 4, %ah
jnz L(match_fourth_wchar)
test %ah, %ah
jnz L(match_third_wchar)
test $15 << 4, %al
jnz L(match_second_wchar)
lea -16(%rdi), %rax
ret
.p2align 4
L(prolog_find_zero_in_first_wchar):
test $1, %rax
jz L(return_null)
lea -16(%rdi), %rax
ret
.p2align 4
L(prolog_find_zero_in_second_wchar):
and $1 << 5 - 1, %rax
jz L(return_null)
test $15 << 4, %al
jnz L(match_second_wchar)
lea -16(%rdi), %rax
ret
.p2align 4
L(prolog_find_zero_in_third_wchar):
and $1 << 9 - 1, %rax
jz L(return_null)
test %ah, %ah
jnz L(match_third_wchar)
test $15 << 4, %al
jnz L(match_second_wchar)
lea -16(%rdi), %rax
ret
.p2align 4
L(match_second_wchar):
lea -12(%rdi), %rax
ret
.p2align 4
L(match_third_wchar):
lea -8(%rdi), %rax
ret
.p2align 4
L(match_fourth_wchar):
lea -4(%rdi), %rax
ret
.p2align 4
L(return_null):
xor %rax, %rax
ret
END (wcsrchr)

View File

@ -18,8 +18,11 @@
#include <wchar.h>
/* Find the first occurrence of WC in WCS. */
#ifdef WCSCHR
# define wcschr WCSCHR
#endif
wchar_t *
wcschr (wcs, wc)
register const wchar_t *wcs;