mirror of
https://sourceware.org/git/glibc.git
synced 2024-11-21 20:40:05 +00:00
Optimized wcschr and wcscpy for x86-64 and x86-32
This commit is contained in:
parent
a2d18b64ed
commit
1d3e4b618a
25
ChangeLog
25
ChangeLog
@ -1,3 +1,28 @@
|
||||
2011-11-14 Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
|
||||
|
||||
* sysdeps/x86_64/multiarch/Makefile [subdir=wcsmbs] (sysdep_routines):
|
||||
Add wcscpy-ssse3 wcscpy-c.
|
||||
* sysdeps/x86_64/multiarch/wcscpy-ssse3.S: New file.
|
||||
* sysdeps/x86_64/multiarch/wcscpy-c.c: New file.
|
||||
* sysdeps/x86_64/multiarch/wcscpy.S: New file.
|
||||
* sysdeps/x86_64/wcschr.S: New file.
|
||||
* sysdeps/x86_64/wcsrchr.S: New file.
|
||||
* string/test-strcmp.c: Remove checking of wcscmp function for
|
||||
wrong alignments.
|
||||
* sysdeps/i386/i686/multiarch/Makefile [subdir=wcsmbs]
|
||||
(sysdep_routines): Add wcscpy-ssse3 wcscpy-c wcschr-sse2 wcschr-c
|
||||
wcsrchr-sse2 wcsrchr-c.
|
||||
* sysdeps/i386/i686/multiarch/wcschr.S: New file.
|
||||
* sysdeps/i386/i686/multiarch/wcschr-c.c: New file.
|
||||
* sysdeps/i386/i686/multiarch/wcschr-sse2.S: New file.
|
||||
* sysdeps/i386/i686/multiarch/wcsrchr.S: New file.
|
||||
* sysdeps/i386/i686/multiarch/wcsrchr-c.c: New file.
|
||||
* sysdeps/i386/i686/multiarch/wcsrchr-sse2.S: New file.
|
||||
* sysdeps/i386/i686/multiarch/wcscpy.S: New file.
|
||||
* sysdeps/i386/i686/multiarch/wcscpy-c.c: New file.
|
||||
* sysdeps/i386/i686/multiarch/wcscpy-ssse3.S: New file.
|
||||
* wcsmbs/wcschr.c (WCSCHR): New macro.
|
||||
|
||||
2011-11-17 Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
|
||||
|
||||
* wcsmbs/Makefile (strop-tests): Add wcsrchr wcscpy.
|
||||
|
3
NEWS
3
NEWS
@ -34,7 +34,8 @@ Version 2.15
|
||||
* Optimized strchr and strrchr for SSE on x86-32.
|
||||
Contributed by Liubov Dmitrieva.
|
||||
|
||||
* Optimized memchr, memrchr, rawmemchr, memcmp, wmemcmp for x86-64 and x86-32.
|
||||
* Optimized memchr, memrchr, rawmemchr, memcmp, wmemcmp, wcschr, wcscpy
|
||||
for x86-64 and x86-32.
|
||||
Contributed by Liubov Dmitrieva.
|
||||
|
||||
* New interfaces: scandirat, scandirat64
|
||||
|
@ -221,14 +221,14 @@ do_test (size_t align1, size_t align2, size_t len, int max_char,
|
||||
static void
|
||||
do_random_tests (void)
|
||||
{
|
||||
for (size_t a = 0; a < CHARBYTES; a += CHARALIGN)
|
||||
for (size_t b = 0; b < CHARBYTES; b += CHARALIGN)
|
||||
{
|
||||
UCHAR *p1 = (UCHAR *) (buf1 + page_size - 512 * CHARBYTES - a);
|
||||
UCHAR *p2 = (UCHAR *) (buf2 + page_size - 512 * CHARBYTES - b);
|
||||
UCHAR *p1 = (UCHAR *) (buf1 + page_size - 512 * CHARBYTES);
|
||||
UCHAR *p2 = (UCHAR *) (buf2 + page_size - 512 * CHARBYTES);
|
||||
|
||||
for (size_t n = 0; n < ITERATIONS; n++)
|
||||
{
|
||||
/* For the wcscmp case, align1 and align2 are alignments counted in wchar_t
|
||||
elements, i.e. 4*k bytes.  Other alignments, for example p1 = (wchar_t *) (buf1 + 1),
|
||||
are not tested because such a pointer is not a valid use of the wchar_t type.  */
|
||||
size_t align1 = random () & 31;
|
||||
size_t align2;
|
||||
if (random () & 1)
|
||||
@ -274,7 +274,7 @@ do_random_tests (void)
|
||||
}
|
||||
|
||||
int result = 0;
|
||||
MEMCPY ((CHAR *) (p2 + align2), (CHAR *) (p1 + align1), pos);
|
||||
MEMCPY (p2 + align2, p1 + align1, pos);
|
||||
if (pos < len1)
|
||||
{
|
||||
if (p2[align2 + pos] == p1[align1 + pos])
|
||||
@ -302,13 +302,12 @@ do_random_tests (void)
|
||||
|| (r < 0 && result >= 0)
|
||||
|| (r > 0 && result <= 0))
|
||||
{
|
||||
error (0, 0, "Iteration %zd - wrong result in function %s (%zd, %zd, %zd, %zd, %zd) %d != %d, p1 %p p2 %p",
|
||||
error (0, 0, "Iteration %zd - wrong result in function %s (align in bytes: %zd, align in bytes: %zd, len1: %zd, len2: %zd, pos: %zd) %d != %d, p1 %p p2 %p",
|
||||
n, impl->name, (size_t) (p1 + align1) & 63, (size_t) (p1 + align2) & 63, len1, len2, pos, r, result, p1, p2);
|
||||
ret = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -37,7 +37,8 @@ endif
|
||||
|
||||
ifeq ($(subdir),wcsmbs)
|
||||
sysdep_routines += wcscmp-sse2 wcscmp-c wcslen-sse2 wcslen-c \
|
||||
wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
|
||||
wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcschr-sse2 \
|
||||
wcschr-c wcsrchr-sse2 wcsrchr-c wcscpy-ssse3 wcscpy-c
|
||||
endif
|
||||
|
||||
ifeq (mathyes,$(subdir)$(config-cflags-avx))
|
||||
|
8
sysdeps/i386/i686/multiarch/wcschr-c.c
Normal file
8
sysdeps/i386/i686/multiarch/wcschr-c.c
Normal file
@ -0,0 +1,8 @@
|
||||
/* Wrapper that compiles wcsmbs/wcschr.c under the name __wcschr_ia32 when
   building inside libc.  */
#ifndef NOT_IN_libc
|
||||
/* Replace libc_hidden_def so that it expands to a __hidden_ver1 tying
   __GI_wcschr to __wcschr_ia32, whatever NAME is passed.
   NOTE(review): presumably this makes libc-internal wcschr references bind
   to the ia32 version -- confirm against the __hidden_ver1 definition.  */
# undef libc_hidden_def
|
||||
# define libc_hidden_def(name) \
|
||||
__hidden_ver1 (__wcschr_ia32, __GI_wcschr, __wcschr_ia32);
|
||||
/* NOTE(review): presumably the function name used by wcsmbs/wcschr.c
   -- confirm there.  */
# define WCSCHR __wcschr_ia32
|
||||
#endif
|
||||
|
||||
#include "wcsmbs/wcschr.c"
|
220
sysdeps/i386/i686/multiarch/wcschr-sse2.S
Normal file
220
sysdeps/i386/i686/multiarch/wcschr-sse2.S
Normal file
@ -0,0 +1,220 @@
|
||||
/* wcschr with SSE2, without using bsf instructions
|
||||
Copyright (C) 2011 Free Software Foundation, Inc.
|
||||
Contributed by Intel Corporation.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
||||
02111-1307 USA. */
|
||||
|
||||
#ifndef NOT_IN_libc
|
||||
# include <sysdep.h>
|
||||
|
||||
# define CFI_PUSH(REG) \
|
||||
cfi_adjust_cfa_offset (4); \
|
||||
cfi_rel_offset (REG, 0)
|
||||
|
||||
# define CFI_POP(REG) \
|
||||
cfi_adjust_cfa_offset (-4); \
|
||||
cfi_restore (REG)
|
||||
|
||||
# define PUSH(REG) pushl REG; CFI_PUSH (REG)
|
||||
# define POP(REG) popl REG; CFI_POP (REG)
|
||||
|
||||
# define PARMS 4
|
||||
# define STR1 PARMS
|
||||
# define STR2 STR1+4
|
||||
|
||||
atom_text_section
|
||||
/* __wcschr_sse2: first stack argument is the string pointer, second is the
   32-bit element to search for.  Returns in %eax a pointer to the first
   matching element, or 0 if a zero element is found before any match.

   Register use: %ecx = address of the 16-byte block being examined,
   %xmm1 = search element replicated into all four dwords,
   %xmm2 = zero (becomes the zero-element mask after a compare),
   %eax = byte mask of matching dwords, %edx = byte mask of zero dwords.
   Each dword contributes four mask bits, so bits 0-3 describe element 0,
   bits 4-7 element 1, bits 8-11 element 2 and bits 12-15 element 3.  */
ENTRY (__wcschr_sse2)
|
||||
|
||||
mov STR1(%esp), %ecx
|
||||
movd STR2(%esp), %xmm1
|
||||
|
||||
mov %ecx, %eax
|
||||
/* Two punpckldq steps replicate the low dword of %xmm1 into all four.  */
punpckldq %xmm1, %xmm1
|
||||
pxor %xmm2, %xmm2
|
||||
punpckldq %xmm1, %xmm1
|
||||
|
||||
/* If the pointer's offset inside its 64-byte block is above 48, a 16-byte
   unaligned load would run past that block; use the aligned path.  */
and $63, %eax
|
||||
cmp $48, %eax
|
||||
ja L(cross_cache)
|
||||
|
||||
/* First 16 bytes, unaligned: build the zero mask and the match mask.  */
movdqu (%ecx), %xmm0
|
||||
pcmpeqd %xmm0, %xmm2
|
||||
pcmpeqd %xmm1, %xmm0
|
||||
pmovmskb %xmm2, %edx
|
||||
pmovmskb %xmm0, %eax
|
||||
or %eax, %edx
|
||||
jnz L(matches)
|
||||
/* Nothing found: round down to 16 bytes; L(loop) advances before loading.  */
and $-16, %ecx
|
||||
jmp L(loop)
|
||||
|
||||
.p2align 4
|
||||
L(cross_cache):
|
||||
PUSH (%edi)
|
||||
/* %edi = pointer rounded down to 16 bytes, %ecx = pointer & 15.  */
mov %ecx, %edi
|
||||
mov %eax, %ecx
|
||||
and $-16, %edi
|
||||
and $15, %ecx
|
||||
movdqa (%edi), %xmm0
|
||||
pcmpeqd %xmm0, %xmm2
|
||||
pcmpeqd %xmm1, %xmm0
|
||||
pmovmskb %xmm2, %edx
|
||||
pmovmskb %xmm0, %eax
|
||||
|
||||
/* Shift out the mask bits of bytes that precede the start of the string,
   so bit 0 now corresponds to the first byte of the string.  */
sarl %cl, %edx
|
||||
sarl %cl, %eax
|
||||
test %eax, %eax
|
||||
jz L(unaligned_no_match)
|
||||
|
||||
/* Match found: rebuild the original pointer in %ecx.  */
add %edi, %ecx
|
||||
POP (%edi)
|
||||
|
||||
/* Same decision tree as L(matches)/L(match_case2) below.  */
test %edx, %edx
|
||||
jz L(match_case1)
|
||||
test %al, %al
|
||||
jz L(match_higth_case2)
|
||||
test $15, %al
|
||||
jnz L(match_case2_4)
|
||||
test $15, %dl
|
||||
jnz L(return_null)
|
||||
lea 4(%ecx), %eax
|
||||
ret
|
||||
|
||||
/* Unwind info only: the code below is reached with %edi still pushed.  */
CFI_PUSH (%edi)
|
||||
|
||||
.p2align 4
|
||||
L(unaligned_no_match):
|
||||
/* Continue from the aligned block address.  */
mov %edi, %ecx
|
||||
POP (%edi)
|
||||
|
||||
/* A zero element at or after the string start and no match: not found.  */
test %edx, %edx
|
||||
jnz L(return_null)
|
||||
|
||||
/* %xmm2 may hold hits from bytes before the string start; clear it so it
   can serve as the zero constant again.  */
pxor %xmm2, %xmm2
|
||||
|
||||
/* Loop start on aligned string. */
|
||||
.p2align 4
|
||||
L(loop):
|
||||
/* Unrolled four times: advance 16 bytes, load aligned, test for a zero
   element or a match.  %xmm2 is still zero whenever no zero element was
   found, so it is reused without reloading.  */
add $16, %ecx
|
||||
movdqa (%ecx), %xmm0
|
||||
pcmpeqd %xmm0, %xmm2
|
||||
pcmpeqd %xmm1, %xmm0
|
||||
pmovmskb %xmm2, %edx
|
||||
pmovmskb %xmm0, %eax
|
||||
or %eax, %edx
|
||||
jnz L(matches)
|
||||
add $16, %ecx
|
||||
|
||||
movdqa (%ecx), %xmm0
|
||||
pcmpeqd %xmm0, %xmm2
|
||||
pcmpeqd %xmm1, %xmm0
|
||||
pmovmskb %xmm2, %edx
|
||||
pmovmskb %xmm0, %eax
|
||||
or %eax, %edx
|
||||
jnz L(matches)
|
||||
add $16, %ecx
|
||||
|
||||
movdqa (%ecx), %xmm0
|
||||
pcmpeqd %xmm0, %xmm2
|
||||
pcmpeqd %xmm1, %xmm0
|
||||
pmovmskb %xmm2, %edx
|
||||
pmovmskb %xmm0, %eax
|
||||
or %eax, %edx
|
||||
jnz L(matches)
|
||||
add $16, %ecx
|
||||
|
||||
movdqa (%ecx), %xmm0
|
||||
pcmpeqd %xmm0, %xmm2
|
||||
pcmpeqd %xmm1, %xmm0
|
||||
pmovmskb %xmm2, %edx
|
||||
pmovmskb %xmm0, %eax
|
||||
or %eax, %edx
|
||||
jz L(loop)
|
||||
|
||||
.p2align 4
|
||||
L(matches):
|
||||
/* %edx was overwritten by the OR above; reload the pure zero mask.  */
pmovmskb %xmm2, %edx
|
||||
/* No match bits: the hit was a zero element only.  */
test %eax, %eax
|
||||
jz L(return_null)
|
||||
/* Match and no zero element in this block.  */
test %edx, %edx
|
||||
jz L(match_case1)
|
||||
|
||||
.p2align 4
|
||||
L(match_case2):
|
||||
/* Both a match and a zero element in this block: the match counts only if
   no zero element precedes it.  */
test %al, %al
|
||||
jz L(match_higth_case2)
|
||||
/* Match in element 0.  */
test $15, %al
|
||||
jnz L(match_case2_4)
|
||||
/* Match in element 1; fail if element 0 is zero.  */
test $15, %dl
|
||||
jnz L(return_null)
|
||||
lea 4(%ecx), %eax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(match_case2_4):
|
||||
mov %ecx, %eax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(match_higth_case2):
|
||||
/* Match is in element 2 or 3; fail if element 0 or 1 is zero.  */
test %dl, %dl
|
||||
jnz L(return_null)
|
||||
/* Match in element 2.  */
test $15, %ah
|
||||
jnz L(match_case2_12)
|
||||
/* Match in element 3; fail if element 2 is zero.  */
test $15, %dh
|
||||
jnz L(return_null)
|
||||
lea 12(%ecx), %eax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(match_case2_12):
|
||||
lea 8(%ecx), %eax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(match_case1):
|
||||
/* Match without any zero element: return the lowest matching element.  */
test %al, %al
|
||||
jz L(match_higth_case1)
|
||||
|
||||
test $0x01, %al
|
||||
jnz L(exit0)
|
||||
lea 4(%ecx), %eax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(match_higth_case1):
|
||||
/* Bit 0 of %ah set means element 2 matched; otherwise element 3.  */
test $0x01, %ah
|
||||
jnz L(exit3)
|
||||
lea 12(%ecx), %eax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(exit0):
|
||||
mov %ecx, %eax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(exit3):
|
||||
/* Returns the address of element 2 (byte offset 8).  */
lea 8(%ecx), %eax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(return_null):
|
||||
xor %eax, %eax
|
||||
ret
|
||||
|
||||
END (__wcschr_sse2)
|
||||
#endif
|
54
sysdeps/i386/i686/multiarch/wcschr.S
Normal file
54
sysdeps/i386/i686/multiarch/wcschr.S
Normal file
@ -0,0 +1,54 @@
|
||||
/* Multiple versions of wcschr
|
||||
Copyright (C) 2011 Free Software Foundation, Inc.
|
||||
Contributed by Intel Corporation.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
||||
02111-1307 USA. */
|
||||
|
||||
#include <sysdep.h>
|
||||
#include <init-arch.h>
|
||||
|
||||
#ifndef NOT_IN_libc
|
||||
/* PC thunk: loads the caller's return address (the address of the
   instruction following the call) into %ebx.  Emitted into a link-once
   section and given hidden visibility.  */
.section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
|
||||
.globl __i686.get_pc_thunk.bx
|
||||
.hidden __i686.get_pc_thunk.bx
|
||||
.p2align 4
|
||||
.type __i686.get_pc_thunk.bx,@function
|
||||
__i686.get_pc_thunk.bx:
|
||||
/* On entry the top of the stack holds the return address.  */
movl (%esp), %ebx
|
||||
ret
|
||||
|
||||
.text
|
||||
/* Indirect-function resolver for wcschr: returns in %eax the address of the
   implementation to use, __wcschr_sse2 when the SSE2 feature bit is set in
   __cpu_features and __wcschr_ia32 otherwise.  */
ENTRY(wcschr)
|
||||
.type wcschr, @gnu_indirect_function
|
||||
/* %ebx is used as the GOT base below; save it.  */
pushl %ebx
|
||||
cfi_adjust_cfa_offset (4)
|
||||
cfi_rel_offset (ebx, 0)
|
||||
call __i686.get_pc_thunk.bx
|
||||
addl $_GLOBAL_OFFSET_TABLE_, %ebx
|
||||
/* A zero word at KIND_OFFSET means __cpu_features is not yet initialized.  */
cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
|
||||
jne 1f
|
||||
call __init_cpu_features
|
||||
/* Default choice.  */
1: leal __wcschr_ia32@GOTOFF(%ebx), %eax
|
||||
testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
|
||||
jz 2f
|
||||
/* SSE2 available: prefer the SSE2 version.  */
leal __wcschr_sse2@GOTOFF(%ebx), %eax
|
||||
2: popl %ebx
|
||||
cfi_adjust_cfa_offset (-4);
|
||||
cfi_restore (ebx)
|
||||
ret
|
||||
END(wcschr)
|
||||
#endif
|
5
sysdeps/i386/i686/multiarch/wcscpy-c.c
Normal file
5
sysdeps/i386/i686/multiarch/wcscpy-c.c
Normal file
@ -0,0 +1,5 @@
|
||||
/* Wrapper that compiles wcsmbs/wcscpy.c with every occurrence of the
   identifier wcscpy renamed to __wcscpy_ia32 when building inside libc.  */
#ifndef NOT_IN_libc
|
||||
# define wcscpy __wcscpy_ia32
|
||||
#endif
|
||||
|
||||
#include "wcsmbs/wcscpy.c"
|
621
sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
Normal file
621
sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
Normal file
@ -0,0 +1,621 @@
|
||||
/* wcscpy with SSSE3
|
||||
Copyright (C) 2011 Free Software Foundation, Inc.
|
||||
Contributed by Intel Corporation.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
||||
02111-1307 USA. */
|
||||
|
||||
#ifndef NOT_IN_libc
|
||||
# include <sysdep.h>
|
||||
|
||||
# define CFI_PUSH(REG) \
|
||||
cfi_adjust_cfa_offset (4); \
|
||||
cfi_rel_offset (REG, 0)
|
||||
|
||||
# define CFI_POP(REG) \
|
||||
cfi_adjust_cfa_offset (-4); \
|
||||
cfi_restore (REG)
|
||||
|
||||
# define PUSH(REG) pushl REG; CFI_PUSH (REG)
|
||||
# define POP(REG) popl REG; CFI_POP (REG)
|
||||
|
||||
# define PARMS 4
|
||||
# define RETURN POP (%edi); ret; CFI_PUSH (%edi)
|
||||
# define STR1 PARMS
|
||||
# define STR2 STR1+4
|
||||
# define LEN STR2+4
|
||||
|
||||
atom_text_section
|
||||
/* __wcscpy_ssse3: first stack argument is the destination, second is the
   source.  Copies 32-bit elements from source to destination up to and
   including the first zero element and returns the destination in %eax.

   Register use: %edx = destination cursor, %ecx = source cursor,
   %edi = original destination (return value once pushed),
   %esi = byte offset added to both cursors in L(CopyFrom1To16Bytes),
   %xmm0 = zero constant / zero-element compare result,
   %eax = byte mask of zero dwords (four bits per element).

   NOTE(review): the aligned loads at fixed offsets in the Shl4/Shl8/Shl12
   paths appear to assume both pointers are 4-byte aligned -- confirm.  */
ENTRY (__wcscpy_ssse3)
|
||||
mov STR1(%esp), %edx
|
||||
mov STR2(%esp), %ecx
|
||||
|
||||
/* Check the first four elements one at a time.  The `l' suffix states the
   32-bit operand size explicitly; with an immediate and a memory operand
   there is no register from which the assembler could infer it.  */
cmpl $0, (%ecx)
|
||||
jz L(ExitTail4)
|
||||
cmpl $0, 4(%ecx)
|
||||
jz L(ExitTail8)
|
||||
cmpl $0, 8(%ecx)
|
||||
jz L(ExitTail12)
|
||||
cmpl $0, 12(%ecx)
|
||||
jz L(ExitTail16)
|
||||
|
||||
/* Keep the original destination for the return value.  */
PUSH (%edi)
|
||||
mov %edx, %edi
|
||||
|
||||
PUSH (%esi)
|
||||
/* %esi = (source + 16) rounded down to a 16-byte boundary.  */
lea 16(%ecx), %esi
|
||||
|
||||
and $-16, %esi
|
||||
|
||||
/* Look for a zero element in that aligned block, and copy the first 16
   bytes, which were just checked to contain no zero element.  */
pxor %xmm0, %xmm0
|
||||
pcmpeqd (%esi), %xmm0
|
||||
movdqu (%ecx), %xmm1
|
||||
movdqu %xmm1, (%edx)
|
||||
|
||||
pmovmskb %xmm0, %eax
|
||||
/* %esi becomes the byte offset of the aligned block from the source.  */
sub %ecx, %esi
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
/* Advance the destination to the next 16-byte boundary and move the
   source forward by the same number of bytes.  */
mov %edx, %eax
|
||||
lea 16(%edx), %edx
|
||||
and $-16, %edx
|
||||
sub %edx, %eax
|
||||
|
||||
sub %eax, %ecx
|
||||
/* Dispatch on the source's offset within 16 bytes.  */
mov %ecx, %eax
|
||||
and $0xf, %eax
|
||||
/* mov rather than xor: it leaves the flags from the `and' above intact
   for the jz below.  */
mov $0, %esi
|
||||
|
||||
jz L(Align16Both)
|
||||
cmp $4, %eax
|
||||
je L(Shl4)
|
||||
cmp $8, %eax
|
||||
je L(Shl8)
|
||||
jmp L(Shl12)
|
||||
|
||||
L(Align16Both):
|
||||
/* Source and destination are both 16-byte aligned.  Each step loads the
   next block, stores the previous one, and tests the new block for a zero
   element; %esi tracks the byte offset of the block under test.  %xmm0 is
   still all-zero after every compare that found nothing.  */
movaps (%ecx), %xmm1
|
||||
movaps 16(%ecx), %xmm2
|
||||
movaps %xmm1, (%edx)
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
pmovmskb %xmm0, %eax
|
||||
lea 16(%esi), %esi
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
movaps 16(%ecx, %esi), %xmm3
|
||||
movaps %xmm2, (%edx, %esi)
|
||||
pcmpeqd %xmm3, %xmm0
|
||||
pmovmskb %xmm0, %eax
|
||||
lea 16(%esi), %esi
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
movaps 16(%ecx, %esi), %xmm4
|
||||
movaps %xmm3, (%edx, %esi)
|
||||
pcmpeqd %xmm4, %xmm0
|
||||
pmovmskb %xmm0, %eax
|
||||
lea 16(%esi), %esi
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
movaps 16(%ecx, %esi), %xmm1
|
||||
movaps %xmm4, (%edx, %esi)
|
||||
pcmpeqd %xmm1, %xmm0
|
||||
pmovmskb %xmm0, %eax
|
||||
lea 16(%esi), %esi
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
movaps 16(%ecx, %esi), %xmm2
|
||||
movaps %xmm1, (%edx, %esi)
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
pmovmskb %xmm0, %eax
|
||||
lea 16(%esi), %esi
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
movaps 16(%ecx, %esi), %xmm3
|
||||
movaps %xmm2, (%edx, %esi)
|
||||
pcmpeqd %xmm3, %xmm0
|
||||
pmovmskb %xmm0, %eax
|
||||
lea 16(%esi), %esi
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
/* Store the last tested block, then round the source down to a 64-byte
   boundary and move the destination by the same amount.  */
movaps %xmm3, (%edx, %esi)
|
||||
mov %ecx, %eax
|
||||
lea 16(%ecx, %esi), %ecx
|
||||
and $-0x40, %ecx
|
||||
sub %ecx, %eax
|
||||
sub %eax, %edx
|
||||
|
||||
/* The loop advances both cursors by 64 before testing, so the exit path
   starts from offset -64.  */
mov $-0x40, %esi
|
||||
|
||||
L(Aligned64Loop):
|
||||
/* Load four blocks, keeping copies of blocks 0 and 2 in %xmm4/%xmm6, and
   take the byte-wise minimum of all four.  A zero dword in the minimum
   means a zero element may be present; L(Aligned64Leave) re-checks each
   block on its own.  */
movaps (%ecx), %xmm2
|
||||
movaps 32(%ecx), %xmm3
|
||||
movaps %xmm2, %xmm4
|
||||
movaps 16(%ecx), %xmm5
|
||||
movaps %xmm3, %xmm6
|
||||
movaps 48(%ecx), %xmm7
|
||||
pminub %xmm5, %xmm2
|
||||
pminub %xmm7, %xmm3
|
||||
pminub %xmm2, %xmm3
|
||||
lea 64(%edx), %edx
|
||||
pcmpeqd %xmm0, %xmm3
|
||||
lea 64(%ecx), %ecx
|
||||
pmovmskb %xmm3, %eax
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(Aligned64Leave)
|
||||
movaps %xmm4, -64(%edx)
|
||||
movaps %xmm5, -48(%edx)
|
||||
movaps %xmm6, -32(%edx)
|
||||
movaps %xmm7, -16(%edx)
|
||||
jmp L(Aligned64Loop)
|
||||
|
||||
L(Aligned64Leave):
|
||||
/* Test the four blocks in order, storing each one that is clean and
   bumping %esi so that it addresses the block under test.  */
pcmpeqd %xmm4, %xmm0
|
||||
pmovmskb %xmm0, %eax
|
||||
test %eax, %eax
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
pcmpeqd %xmm5, %xmm0
|
||||
pmovmskb %xmm0, %eax
|
||||
movaps %xmm4, -64(%edx)
|
||||
test %eax, %eax
|
||||
lea 16(%esi), %esi
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
pcmpeqd %xmm6, %xmm0
|
||||
pmovmskb %xmm0, %eax
|
||||
movaps %xmm5, -48(%edx)
|
||||
test %eax, %eax
|
||||
lea 16(%esi), %esi
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
movaps %xmm6, -32(%edx)
|
||||
pcmpeqd %xmm7, %xmm0
|
||||
pmovmskb %xmm0, %eax
|
||||
test %eax, %eax
|
||||
lea 16(%esi), %esi
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
/* No block held a zero element after all: store the last block, reset
   the exit offset and resume the loop.  */
mov $-0x40, %esi
|
||||
movaps %xmm7, -16(%edx)
|
||||
jmp L(Aligned64Loop)
|
||||
|
||||
.p2align 4
|
||||
L(Shl4):
|
||||
/* Source is 4 bytes past a 16-byte boundary.  Read aligned blocks around
   it and use palignr $4 to assemble each aligned destination block from
   two consecutive source blocks (%xmm1 = previous, %xmm2 = current).  */
movaps -4(%ecx), %xmm1
|
||||
movaps 12(%ecx), %xmm2
|
||||
L(Shl4Start):
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
pmovmskb %xmm0, %eax
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(Shl4LoopExit)
|
||||
|
||||
palignr $4, %xmm1, %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
movaps %xmm2, (%edx)
|
||||
movaps 28(%ecx), %xmm2
|
||||
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
lea 16(%edx), %edx
|
||||
pmovmskb %xmm0, %eax
|
||||
lea 16(%ecx), %ecx
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(Shl4LoopExit)
|
||||
|
||||
palignr $4, %xmm1, %xmm2
|
||||
movaps %xmm2, (%edx)
|
||||
movaps 28(%ecx), %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
lea 16(%edx), %edx
|
||||
pmovmskb %xmm0, %eax
|
||||
lea 16(%ecx), %ecx
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(Shl4LoopExit)
|
||||
|
||||
palignr $4, %xmm1, %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
movaps %xmm2, (%edx)
|
||||
movaps 28(%ecx), %xmm2
|
||||
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
lea 16(%edx), %edx
|
||||
pmovmskb %xmm0, %eax
|
||||
lea 16(%ecx), %ecx
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(Shl4LoopExit)
|
||||
|
||||
palignr $4, %xmm1, %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
movaps %xmm2, (%edx)
|
||||
lea 28(%ecx), %ecx
|
||||
lea 16(%edx), %edx
|
||||
|
||||
/* Re-base so that the loads in the loop below hit 64-byte boundaries;
   the destination moves back by the same number of bytes.  */
mov %ecx, %eax
|
||||
and $-0x40, %ecx
|
||||
sub %ecx, %eax
|
||||
lea -12(%ecx), %ecx
|
||||
sub %eax, %edx
|
||||
|
||||
movaps -4(%ecx), %xmm1
|
||||
|
||||
L(Shl4LoopStart):
|
||||
/* 64 bytes per iteration.  The byte-wise minimum of the four blocks is
   tested for a zero dword; on a hit, L(Shl4Start) re-examines the blocks
   one at a time (%xmm1 and %xmm2 are still unmodified at that point).  */
movaps 12(%ecx), %xmm2
|
||||
movaps 28(%ecx), %xmm3
|
||||
movaps %xmm3, %xmm6
|
||||
movaps 44(%ecx), %xmm4
|
||||
movaps %xmm4, %xmm7
|
||||
movaps 60(%ecx), %xmm5
|
||||
pminub %xmm2, %xmm6
|
||||
pminub %xmm5, %xmm7
|
||||
pminub %xmm6, %xmm7
|
||||
pcmpeqd %xmm0, %xmm7
|
||||
pmovmskb %xmm7, %eax
|
||||
movaps %xmm5, %xmm7
|
||||
palignr $4, %xmm4, %xmm5
|
||||
test %eax, %eax
|
||||
palignr $4, %xmm3, %xmm4
|
||||
jnz L(Shl4Start)
|
||||
|
||||
palignr $4, %xmm2, %xmm3
|
||||
lea 64(%ecx), %ecx
|
||||
palignr $4, %xmm1, %xmm2
|
||||
/* The last source block becomes "previous" for the next iteration.  */
movaps %xmm7, %xmm1
|
||||
movaps %xmm5, 48(%edx)
|
||||
movaps %xmm4, 32(%edx)
|
||||
movaps %xmm3, 16(%edx)
|
||||
movaps %xmm2, (%edx)
|
||||
lea 64(%edx), %edx
|
||||
jmp L(Shl4LoopStart)
|
||||
|
||||
L(Shl4LoopExit):
|
||||
/* Write the 12 pending source bytes held in the upper part of %xmm1 while
   keeping the destination's existing top dword, then step both cursors
   past those 12 bytes.  */
movaps (%edx), %xmm6
|
||||
psrldq $12, %xmm6
|
||||
palignr $4, %xmm1, %xmm6
|
||||
movaps %xmm6, (%edx)
|
||||
add $12, %edx
|
||||
add $12, %ecx
|
||||
|
||||
/* Tail copy selected by the zero mask (same tree as
   L(CopyFrom1To16Bytes)): here the zero is in element 1, copy 8 bytes.  */
POP (%esi)
|
||||
test %al, %al
|
||||
jz L(ExitHigh)
|
||||
test $0x01, %al
|
||||
jnz L(Exit4)
|
||||
movlpd (%ecx), %xmm0
|
||||
movlpd %xmm0, (%edx)
|
||||
movl %edi, %eax
|
||||
RETURN
|
||||
|
||||
/* Unwind info only: the code below is reached with %esi still pushed.  */
CFI_PUSH (%esi)
|
||||
|
||||
.p2align 4
|
||||
L(Shl8):
|
||||
/* Source is 8 bytes past a 16-byte boundary: same scheme as L(Shl4) with
   an 8-byte palignr shift.  */
movaps -8(%ecx), %xmm1
|
||||
movaps 8(%ecx), %xmm2
|
||||
L(Shl8Start):
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
pmovmskb %xmm0, %eax
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(Shl8LoopExit)
|
||||
|
||||
palignr $8, %xmm1, %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
movaps %xmm2, (%edx)
|
||||
movaps 24(%ecx), %xmm2
|
||||
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
lea 16(%edx), %edx
|
||||
pmovmskb %xmm0, %eax
|
||||
lea 16(%ecx), %ecx
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(Shl8LoopExit)
|
||||
|
||||
palignr $8, %xmm1, %xmm2
|
||||
movaps %xmm2, (%edx)
|
||||
movaps 24(%ecx), %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
lea 16(%edx), %edx
|
||||
pmovmskb %xmm0, %eax
|
||||
lea 16(%ecx), %ecx
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(Shl8LoopExit)
|
||||
|
||||
palignr $8, %xmm1, %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
movaps %xmm2, (%edx)
|
||||
movaps 24(%ecx), %xmm2
|
||||
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
lea 16(%edx), %edx
|
||||
pmovmskb %xmm0, %eax
|
||||
lea 16(%ecx), %ecx
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(Shl8LoopExit)
|
||||
|
||||
palignr $8, %xmm1, %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
movaps %xmm2, (%edx)
|
||||
lea 24(%ecx), %ecx
|
||||
lea 16(%edx), %edx
|
||||
|
||||
/* Re-base for the 64-byte loop, as in the Shl4 path.  */
mov %ecx, %eax
|
||||
and $-0x40, %ecx
|
||||
sub %ecx, %eax
|
||||
lea -8(%ecx), %ecx
|
||||
sub %eax, %edx
|
||||
|
||||
movaps -8(%ecx), %xmm1
|
||||
|
||||
L(Shl8LoopStart):
|
||||
movaps 8(%ecx), %xmm2
|
||||
movaps 24(%ecx), %xmm3
|
||||
movaps %xmm3, %xmm6
|
||||
movaps 40(%ecx), %xmm4
|
||||
movaps %xmm4, %xmm7
|
||||
movaps 56(%ecx), %xmm5
|
||||
pminub %xmm2, %xmm6
|
||||
pminub %xmm5, %xmm7
|
||||
pminub %xmm6, %xmm7
|
||||
pcmpeqd %xmm0, %xmm7
|
||||
pmovmskb %xmm7, %eax
|
||||
movaps %xmm5, %xmm7
|
||||
palignr $8, %xmm4, %xmm5
|
||||
test %eax, %eax
|
||||
palignr $8, %xmm3, %xmm4
|
||||
jnz L(Shl8Start)
|
||||
|
||||
palignr $8, %xmm2, %xmm3
|
||||
lea 64(%ecx), %ecx
|
||||
palignr $8, %xmm1, %xmm2
|
||||
movaps %xmm7, %xmm1
|
||||
movaps %xmm5, 48(%edx)
|
||||
movaps %xmm4, 32(%edx)
|
||||
movaps %xmm3, 16(%edx)
|
||||
movaps %xmm2, (%edx)
|
||||
lea 64(%edx), %edx
|
||||
jmp L(Shl8LoopStart)
|
||||
|
||||
L(Shl8LoopExit):
|
||||
/* Write the 8 pending source bytes from %xmm1, keeping the destination's
   existing upper 8 bytes, then step both cursors past them.  */
movaps (%edx), %xmm6
|
||||
psrldq $8, %xmm6
|
||||
palignr $8, %xmm1, %xmm6
|
||||
movaps %xmm6, (%edx)
|
||||
add $8, %edx
|
||||
add $8, %ecx
|
||||
|
||||
POP (%esi)
|
||||
test %al, %al
|
||||
jz L(ExitHigh)
|
||||
test $0x01, %al
|
||||
jnz L(Exit4)
|
||||
movlpd (%ecx), %xmm0
|
||||
movlpd %xmm0, (%edx)
|
||||
movl %edi, %eax
|
||||
RETURN
|
||||
|
||||
/* Unwind info only: the code below is reached with %esi still pushed.  */
CFI_PUSH (%esi)
|
||||
|
||||
.p2align 4
|
||||
L(Shl12):
|
||||
/* Source is 12 bytes past a 16-byte boundary: same scheme as L(Shl4) with
   a 12-byte palignr shift.  */
movaps -12(%ecx), %xmm1
|
||||
movaps 4(%ecx), %xmm2
|
||||
L(Shl12Start):
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
pmovmskb %xmm0, %eax
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(Shl12LoopExit)
|
||||
|
||||
palignr $12, %xmm1, %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
movaps %xmm2, (%edx)
|
||||
movaps 20(%ecx), %xmm2
|
||||
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
lea 16(%edx), %edx
|
||||
pmovmskb %xmm0, %eax
|
||||
lea 16(%ecx), %ecx
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(Shl12LoopExit)
|
||||
|
||||
palignr $12, %xmm1, %xmm2
|
||||
movaps %xmm2, (%edx)
|
||||
movaps 20(%ecx), %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
lea 16(%edx), %edx
|
||||
pmovmskb %xmm0, %eax
|
||||
lea 16(%ecx), %ecx
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(Shl12LoopExit)
|
||||
|
||||
palignr $12, %xmm1, %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
movaps %xmm2, (%edx)
|
||||
movaps 20(%ecx), %xmm2
|
||||
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
lea 16(%edx), %edx
|
||||
pmovmskb %xmm0, %eax
|
||||
lea 16(%ecx), %ecx
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(Shl12LoopExit)
|
||||
|
||||
palignr $12, %xmm1, %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
movaps %xmm2, (%edx)
|
||||
lea 20(%ecx), %ecx
|
||||
lea 16(%edx), %edx
|
||||
|
||||
/* Re-base for the 64-byte loop, as in the Shl4 path.  */
mov %ecx, %eax
|
||||
and $-0x40, %ecx
|
||||
sub %ecx, %eax
|
||||
lea -4(%ecx), %ecx
|
||||
sub %eax, %edx
|
||||
|
||||
movaps -12(%ecx), %xmm1
|
||||
|
||||
L(Shl12LoopStart):
|
||||
movaps 4(%ecx), %xmm2
|
||||
movaps 20(%ecx), %xmm3
|
||||
movaps %xmm3, %xmm6
|
||||
movaps 36(%ecx), %xmm4
|
||||
movaps %xmm4, %xmm7
|
||||
movaps 52(%ecx), %xmm5
|
||||
pminub %xmm2, %xmm6
|
||||
pminub %xmm5, %xmm7
|
||||
pminub %xmm6, %xmm7
|
||||
pcmpeqd %xmm0, %xmm7
|
||||
pmovmskb %xmm7, %eax
|
||||
movaps %xmm5, %xmm7
|
||||
palignr $12, %xmm4, %xmm5
|
||||
test %eax, %eax
|
||||
palignr $12, %xmm3, %xmm4
|
||||
jnz L(Shl12Start)
|
||||
|
||||
palignr $12, %xmm2, %xmm3
|
||||
lea 64(%ecx), %ecx
|
||||
palignr $12, %xmm1, %xmm2
|
||||
movaps %xmm7, %xmm1
|
||||
movaps %xmm5, 48(%edx)
|
||||
movaps %xmm4, 32(%edx)
|
||||
movaps %xmm3, 16(%edx)
|
||||
movaps %xmm2, (%edx)
|
||||
lea 64(%edx), %edx
|
||||
jmp L(Shl12LoopStart)
|
||||
|
||||
L(Shl12LoopExit):
|
||||
/* Write the 4 pending source bytes from %xmm1, keeping the destination's
   existing upper 12 bytes.  %esi = 4 makes the code below step both
   cursors past them; execution falls through into
   L(CopyFrom1To16Bytes).  */
movaps (%edx), %xmm6
|
||||
psrldq $4, %xmm6
|
||||
mov $4, %esi
|
||||
palignr $12, %xmm1, %xmm6
|
||||
movaps %xmm6, (%edx)
|
||||
|
||||
.p2align 4
|
||||
L(CopyFrom1To16Bytes):
|
||||
/* Common tail: move both cursors to the block that holds the zero
   element, then copy 4, 8, 12 or 16 bytes depending on which element of
   the mask in %eax is the first zero.  */
add %esi, %edx
|
||||
add %esi, %ecx
|
||||
|
||||
POP (%esi)
|
||||
/* Low mask byte clear: the zero is in element 2 or 3.  */
test %al, %al
|
||||
jz L(ExitHigh)
|
||||
/* Zero in element 0: copy 4 bytes.  */
test $0x01, %al
|
||||
jnz L(Exit4)
|
||||
/* Zero in element 1: copy 8 bytes.  */
movlpd (%ecx), %xmm0
|
||||
movlpd %xmm0, (%edx)
|
||||
movl %edi, %eax
|
||||
RETURN
|
||||
|
||||
.p2align 4
|
||||
L(ExitHigh):
|
||||
/* Zero in element 2: copy 12 bytes; otherwise copy all 16.  */
test $0x01, %ah
|
||||
jnz L(Exit12)
|
||||
movdqu (%ecx), %xmm0
|
||||
movdqu %xmm0, (%edx)
|
||||
movl %edi, %eax
|
||||
RETURN
|
||||
|
||||
.p2align 4
|
||||
L(Exit4):
|
||||
movl (%ecx), %eax
|
||||
movl %eax, (%edx)
|
||||
movl %edi, %eax
|
||||
RETURN
|
||||
|
||||
.p2align 4
|
||||
L(Exit12):
|
||||
movlpd (%ecx), %xmm0
|
||||
movlpd %xmm0, (%edx)
|
||||
movl 8(%ecx), %eax
|
||||
movl %eax, 8(%edx)
|
||||
movl %edi, %eax
|
||||
RETURN
|
||||
|
||||
/* Unwind info only: the ExitTail paths below are reached from the entry
   checks, before %edi was pushed.  */
CFI_POP (%edi)
|
||||
|
||||
.p2align 4
|
||||
L(ExitTail4):
|
||||
/* Zero element is source[0]: copy 4 bytes and return the destination,
   which is still in %edx.  */
movl (%ecx), %eax
|
||||
movl %eax, (%edx)
|
||||
movl %edx, %eax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(ExitTail8):
|
||||
/* Zero element is source[1]: copy 8 bytes.  */
movlpd (%ecx), %xmm0
|
||||
movlpd %xmm0, (%edx)
|
||||
movl %edx, %eax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(ExitTail12):
|
||||
/* Zero element is source[2]: copy 12 bytes.  */
movlpd (%ecx), %xmm0
|
||||
movlpd %xmm0, (%edx)
|
||||
movl 8(%ecx), %eax
|
||||
movl %eax, 8(%edx)
|
||||
movl %edx, %eax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(ExitTail16):
|
||||
/* Zero element is source[3]: copy 16 bytes.  */
movdqu (%ecx), %xmm0
|
||||
movdqu %xmm0, (%edx)
|
||||
movl %edx, %eax
|
||||
ret
|
||||
|
||||
END (__wcscpy_ssse3)
|
||||
#endif
|
46
sysdeps/i386/i686/multiarch/wcscpy.S
Normal file
46
sysdeps/i386/i686/multiarch/wcscpy.S
Normal file
@ -0,0 +1,46 @@
|
||||
/* Multiple versions of wcscpy
|
||||
Copyright (C) 2011 Free Software Foundation, Inc.
|
||||
Contributed by Intel Corporation.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
||||
02111-1307 USA. */
|
||||
|
||||
#include <sysdep.h>
|
||||
#include <init-arch.h>
|
||||
|
||||
/* Define multiple versions only for the definition in libc. */
|
||||
#ifndef NOT_IN_libc
|
||||
.text
|
||||
/* Indirect-function resolver for wcscpy: returns in %eax the address of the
   implementation to use, __wcscpy_ssse3 when the SSSE3 feature bit is set
   in __cpu_features and __wcscpy_ia32 otherwise.  */
ENTRY(wcscpy)
|
||||
.type wcscpy, @gnu_indirect_function
|
||||
/* %ebx is used as the GOT base below; save it.  */
pushl %ebx
|
||||
cfi_adjust_cfa_offset (4)
|
||||
cfi_rel_offset (ebx, 0)
|
||||
/* NOTE(review): __i686.get_pc_thunk.bx is not defined in this file; it is
   presumably supplied by another object in the same link -- confirm.  */
call __i686.get_pc_thunk.bx
|
||||
addl $_GLOBAL_OFFSET_TABLE_, %ebx
|
||||
/* A zero word at KIND_OFFSET means __cpu_features is not yet initialized.  */
cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
|
||||
jne 1f
|
||||
call __init_cpu_features
|
||||
/* Default choice.  */
1: leal __wcscpy_ia32@GOTOFF(%ebx), %eax
|
||||
testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
|
||||
jz 2f
|
||||
/* SSSE3 available: prefer the SSSE3 version.  */
leal __wcscpy_ssse3@GOTOFF(%ebx), %eax
|
||||
2: popl %ebx
|
||||
cfi_adjust_cfa_offset (-4)
|
||||
cfi_restore (ebx)
|
||||
ret
|
||||
END(wcscpy)
|
||||
#endif
|
5
sysdeps/i386/i686/multiarch/wcsrchr-c.c
Normal file
5
sysdeps/i386/i686/multiarch/wcsrchr-c.c
Normal file
@ -0,0 +1,5 @@
|
||||
/* Wrapper that compiles wcsmbs/wcsrchr.c with every occurrence of the
   identifier wcsrchr renamed to __wcsrchr_ia32 when building inside libc.  */
#ifndef NOT_IN_libc
|
||||
# define wcsrchr __wcsrchr_ia32
|
||||
#endif
|
||||
|
||||
#include "wcsmbs/wcsrchr.c"
|
355
sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
Normal file
355
sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
Normal file
@ -0,0 +1,355 @@
|
||||
/* wcsrchr with SSE2, without using bsf instructions.
|
||||
Copyright (C) 2011 Free Software Foundation, Inc.
|
||||
Contributed by Intel Corporation.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
||||
02111-1307 USA. */
|
||||
|
||||
#ifndef NOT_IN_libc
|
||||
# include <sysdep.h>
|
||||
# define CFI_PUSH(REG) \
|
||||
cfi_adjust_cfa_offset (4); \
|
||||
cfi_rel_offset (REG, 0)
|
||||
|
||||
# define CFI_POP(REG) \
|
||||
cfi_adjust_cfa_offset (-4); \
|
||||
cfi_restore (REG)
|
||||
|
||||
# define PUSH(REG) pushl REG; CFI_PUSH (REG)
|
||||
# define POP(REG) popl REG; CFI_POP (REG)
|
||||
|
||||
# define PARMS 8
|
||||
# define ENTRANCE PUSH (%edi);
|
||||
# define RETURN POP (%edi); ret; CFI_PUSH (%edi);
|
||||
# define STR1 PARMS
|
||||
# define STR2 STR1+4
|
||||
|
||||
atom_text_section
|
||||
ENTRY (__wcsrchr_sse2)
|
||||
|
||||
ENTRANCE
|
||||
mov STR1(%esp), %ecx
|
||||
movd STR2(%esp), %xmm1
|
||||
|
||||
mov %ecx, %edi
|
||||
punpckldq %xmm1, %xmm1
|
||||
pxor %xmm2, %xmm2
|
||||
punpckldq %xmm1, %xmm1
|
||||
|
||||
/* ECX has OFFSET. */
|
||||
and $63, %ecx
|
||||
cmp $48, %ecx
|
||||
ja L(crosscache)
|
||||
|
||||
/* unaligned string. */
|
||||
movdqu (%edi), %xmm0
|
||||
pcmpeqd %xmm0, %xmm2
|
||||
pcmpeqd %xmm1, %xmm0
|
||||
/* Find where NULL is. */
|
||||
pmovmskb %xmm2, %ecx
|
||||
/* Check if there is a match. */
|
||||
pmovmskb %xmm0, %eax
|
||||
add $16, %edi
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(unaligned_match1)
|
||||
|
||||
test %ecx, %ecx
|
||||
jnz L(return_null)
|
||||
|
||||
and $-16, %edi
|
||||
|
||||
PUSH (%esi)
|
||||
|
||||
xor %edx, %edx
|
||||
jmp L(loop)
|
||||
|
||||
CFI_POP (%esi)
|
||||
|
||||
.p2align 4
|
||||
L(unaligned_match1):
|
||||
test %ecx, %ecx
|
||||
jnz L(prolog_find_zero_1)
|
||||
|
||||
PUSH (%esi)
|
||||
|
||||
/* Save current match */
|
||||
mov %eax, %edx
|
||||
mov %edi, %esi
|
||||
and $-16, %edi
|
||||
jmp L(loop)
|
||||
|
||||
CFI_POP (%esi)
|
||||
|
||||
.p2align 4
|
||||
L(crosscache):
|
||||
/* Handle unaligned string. */
|
||||
and $15, %ecx
|
||||
and $-16, %edi
|
||||
pxor %xmm3, %xmm3
|
||||
movdqa (%edi), %xmm0
|
||||
pcmpeqd %xmm0, %xmm3
|
||||
pcmpeqd %xmm1, %xmm0
|
||||
/* Find where NULL is. */
|
||||
pmovmskb %xmm3, %edx
|
||||
/* Check if there is a match. */
|
||||
pmovmskb %xmm0, %eax
|
||||
/* Remove the leading bytes. */
|
||||
shr %cl, %edx
|
||||
shr %cl, %eax
|
||||
add $16, %edi
|
||||
|
||||
test %eax, %eax
|
||||
jnz L(unaligned_match)
|
||||
|
||||
test %edx, %edx
|
||||
jnz L(return_null)
|
||||
|
||||
PUSH (%esi)
|
||||
|
||||
xor %edx, %edx
|
||||
jmp L(loop)
|
||||
|
||||
CFI_POP (%esi)
|
||||
|
||||
.p2align 4
|
||||
L(unaligned_match):
|
||||
test %edx, %edx
|
||||
jnz L(prolog_find_zero)
|
||||
|
||||
PUSH (%esi)
|
||||
|
||||
mov %eax, %edx
|
||||
lea (%edi, %ecx), %esi
|
||||
|
||||
/* Loop start on aligned string. */
|
||||
.p2align 4
|
||||
L(loop):
|
||||
movdqa (%edi), %xmm0
|
||||
pcmpeqd %xmm0, %xmm2
|
||||
add $16, %edi
|
||||
pcmpeqd %xmm1, %xmm0
|
||||
pmovmskb %xmm2, %ecx
|
||||
pmovmskb %xmm0, %eax
|
||||
or %eax, %ecx
|
||||
jnz L(matches)
|
||||
|
||||
movdqa (%edi), %xmm3
|
||||
pcmpeqd %xmm3, %xmm2
|
||||
add $16, %edi
|
||||
pcmpeqd %xmm1, %xmm3
|
||||
pmovmskb %xmm2, %ecx
|
||||
pmovmskb %xmm3, %eax
|
||||
or %eax, %ecx
|
||||
jnz L(matches)
|
||||
|
||||
movdqa (%edi), %xmm4
|
||||
pcmpeqd %xmm4, %xmm2
|
||||
add $16, %edi
|
||||
pcmpeqd %xmm1, %xmm4
|
||||
pmovmskb %xmm2, %ecx
|
||||
pmovmskb %xmm4, %eax
|
||||
or %eax, %ecx
|
||||
jnz L(matches)
|
||||
|
||||
movdqa (%edi), %xmm5
|
||||
pcmpeqd %xmm5, %xmm2
|
||||
add $16, %edi
|
||||
pcmpeqd %xmm1, %xmm5
|
||||
pmovmskb %xmm2, %ecx
|
||||
pmovmskb %xmm5, %eax
|
||||
or %eax, %ecx
|
||||
jz L(loop)
|
||||
|
||||
.p2align 4
|
||||
L(matches):
|
||||
test %eax, %eax
|
||||
jnz L(match)
|
||||
L(return_value):
|
||||
test %edx, %edx
|
||||
jz L(return_null_1)
|
||||
mov %edx, %eax
|
||||
mov %esi, %edi
|
||||
|
||||
POP (%esi)
|
||||
|
||||
test %ah, %ah
|
||||
jnz L(match_third_or_fourth_wchar)
|
||||
test $15 << 4, %al
|
||||
jnz L(match_second_wchar)
|
||||
lea -16(%edi), %eax
|
||||
RETURN
|
||||
|
||||
CFI_PUSH (%esi)
|
||||
|
||||
.p2align 4
|
||||
L(return_null_1):
|
||||
POP (%esi)
|
||||
|
||||
xor %eax, %eax
|
||||
RETURN
|
||||
|
||||
CFI_PUSH (%esi)
|
||||
|
||||
.p2align 4
|
||||
L(match):
|
||||
pmovmskb %xmm2, %ecx
|
||||
test %ecx, %ecx
|
||||
jnz L(find_zero)
|
||||
/* save match info */
|
||||
mov %eax, %edx
|
||||
mov %edi, %esi
|
||||
jmp L(loop)
|
||||
|
||||
.p2align 4
|
||||
L(find_zero):
|
||||
test %cl, %cl
|
||||
jz L(find_zero_in_third_or_fourth_wchar)
|
||||
test $15, %cl
|
||||
jz L(find_zero_in_second_wchar)
|
||||
and $1, %eax
|
||||
jz L(return_value)
|
||||
|
||||
POP (%esi)
|
||||
|
||||
lea -16(%edi), %eax
|
||||
RETURN
|
||||
|
||||
CFI_PUSH (%esi)
|
||||
|
||||
.p2align 4
|
||||
L(find_zero_in_second_wchar):
|
||||
and $1 << 5 - 1, %eax
|
||||
jz L(return_value)
|
||||
|
||||
POP (%esi)
|
||||
|
||||
test $15 << 4, %al
|
||||
jnz L(match_second_wchar)
|
||||
lea -16(%edi), %eax
|
||||
RETURN
|
||||
|
||||
CFI_PUSH (%esi)
|
||||
|
||||
.p2align 4
|
||||
L(find_zero_in_third_or_fourth_wchar):
|
||||
test $15, %ch
|
||||
jz L(find_zero_in_fourth_wchar)
|
||||
and $1 << 9 - 1, %eax
|
||||
jz L(return_value)
|
||||
|
||||
POP (%esi)
|
||||
|
||||
test %ah, %ah
|
||||
jnz L(match_third_wchar)
|
||||
test $15 << 4, %al
|
||||
jnz L(match_second_wchar)
|
||||
lea -16(%edi), %eax
|
||||
RETURN
|
||||
|
||||
CFI_PUSH (%esi)
|
||||
|
||||
.p2align 4
|
||||
/* The terminator is the fourth wide char of the current block, so every
   match bit in %eax lies inside the string.  Pick the last match in the
   block (4 mask bits per wide char) and return its address; %edi points
   just past the block.  */
L(find_zero_in_fourth_wchar):
|
||||
|
||||
POP (%esi)
|
||||
|
||||
test %ah, %ah
|
||||
jnz L(match_third_or_fourth_wchar)
|
||||
test $15 << 4, %al
|
||||
jnz L(match_second_wchar)
|
||||
lea -16(%edi), %eax
|
||||
RETURN
|
||||
|
||||
/* Deliberately no CFI_PUSH (%esi) here.  Every label from this point to
   END (the match_* stubs, return_null and the prolog_find_zero* paths)
   is entered only after %esi has been popped or without it ever being
   pushed, so the unwind state must remain "only %edi saved".  Re-pushing
   %esi in the CFI would describe a frame 4 bytes larger than the real
   one for the rest of the function.  */
|
||||
|
||||
.p2align 4
|
||||
L(match_second_wchar):
|
||||
lea -12(%edi), %eax
|
||||
RETURN
|
||||
|
||||
.p2align 4
|
||||
L(match_third_or_fourth_wchar):
|
||||
test $15 << 4, %ah
|
||||
jnz L(match_fourth_wchar)
|
||||
lea -8(%edi), %eax
|
||||
RETURN
|
||||
|
||||
.p2align 4
|
||||
L(match_third_wchar):
|
||||
lea -8(%edi), %eax
|
||||
RETURN
|
||||
|
||||
.p2align 4
|
||||
L(match_fourth_wchar):
|
||||
lea -4(%edi), %eax
|
||||
RETURN
|
||||
|
||||
.p2align 4
|
||||
L(return_null):
|
||||
xor %eax, %eax
|
||||
RETURN
|
||||
|
||||
.p2align 4
|
||||
L(prolog_find_zero):
|
||||
add %ecx, %edi
|
||||
mov %edx, %ecx
|
||||
L(prolog_find_zero_1):
|
||||
test %cl, %cl
|
||||
jz L(prolog_find_zero_in_third_or_fourth_wchar)
|
||||
test $15, %cl
|
||||
jz L(prolog_find_zero_in_second_wchar)
|
||||
and $1, %eax
|
||||
jz L(return_null)
|
||||
|
||||
lea -16(%edi), %eax
|
||||
RETURN
|
||||
|
||||
.p2align 4
|
||||
L(prolog_find_zero_in_second_wchar):
|
||||
and $1 << 5 - 1, %eax
|
||||
jz L(return_null)
|
||||
|
||||
test $15 << 4, %al
|
||||
jnz L(match_second_wchar)
|
||||
lea -16(%edi), %eax
|
||||
RETURN
|
||||
|
||||
.p2align 4
|
||||
L(prolog_find_zero_in_third_or_fourth_wchar):
|
||||
test $15, %ch
|
||||
jz L(prolog_find_zero_in_fourth_wchar)
|
||||
and $1 << 9 - 1, %eax
|
||||
jz L(return_null)
|
||||
|
||||
test %ah, %ah
|
||||
jnz L(match_third_wchar)
|
||||
test $15 << 4, %al
|
||||
jnz L(match_second_wchar)
|
||||
lea -16(%edi), %eax
|
||||
RETURN
|
||||
|
||||
.p2align 4
|
||||
L(prolog_find_zero_in_fourth_wchar):
|
||||
test %ah, %ah
|
||||
jnz L(match_third_or_fourth_wchar)
|
||||
test $15 << 4, %al
|
||||
jnz L(match_second_wchar)
|
||||
lea -16(%edi), %eax
|
||||
RETURN
|
||||
|
||||
END (__wcsrchr_sse2)
|
||||
#endif
|
54
sysdeps/i386/i686/multiarch/wcsrchr.S
Normal file
54
sysdeps/i386/i686/multiarch/wcsrchr.S
Normal file
@ -0,0 +1,54 @@
|
||||
/* Multiple versions of wcsrchr
|
||||
Copyright (C) 2011 Free Software Foundation, Inc.
|
||||
Contributed by Intel Corporation.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
||||
02111-1307 USA. */
|
||||
|
||||
#include <sysdep.h>
|
||||
#include <init-arch.h>
|
||||
|
||||
#ifndef NOT_IN_libc
|
||||
/* PC thunk: loads the caller's return address (the address of the
   instruction following the call) into %ebx.  Emitted in its own
   linkonce section as a hidden global symbol.  */
.section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
|
||||
.globl __i686.get_pc_thunk.bx
|
||||
.hidden __i686.get_pc_thunk.bx
|
||||
.p2align 4
|
||||
.type __i686.get_pc_thunk.bx,@function
|
||||
__i686.get_pc_thunk.bx:
|
||||
/* On entry (%esp) holds the return address.  */
movl (%esp), %ebx
|
||||
ret
|
||||
|
||||
.text
|
||||
/* IFUNC resolver for wcsrchr (the symbol is typed
   @gnu_indirect_function).  Returns in %eax the address of the
   implementation to use: __wcsrchr_sse2 when the SSE2 bit is set in
   __cpu_features, otherwise __wcsrchr_ia32.  %ebx is preserved.  */
ENTRY(wcsrchr)
|
||||
.type wcsrchr, @gnu_indirect_function
|
||||
pushl %ebx
|
||||
cfi_adjust_cfa_offset (4)
|
||||
cfi_rel_offset (ebx, 0)
|
||||
/* %ebx = address of the GOT, used for the @GOTOFF references below.  */
call __i686.get_pc_thunk.bx
|
||||
addl $_GLOBAL_OFFSET_TABLE_, %ebx
|
||||
/* A zero "kind" field triggers a call to __init_cpu_features
   (presumably meaning the feature data is not yet initialized).  */
cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
|
||||
jne 1f
|
||||
call __init_cpu_features
|
||||
/* Default choice; overridden below if SSE2 is available.  */
1: leal __wcsrchr_ia32@GOTOFF(%ebx), %eax
|
||||
testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
|
||||
jz 2f
|
||||
leal __wcsrchr_sse2@GOTOFF(%ebx), %eax
|
||||
2: popl %ebx
|
||||
cfi_adjust_cfa_offset (-4);
|
||||
cfi_restore (ebx)
|
||||
ret
|
||||
END(wcsrchr)
|
||||
#endif
|
@ -16,7 +16,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
|
||||
strcat-sse2-unaligned strncat-sse2-unaligned \
|
||||
strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
|
||||
strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
|
||||
memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
|
||||
memcmp-ssse3
|
||||
ifeq (yes,$(config-cflags-sse4))
|
||||
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
|
||||
CFLAGS-varshift.c += -msse4
|
||||
@ -28,3 +28,7 @@ CFLAGS-strcasestr.c += -msse4
|
||||
CFLAGS-strcasestr-nonascii.c += -msse4
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),wcsmbs)
|
||||
sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
|
||||
endif
|
||||
|
5
sysdeps/x86_64/multiarch/wcscpy-c.c
Normal file
5
sysdeps/x86_64/multiarch/wcscpy-c.c
Normal file
@ -0,0 +1,5 @@
|
||||
/* Build the generic C implementation from wcsmbs/wcscpy.c under the
   name __wcscpy_sse2 when compiling libc itself.  When NOT_IN_libc is
   defined the rename is skipped and the function keeps the name
   wcscpy.  */
#ifndef NOT_IN_libc
|
||||
# define wcscpy __wcscpy_sse2
|
||||
#endif
|
||||
|
||||
#include "wcsmbs/wcscpy.c"
|
566
sysdeps/x86_64/multiarch/wcscpy-ssse3.S
Normal file
566
sysdeps/x86_64/multiarch/wcscpy-ssse3.S
Normal file
@ -0,0 +1,566 @@
|
||||
/* wcscpy with SSSE3
|
||||
Copyright (C) 2011 Free Software Foundation, Inc.
|
||||
Contributed by Intel Corporation.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
||||
02111-1307 USA. */
|
||||
|
||||
#ifndef NOT_IN_libc
|
||||
# include <sysdep.h>
|
||||
|
||||
.text
|
||||
ENTRY (__wcscpy_ssse3)
|
||||
mov %rsi, %rcx
|
||||
mov %rdi, %rdx
|
||||
|
||||
cmpl $0, (%rcx)
|
||||
jz L(Exit4)
|
||||
cmpl $0, 4(%rcx)
|
||||
jz L(Exit8)
|
||||
cmpl $0, 8(%rcx)
|
||||
jz L(Exit12)
|
||||
cmpl $0, 12(%rcx)
|
||||
jz L(Exit16)
|
||||
|
||||
lea 16(%rcx), %rsi
|
||||
and $-16, %rsi
|
||||
|
||||
pxor %xmm0, %xmm0
|
||||
mov (%rcx), %r9
|
||||
mov %r9, (%rdx)
|
||||
|
||||
pcmpeqd (%rsi), %xmm0
|
||||
mov 8(%rcx), %r9
|
||||
mov %r9, 8(%rdx)
|
||||
|
||||
pmovmskb %xmm0, %rax
|
||||
sub %rcx, %rsi
|
||||
|
||||
test %rax, %rax
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
mov %rdx, %rax
|
||||
lea 16(%rdx), %rdx
|
||||
and $-16, %rdx
|
||||
sub %rdx, %rax
|
||||
sub %rax, %rcx
|
||||
mov %rcx, %rax
|
||||
and $0xf, %rax
|
||||
mov $0, %rsi
|
||||
|
||||
/* case: rcx_offset == rdx_offset */
|
||||
|
||||
jz L(Align16Both)
|
||||
|
||||
cmp $4, %rax
|
||||
je L(Shl4)
|
||||
cmp $8, %rax
|
||||
je L(Shl8)
|
||||
jmp L(Shl12)
|
||||
|
||||
L(Align16Both):
|
||||
movaps (%rcx), %xmm1
|
||||
movaps 16(%rcx), %xmm2
|
||||
movaps %xmm1, (%rdx)
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
pmovmskb %xmm0, %rax
|
||||
lea 16(%rsi), %rsi
|
||||
|
||||
test %rax, %rax
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
movaps 16(%rcx, %rsi), %xmm3
|
||||
movaps %xmm2, (%rdx, %rsi)
|
||||
pcmpeqd %xmm3, %xmm0
|
||||
pmovmskb %xmm0, %rax
|
||||
lea 16(%rsi), %rsi
|
||||
|
||||
test %rax, %rax
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
movaps 16(%rcx, %rsi), %xmm4
|
||||
movaps %xmm3, (%rdx, %rsi)
|
||||
pcmpeqd %xmm4, %xmm0
|
||||
pmovmskb %xmm0, %rax
|
||||
lea 16(%rsi), %rsi
|
||||
|
||||
test %rax, %rax
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
movaps 16(%rcx, %rsi), %xmm1
|
||||
movaps %xmm4, (%rdx, %rsi)
|
||||
pcmpeqd %xmm1, %xmm0
|
||||
pmovmskb %xmm0, %rax
|
||||
lea 16(%rsi), %rsi
|
||||
|
||||
test %rax, %rax
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
movaps 16(%rcx, %rsi), %xmm2
|
||||
movaps %xmm1, (%rdx, %rsi)
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
pmovmskb %xmm0, %rax
|
||||
lea 16(%rsi), %rsi
|
||||
|
||||
test %rax, %rax
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
movaps 16(%rcx, %rsi), %xmm3
|
||||
movaps %xmm2, (%rdx, %rsi)
|
||||
pcmpeqd %xmm3, %xmm0
|
||||
pmovmskb %xmm0, %rax
|
||||
lea 16(%rsi), %rsi
|
||||
|
||||
test %rax, %rax
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
movaps %xmm3, (%rdx, %rsi)
|
||||
mov %rcx, %rax
|
||||
lea 16(%rcx, %rsi), %rcx
|
||||
and $-0x40, %rcx
|
||||
sub %rcx, %rax
|
||||
sub %rax, %rdx
|
||||
|
||||
mov $-0x40, %rsi
|
||||
|
||||
L(Aligned64Loop):
|
||||
movaps (%rcx), %xmm2
|
||||
movaps %xmm2, %xmm4
|
||||
movaps 16(%rcx), %xmm5
|
||||
movaps 32(%rcx), %xmm3
|
||||
movaps %xmm3, %xmm6
|
||||
movaps 48(%rcx), %xmm7
|
||||
pminub %xmm5, %xmm2
|
||||
pminub %xmm7, %xmm3
|
||||
pminub %xmm2, %xmm3
|
||||
pcmpeqd %xmm0, %xmm3
|
||||
pmovmskb %xmm3, %rax
|
||||
lea 64(%rdx), %rdx
|
||||
lea 64(%rcx), %rcx
|
||||
test %rax, %rax
|
||||
jnz L(Aligned64Leave)
|
||||
movaps %xmm4, -64(%rdx)
|
||||
movaps %xmm5, -48(%rdx)
|
||||
movaps %xmm6, -32(%rdx)
|
||||
movaps %xmm7, -16(%rdx)
|
||||
jmp L(Aligned64Loop)
|
||||
|
||||
L(Aligned64Leave):
|
||||
pcmpeqd %xmm4, %xmm0
|
||||
pmovmskb %xmm0, %rax
|
||||
test %rax, %rax
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
pcmpeqd %xmm5, %xmm0
|
||||
|
||||
pmovmskb %xmm0, %rax
|
||||
movaps %xmm4, -64(%rdx)
|
||||
test %rax, %rax
|
||||
lea 16(%rsi), %rsi
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
pcmpeqd %xmm6, %xmm0
|
||||
|
||||
pmovmskb %xmm0, %rax
|
||||
movaps %xmm5, -48(%rdx)
|
||||
test %rax, %rax
|
||||
lea 16(%rsi), %rsi
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
movaps %xmm6, -32(%rdx)
|
||||
pcmpeqd %xmm7, %xmm0
|
||||
|
||||
pmovmskb %xmm0, %rax
|
||||
lea 16(%rsi), %rsi
|
||||
test %rax, %rax
|
||||
jnz L(CopyFrom1To16Bytes)
|
||||
|
||||
mov $-0x40, %rsi
|
||||
movaps %xmm7, -16(%rdx)
|
||||
jmp L(Aligned64Loop)
|
||||
|
||||
.p2align 4
|
||||
L(Shl4):
|
||||
movaps -4(%rcx), %xmm1
|
||||
movaps 12(%rcx), %xmm2
|
||||
L(Shl4Start):
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
pmovmskb %xmm0, %rax
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %rax, %rax
|
||||
jnz L(Shl4LoopExit)
|
||||
|
||||
palignr $4, %xmm1, %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
movaps %xmm2, (%rdx)
|
||||
movaps 28(%rcx), %xmm2
|
||||
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
lea 16(%rdx), %rdx
|
||||
pmovmskb %xmm0, %rax
|
||||
lea 16(%rcx), %rcx
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %rax, %rax
|
||||
jnz L(Shl4LoopExit)
|
||||
|
||||
palignr $4, %xmm1, %xmm2
|
||||
movaps %xmm2, (%rdx)
|
||||
movaps 28(%rcx), %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
lea 16(%rdx), %rdx
|
||||
pmovmskb %xmm0, %rax
|
||||
lea 16(%rcx), %rcx
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %rax, %rax
|
||||
jnz L(Shl4LoopExit)
|
||||
|
||||
palignr $4, %xmm1, %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
movaps %xmm2, (%rdx)
|
||||
movaps 28(%rcx), %xmm2
|
||||
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
lea 16(%rdx), %rdx
|
||||
pmovmskb %xmm0, %rax
|
||||
lea 16(%rcx), %rcx
|
||||
|
||||
test %rax, %rax
|
||||
jnz L(Shl4LoopExit)
|
||||
|
||||
palignr $4, %xmm1, %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
movaps %xmm2, (%rdx)
|
||||
lea 28(%rcx), %rcx
|
||||
lea 16(%rdx), %rdx
|
||||
|
||||
mov %rcx, %rax
|
||||
and $-0x40, %rcx
|
||||
sub %rcx, %rax
|
||||
lea -12(%rcx), %rcx
|
||||
sub %rax, %rdx
|
||||
|
||||
movaps -4(%rcx), %xmm1
|
||||
|
||||
L(Shl4LoopStart):
|
||||
movaps 12(%rcx), %xmm2
|
||||
movaps 28(%rcx), %xmm3
|
||||
movaps %xmm3, %xmm6
|
||||
movaps 44(%rcx), %xmm4
|
||||
movaps %xmm4, %xmm7
|
||||
movaps 60(%rcx), %xmm5
|
||||
pminub %xmm2, %xmm6
|
||||
pminub %xmm5, %xmm7
|
||||
pminub %xmm6, %xmm7
|
||||
pcmpeqd %xmm0, %xmm7
|
||||
pmovmskb %xmm7, %rax
|
||||
movaps %xmm5, %xmm7
|
||||
palignr $4, %xmm4, %xmm5
|
||||
test %rax, %rax
|
||||
palignr $4, %xmm3, %xmm4
|
||||
jnz L(Shl4Start)
|
||||
|
||||
palignr $4, %xmm2, %xmm3
|
||||
lea 64(%rcx), %rcx
|
||||
palignr $4, %xmm1, %xmm2
|
||||
movaps %xmm7, %xmm1
|
||||
movaps %xmm5, 48(%rdx)
|
||||
movaps %xmm4, 32(%rdx)
|
||||
movaps %xmm3, 16(%rdx)
|
||||
movaps %xmm2, (%rdx)
|
||||
lea 64(%rdx), %rdx
|
||||
jmp L(Shl4LoopStart)
|
||||
|
||||
L(Shl4LoopExit):
|
||||
movaps (%rdx), %xmm6
|
||||
psrldq $12, %xmm6
|
||||
mov $12, %rsi
|
||||
palignr $4, %xmm1, %xmm6
|
||||
movaps %xmm6, (%rdx)
|
||||
jmp L(CopyFrom1To16Bytes)
|
||||
|
||||
.p2align 4
|
||||
L(Shl8):
|
||||
movaps -8(%rcx), %xmm1
|
||||
movaps 8(%rcx), %xmm2
|
||||
L(Shl8Start):
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
pmovmskb %xmm0, %rax
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %rax, %rax
|
||||
jnz L(Shl8LoopExit)
|
||||
|
||||
palignr $8, %xmm1, %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
movaps %xmm2, (%rdx)
|
||||
movaps 24(%rcx), %xmm2
|
||||
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
lea 16(%rdx), %rdx
|
||||
pmovmskb %xmm0, %rax
|
||||
lea 16(%rcx), %rcx
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %rax, %rax
|
||||
jnz L(Shl8LoopExit)
|
||||
|
||||
palignr $8, %xmm1, %xmm2
|
||||
movaps %xmm2, (%rdx)
|
||||
movaps 24(%rcx), %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
lea 16(%rdx), %rdx
|
||||
pmovmskb %xmm0, %rax
|
||||
lea 16(%rcx), %rcx
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %rax, %rax
|
||||
jnz L(Shl8LoopExit)
|
||||
|
||||
palignr $8, %xmm1, %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
movaps %xmm2, (%rdx)
|
||||
movaps 24(%rcx), %xmm2
|
||||
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
lea 16(%rdx), %rdx
|
||||
pmovmskb %xmm0, %rax
|
||||
lea 16(%rcx), %rcx
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %rax, %rax
|
||||
jnz L(Shl8LoopExit)
|
||||
|
||||
palignr $8, %xmm1, %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
movaps %xmm2, (%rdx)
|
||||
lea 24(%rcx), %rcx
|
||||
lea 16(%rdx), %rdx
|
||||
|
||||
mov %rcx, %rax
|
||||
and $-0x40, %rcx
|
||||
sub %rcx, %rax
|
||||
lea -8(%rcx), %rcx
|
||||
sub %rax, %rdx
|
||||
|
||||
movaps -8(%rcx), %xmm1
|
||||
|
||||
L(Shl8LoopStart):
|
||||
movaps 8(%rcx), %xmm2
|
||||
movaps 24(%rcx), %xmm3
|
||||
movaps %xmm3, %xmm6
|
||||
movaps 40(%rcx), %xmm4
|
||||
movaps %xmm4, %xmm7
|
||||
movaps 56(%rcx), %xmm5
|
||||
pminub %xmm2, %xmm6
|
||||
pminub %xmm5, %xmm7
|
||||
pminub %xmm6, %xmm7
|
||||
pcmpeqd %xmm0, %xmm7
|
||||
pmovmskb %xmm7, %rax
|
||||
movaps %xmm5, %xmm7
|
||||
palignr $8, %xmm4, %xmm5
|
||||
test %rax, %rax
|
||||
palignr $8, %xmm3, %xmm4
|
||||
jnz L(Shl8Start)
|
||||
|
||||
palignr $8, %xmm2, %xmm3
|
||||
lea 64(%rcx), %rcx
|
||||
palignr $8, %xmm1, %xmm2
|
||||
movaps %xmm7, %xmm1
|
||||
movaps %xmm5, 48(%rdx)
|
||||
movaps %xmm4, 32(%rdx)
|
||||
movaps %xmm3, 16(%rdx)
|
||||
movaps %xmm2, (%rdx)
|
||||
lea 64(%rdx), %rdx
|
||||
jmp L(Shl8LoopStart)
|
||||
|
||||
L(Shl8LoopExit):
|
||||
movaps (%rdx), %xmm6
|
||||
psrldq $8, %xmm6
|
||||
mov $8, %rsi
|
||||
palignr $8, %xmm1, %xmm6
|
||||
movaps %xmm6, (%rdx)
|
||||
jmp L(CopyFrom1To16Bytes)
|
||||
|
||||
.p2align 4
|
||||
L(Shl12):
|
||||
movaps -12(%rcx), %xmm1
|
||||
movaps 4(%rcx), %xmm2
|
||||
L(Shl12Start):
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
pmovmskb %xmm0, %rax
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %rax, %rax
|
||||
jnz L(Shl12LoopExit)
|
||||
|
||||
palignr $12, %xmm1, %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
movaps %xmm2, (%rdx)
|
||||
movaps 20(%rcx), %xmm2
|
||||
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
lea 16(%rdx), %rdx
|
||||
pmovmskb %xmm0, %rax
|
||||
lea 16(%rcx), %rcx
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %rax, %rax
|
||||
jnz L(Shl12LoopExit)
|
||||
|
||||
palignr $12, %xmm1, %xmm2
|
||||
movaps %xmm2, (%rdx)
|
||||
movaps 20(%rcx), %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
lea 16(%rdx), %rdx
|
||||
pmovmskb %xmm0, %rax
|
||||
lea 16(%rcx), %rcx
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %rax, %rax
|
||||
jnz L(Shl12LoopExit)
|
||||
|
||||
palignr $12, %xmm1, %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
movaps %xmm2, (%rdx)
|
||||
movaps 20(%rcx), %xmm2
|
||||
|
||||
pcmpeqd %xmm2, %xmm0
|
||||
lea 16(%rdx), %rdx
|
||||
pmovmskb %xmm0, %rax
|
||||
lea 16(%rcx), %rcx
|
||||
movaps %xmm2, %xmm3
|
||||
|
||||
test %rax, %rax
|
||||
jnz L(Shl12LoopExit)
|
||||
|
||||
palignr $12, %xmm1, %xmm2
|
||||
movaps %xmm3, %xmm1
|
||||
movaps %xmm2, (%rdx)
|
||||
lea 20(%rcx), %rcx
|
||||
lea 16(%rdx), %rdx
|
||||
|
||||
mov %rcx, %rax
|
||||
and $-0x40, %rcx
|
||||
sub %rcx, %rax
|
||||
lea -4(%rcx), %rcx
|
||||
sub %rax, %rdx
|
||||
|
||||
movaps -12(%rcx), %xmm1
|
||||
|
||||
L(Shl12LoopStart):
|
||||
movaps 4(%rcx), %xmm2
|
||||
movaps 20(%rcx), %xmm3
|
||||
movaps %xmm3, %xmm6
|
||||
movaps 36(%rcx), %xmm4
|
||||
movaps %xmm4, %xmm7
|
||||
movaps 52(%rcx), %xmm5
|
||||
pminub %xmm2, %xmm6
|
||||
pminub %xmm5, %xmm7
|
||||
pminub %xmm6, %xmm7
|
||||
pcmpeqd %xmm0, %xmm7
|
||||
pmovmskb %xmm7, %rax
|
||||
movaps %xmm5, %xmm7
|
||||
palignr $12, %xmm4, %xmm5
|
||||
test %rax, %rax
|
||||
palignr $12, %xmm3, %xmm4
|
||||
jnz L(Shl12Start)
|
||||
palignr $12, %xmm2, %xmm3
|
||||
lea 64(%rcx), %rcx
|
||||
palignr $12, %xmm1, %xmm2
|
||||
movaps %xmm7, %xmm1
|
||||
movaps %xmm5, 48(%rdx)
|
||||
movaps %xmm4, 32(%rdx)
|
||||
movaps %xmm3, 16(%rdx)
|
||||
movaps %xmm2, (%rdx)
|
||||
lea 64(%rdx), %rdx
|
||||
jmp L(Shl12LoopStart)
|
||||
|
||||
L(Shl12LoopExit):
|
||||
movaps (%rdx), %xmm6
|
||||
psrldq $4, %xmm6
|
||||
mov $4, %rsi
|
||||
palignr $12, %xmm1, %xmm6
|
||||
movaps %xmm6, (%rdx)
|
||||
|
||||
.p2align 4
|
||||
/* Common tail: copy the final 1..4 wide chars including the terminator.
   On entry %rsi is a byte offset that is added to both the source
   (%rcx) and destination (%rdx) pointers, and %eax is a pmovmskb mask
   of a dword compare against zero, i.e. 4 identical bits per wide
   char.  Every exit returns the original destination (%rdi) in %rax.  */
L(CopyFrom1To16Bytes):
|
||||
add %rsi, %rdx
|
||||
add %rsi, %rcx
|
||||
|
||||
/* Low mask byte clear: the terminator is the 3rd or 4th wide char.  */
test %al, %al
|
||||
jz L(ExitHigh)
|
||||
/* Bit 0 set: the terminator is the 1st wide char.  */
test $0x01, %al
|
||||
jnz L(Exit4)
|
||||
|
||||
/* Terminator is the 2nd wide char: copy 8 bytes.  */
mov (%rcx), %rax
|
||||
mov %rax, (%rdx)
|
||||
mov %rdi, %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(ExitHigh):
|
||||
/* Bit 8 set: the terminator is the 3rd wide char.  */
test $0x01, %ah
|
||||
jnz L(Exit12)
|
||||
|
||||
/* Terminator is the 4th wide char: copy 16 bytes as two 8-byte moves.  */
mov (%rcx), %rax
|
||||
mov %rax, (%rdx)
|
||||
mov 8(%rcx), %rax
|
||||
mov %rax, 8(%rdx)
|
||||
mov %rdi, %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Copy 4 bytes from (%rcx) to (%rdx) and return %rdi.  */
L(Exit4):
|
||||
movl (%rcx), %eax
|
||||
movl %eax, (%rdx)
|
||||
mov %rdi, %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Copy 8 bytes from (%rcx) to (%rdx) and return %rdi.  */
L(Exit8):
|
||||
mov (%rcx), %rax
|
||||
mov %rax, (%rdx)
|
||||
mov %rdi, %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Copy 12 bytes (8 + 4) from (%rcx) to (%rdx) and return %rdi.  */
L(Exit12):
|
||||
mov (%rcx), %rax
|
||||
mov %rax, (%rdx)
|
||||
mov 8(%rcx), %eax
|
||||
mov %eax, 8(%rdx)
|
||||
mov %rdi, %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Copy 16 bytes with unaligned loads/stores and return %rdi.  */
L(Exit16):
|
||||
movdqu (%rcx), %xmm0
|
||||
movdqu %xmm0, (%rdx)
|
||||
mov %rdi, %rax
|
||||
ret
|
||||
|
||||
END(__wcscpy_ssse3)
|
||||
#endif
|
||||
|
43
sysdeps/x86_64/multiarch/wcscpy.S
Normal file
43
sysdeps/x86_64/multiarch/wcscpy.S
Normal file
@ -0,0 +1,43 @@
|
||||
/* Multiple versions of wcscpy
|
||||
Copyright (C) 2011 Free Software Foundation, Inc.
|
||||
Contributed by Intel Corporation.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
||||
02111-1307 USA. */
|
||||
|
||||
#include <sysdep.h>
|
||||
#include <init-arch.h>
|
||||
|
||||
/* Define multiple versions only for the definition in libc. */
|
||||
#ifndef NOT_IN_libc
|
||||
|
||||
.text
|
||||
/* IFUNC resolver for wcscpy (the symbol is typed
   @gnu_indirect_function).  Returns in %rax the address of
   __wcscpy_ssse3 when the SSSE3 bit is set in __cpu_features, otherwise
   the address of __wcscpy_sse2.  */
ENTRY(wcscpy)
|
||||
.type wcscpy, @gnu_indirect_function
|
||||
/* A zero "kind" field triggers a call to __init_cpu_features
   (presumably meaning the feature data is not yet initialized).  */
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
||||
jne 1f
|
||||
call __init_cpu_features
|
||||
|
||||
1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
|
||||
jnz 2f
|
||||
/* No SSSE3: fall back to __wcscpy_sse2.  */
leaq __wcscpy_sse2(%rip), %rax
|
||||
ret
|
||||
|
||||
2: leaq __wcscpy_ssse3(%rip), %rax
|
||||
ret
|
||||
|
||||
END(wcscpy)
|
||||
#endif
|
155
sysdeps/x86_64/wcschr.S
Normal file
155
sysdeps/x86_64/wcschr.S
Normal file
@ -0,0 +1,155 @@
|
||||
/* wcschr with SSE2
|
||||
Copyright (C) 2011 Free Software Foundation, Inc.
|
||||
Contributed by Intel Corporation.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
||||
02111-1307 USA. */
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.text
|
||||
/* wcschr: %rdi = wide string, %rsi = wide character to look for.
   Returns in %rax the address of the first 4-byte element equal to the
   character, or 0 if the terminating null element is reached first.
   A match located at the terminator itself (searching for 0) counts as
   a match.  */
ENTRY (wcschr)
|
||||
|
||||
/* %xmm1 = search character replicated into all four dword lanes (the
   two punpckldq steps below); %xmm2 = zero, compared against the data
   to locate the terminator.  */
movd %rsi, %xmm1
|
||||
pxor %xmm2, %xmm2
|
||||
mov %rdi, %rcx
|
||||
punpckldq %xmm1, %xmm1
|
||||
punpckldq %xmm1, %xmm1
|
||||
|
||||
/* If the string starts more than 48 bytes into a 64-byte line, an
   unaligned 16-byte load would run past the end of that line; use the
   aligned-load path in that case.  */
and $63, %rcx
|
||||
cmp $48, %rcx
|
||||
ja L(cross_cache)
|
||||
|
||||
/* First 16 bytes via an unaligned load.  %rdx = null mask, %rax = match
   mask; pmovmskb gives 4 identical bits per dword lane.  */
movdqu (%rdi), %xmm0
|
||||
pcmpeqd %xmm0, %xmm2
|
||||
add $16, %rdi
|
||||
pcmpeqd %xmm1, %xmm0
|
||||
pmovmskb %xmm2, %rdx
|
||||
pmovmskb %xmm0, %rax
|
||||
or %rax, %rdx
|
||||
jnz L(matches)
|
||||
|
||||
/* Nothing found: round %rdi (already advanced by 16) down to a 16-byte
   boundary.  The aligned load below may re-examine bytes that were
   already checked above.  */
and $-16, %rdi
|
||||
|
||||
movdqa (%rdi), %xmm0
|
||||
pcmpeqd %xmm0, %xmm2
|
||||
add $16, %rdi
|
||||
pcmpeqd %xmm1, %xmm0
|
||||
pmovmskb %xmm2, %rdx
|
||||
pmovmskb %xmm0, %rax
|
||||
or %rax, %rdx
|
||||
jnz L(matches)
|
||||
|
||||
jmp L(loop)
|
||||
|
||||
L(cross_cache):
|
||||
/* %rcx = misalignment within 16 bytes.  Load the enclosing aligned
   block, then shift both masks right by that amount so that bit 0
   corresponds to the first byte of the string.  */
and $15, %rcx
|
||||
and $-16, %rdi
|
||||
movdqa (%rdi), %xmm0
|
||||
pcmpeqd %xmm0, %xmm2
|
||||
pcmpeqd %xmm1, %xmm0
|
||||
pmovmskb %xmm2, %rdx
|
||||
pmovmskb %xmm0, %rax
|
||||
|
||||
sar %cl, %rdx
|
||||
sar %cl, %rax
|
||||
test %rax, %rax
|
||||
je L(unaligned_no_match)
|
||||
|
||||
/* %rax = byte index of the first match.  If a null sits at a lower
   index, the match lies beyond the end of the string.  */
bsf %rax, %rax
|
||||
test %rdx, %rdx
|
||||
je L(unaligned_match)
|
||||
bsf %rdx, %rdx
|
||||
cmp %rdx, %rax
|
||||
ja L(return_null)
|
||||
|
||||
L(unaligned_match):
|
||||
/* Result = aligned base + misalignment + match index.  */
add %rdi, %rax
|
||||
add %rcx, %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(unaligned_no_match):
|
||||
test %rdx, %rdx
|
||||
jne L(return_null)
|
||||
/* Reset %xmm2 to zero: it may hold null hits from bytes in front of
   the string start, which the shift above removed from %rdx.  */
pxor %xmm2, %xmm2
|
||||
|
||||
add $16, %rdi
|
||||
|
||||
.p2align 4
|
||||
/* Loop start on aligned string. */
|
||||
/* Four 16-byte blocks per iteration.  Each "pcmpeqd %xmm0, %xmm2"
   relies on %xmm2 being all-zero beforehand; it stays zero as long as
   no null element has been seen.  */
L(loop):
|
||||
movdqa (%rdi), %xmm0
|
||||
pcmpeqd %xmm0, %xmm2
|
||||
add $16, %rdi
|
||||
pcmpeqd %xmm1, %xmm0
|
||||
pmovmskb %xmm2, %rdx
|
||||
pmovmskb %xmm0, %rax
|
||||
or %rax, %rdx
|
||||
jnz L(matches)
|
||||
|
||||
movdqa (%rdi), %xmm0
|
||||
pcmpeqd %xmm0, %xmm2
|
||||
add $16, %rdi
|
||||
pcmpeqd %xmm1, %xmm0
|
||||
pmovmskb %xmm2, %rdx
|
||||
pmovmskb %xmm0, %rax
|
||||
or %rax, %rdx
|
||||
jnz L(matches)
|
||||
|
||||
movdqa (%rdi), %xmm0
|
||||
pcmpeqd %xmm0, %xmm2
|
||||
add $16, %rdi
|
||||
pcmpeqd %xmm1, %xmm0
|
||||
pmovmskb %xmm2, %rdx
|
||||
pmovmskb %xmm0, %rax
|
||||
or %rax, %rdx
|
||||
jnz L(matches)
|
||||
|
||||
movdqa (%rdi), %xmm0
|
||||
pcmpeqd %xmm0, %xmm2
|
||||
add $16, %rdi
|
||||
pcmpeqd %xmm1, %xmm0
|
||||
pmovmskb %xmm2, %rdx
|
||||
pmovmskb %xmm0, %rax
|
||||
or %rax, %rdx
|
||||
jnz L(matches)
|
||||
jmp L(loop)
|
||||
|
||||
.p2align 4
|
||||
/* Here %rax = match mask of the block just examined, %xmm2 = its null
   compare result and %rdi = address just past that block.  %rdx was
   overwritten by the OR, so the null mask is rebuilt first.  */
L(matches):
|
||||
pmovmskb %xmm2, %rdx
|
||||
/* No match bits: the block ended the scan because of the terminator.  */
test %rax, %rax
|
||||
jz L(return_null)
|
||||
bsf %rax, %rax
|
||||
test %rdx, %rdx
|
||||
je L(match)
|
||||
/* Both present: fail only if the null comes strictly before the
   match.  */
bsf %rdx, %rcx
|
||||
cmp %rcx, %rax
|
||||
ja L(return_null)
|
||||
L(match):
|
||||
/* Result = start of the block (%rdi - 16) + match index.  */
sub $16, %rdi
|
||||
add %rdi, %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(return_null):
|
||||
xor %rax, %rax
|
||||
ret
|
||||
|
||||
END (wcschr)
|
||||
|
||||
libc_hidden_def(wcschr)
|
283
sysdeps/x86_64/wcsrchr.S
Normal file
283
sysdeps/x86_64/wcsrchr.S
Normal file
@ -0,0 +1,283 @@
|
||||
/* wcsrchr with SSE2
|
||||
Copyright (C) 2011 Free Software Foundation, Inc.
|
||||
Contributed by Intel Corporation.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
||||
02111-1307 USA. */
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
.text
|
||||
/* wcsrchr: find the last occurrence of a wide character in a wide string.
   In:  %rdi = wcs (string), %rsi = wc (character to find).
   Out: %rax = address of the last wchar equal to wc, or 0 if there is none.
   Register roles used throughout the routine:
     %xmm1     wc replicated into all four 32-bit lanes
     %xmm2     zero constant, reused as the terminator-compare result
     %r8/%rsi  byte mask and end address (block start + 16) of the most
	       recent block that contained a match; %r8 == 0 means none yet.
   Every pmovmskb mask carries 4 bits per wchar (one bit per byte).  */
ENTRY (wcsrchr)
|
||||
|
||||
movd %rsi, %xmm1	/* low lane of xmm1 = wc */
|
||||
mov %rdi, %rcx
|
||||
punpckldq %xmm1, %xmm1	/* two unpacks broadcast wc to all four lanes */
|
||||
pxor %xmm2, %xmm2	/* xmm2 = 0 */
|
||||
punpckldq %xmm1, %xmm1
|
||||
and $63, %rcx	/* rcx = offset of wcs within its 64-byte block */
|
||||
cmp $48, %rcx
|
||||
ja L(crosscache)	/* a 16-byte load at wcs would run past that block */
|
||||
|
||||
movdqu (%rdi), %xmm0	/* unaligned load of the first four wchars */
|
||||
pcmpeqd %xmm0, %xmm2	/* xmm2 = lanes that are zero (terminator) */
|
||||
pcmpeqd %xmm1, %xmm0	/* xmm0 = lanes equal to wc */
|
||||
pmovmskb %xmm2, %rcx	/* rcx = terminator byte mask */
|
||||
pmovmskb %xmm0, %rax	/* rax = match byte mask */
|
||||
add $16, %rdi	/* rdi = end of the block just examined */
|
||||
|
||||
test %rax, %rax
|
||||
jnz L(unaligned_match1)	/* at least one lane matched */
|
||||
|
||||
test %rcx, %rcx
|
||||
jnz L(return_null)	/* terminator seen and no match: not found */
|
||||
|
||||
and $-16, %rdi	/* round wcs + 16 down to a 16-byte boundary for the aligned loop */
|
||||
xor %r8, %r8	/* no match remembered yet */
|
||||
jmp L(loop)
|
||||
|
||||
.p2align 4
|
||||
/* The first block (unaligned load at wcs) contains a match.
   Here %rax = match mask, %rcx = terminator mask, %rdi = wcs + 16.  */
L(unaligned_match1):
|
||||
test %rcx, %rcx
|
||||
jnz L(prolog_find_zero_1)	/* terminator is in this block too: resolve there */
|
||||
|
||||
mov %rax, %r8	/* remember the match mask ... */
|
||||
mov %rdi, %rsi	/* ... and the end address of its block */
|
||||
and $-16, %rdi	/* resume on a 16-byte boundary (may overlap this block) */
|
||||
jmp L(loop)
|
||||
|
||||
.p2align 4
|
||||
/* Reached from the entry code when wcs is more than 48 bytes into its
   64-byte block.  Instead of an unaligned load, read the aligned 16-byte
   block that contains wcs and discard the mask bits of bytes before wcs.  */
L(crosscache):
|
||||
and $15, %rcx	/* cl = byte offset of wcs inside its aligned 16-byte block */
|
||||
and $-16, %rdi	/* rdi = start of that aligned block */
|
||||
pxor %xmm3, %xmm3	/* scratch zero; leaves xmm2 untouched (the loop relies on xmm2 being zero) */
|
||||
movdqa (%rdi), %xmm0
|
||||
pcmpeqd %xmm0, %xmm3	/* xmm3 = lanes that are zero */
|
||||
pcmpeqd %xmm1, %xmm0	/* xmm0 = lanes equal to wc */
|
||||
pmovmskb %xmm3, %rdx	/* rdx = terminator byte mask */
|
||||
pmovmskb %xmm0, %rax	/* rax = match byte mask */
|
||||
shr %cl, %rdx	/* drop bits of bytes before wcs; bit 0 now corresponds to wcs */
|
||||
shr %cl, %rax
|
||||
add $16, %rdi	/* rdi = next aligned block */
|
||||
|
||||
test %rax, %rax
|
||||
jnz L(unaligned_match)
|
||||
|
||||
test %rdx, %rdx
|
||||
jnz L(return_null)	/* terminator seen and no match: not found */
|
||||
|
||||
xor %r8, %r8	/* no match remembered yet */
|
||||
jmp L(loop)
|
||||
|
||||
.p2align 4
|
||||
/* The first block on the crosscache path contains a match.
   Here %rax/%rdx = shifted match/terminator masks, %rcx = offset of wcs in
   its aligned block, %rdi = next aligned block.  Without a terminator the
   match is remembered and execution falls through into L(loop).  */
L(unaligned_match):
|
||||
test %rdx, %rdx
|
||||
jnz L(prolog_find_zero)	/* terminator is in this block too: resolve there */
|
||||
|
||||
mov %rax, %r8	/* remember the shifted match mask */
|
||||
lea (%rdi, %rcx), %rsi	/* rsi = wcs + 16, so bit 0 of the saved mask maps to -16(%rsi) */
|
||||
|
||||
/* Loop start on aligned string. */
|
||||
.p2align 4
|
||||
/* Main loop, unrolled four times; each step scans one aligned 16-byte block.
   A step leaves for L(matches) as soon as its block holds wc or a
   terminator, with %rax = match mask, %rcx = match | terminator mask and
   %rdi already advanced past that block.
   xmm2 is both the zero operand and the terminator-compare result: a step
   only falls through when no lane was zero, which leaves xmm2 all-zero
   again for the next step.  */
L(loop):
|
||||
movdqa (%rdi), %xmm0	/* step 1: load four wchars */
|
||||
pcmpeqd %xmm0, %xmm2	/* xmm2 = lanes that are zero */
|
||||
add $16, %rdi	/* rdi = end of this block */
|
||||
pcmpeqd %xmm1, %xmm0	/* xmm0 = lanes equal to wc */
|
||||
pmovmskb %xmm2, %rcx	/* rcx = terminator mask */
|
||||
pmovmskb %xmm0, %rax	/* rax = match mask */
|
||||
or %rax, %rcx	/* anything of interest in this block? */
|
||||
jnz L(matches)
|
||||
|
||||
movdqa (%rdi), %xmm3	/* step 2: same sequence using xmm3 */
|
||||
pcmpeqd %xmm3, %xmm2
|
||||
add $16, %rdi
|
||||
pcmpeqd %xmm1, %xmm3
|
||||
pmovmskb %xmm2, %rcx
|
||||
pmovmskb %xmm3, %rax
|
||||
or %rax, %rcx
|
||||
jnz L(matches)
|
||||
|
||||
movdqa (%rdi), %xmm4	/* step 3: same sequence using xmm4 */
|
||||
pcmpeqd %xmm4, %xmm2
|
||||
add $16, %rdi
|
||||
pcmpeqd %xmm1, %xmm4
|
||||
pmovmskb %xmm2, %rcx
|
||||
pmovmskb %xmm4, %rax
|
||||
or %rax, %rcx
|
||||
jnz L(matches)
|
||||
|
||||
movdqa (%rdi), %xmm5	/* step 4: same sequence using xmm5 */
|
||||
pcmpeqd %xmm5, %xmm2
|
||||
add $16, %rdi
|
||||
pcmpeqd %xmm1, %xmm5
|
||||
pmovmskb %xmm2, %rcx
|
||||
pmovmskb %xmm5, %rax
|
||||
or %rax, %rcx
|
||||
jz L(loop)	/* nothing in four blocks: iterate; otherwise fall through to L(matches) */
|
||||
|
||||
.p2align 4
|
||||
/* A loop step found wc and/or a terminator in the block ending at %rdi.
   %rax = match mask of that block.  */
L(matches):
|
||||
test %rax, %rax
|
||||
jnz L(match)	/* the block contains a match */
|
||||
/* Terminator reached with no usable match in the final block: return the
   last remembered match (%r8 mask, %rsi block end), or null if none.  */
L(return_value):
|
||||
test %r8, %r8
|
||||
jz L(return_null)	/* nothing was ever remembered */
|
||||
mov %r8, %rax	/* restore the remembered mask ... */
|
||||
mov %rsi, %rdi	/* ... and the end address of its block */
|
||||
|
||||
test $15 << 4, %ah	/* bits 12-15: fourth wchar; highest lane first since the last match wins */
|
||||
jnz L(match_fourth_wchar)
|
||||
test %ah, %ah	/* bits 8-11 remain: third wchar */
|
||||
jnz L(match_third_wchar)
|
||||
test $15 << 4, %al	/* bits 4-7: second wchar */
|
||||
jnz L(match_second_wchar)
|
||||
lea -16(%rdi), %rax	/* otherwise the first wchar of the block */
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* The block just scanned (ending at %rdi) contains at least one match;
   %rax = its match mask.  */
L(match):
|
||||
pmovmskb %xmm2, %rcx	/* recompute the pure terminator mask (rcx had rax or-ed in) */
|
||||
test %rcx, %rcx
|
||||
jnz L(find_zero)	/* terminator in the same block: matches after it must be ignored */
|
||||
mov %rax, %r8	/* remember this block as the latest match ... */
|
||||
mov %rdi, %rsi	/* ... together with its end address */
|
||||
jmp L(loop)	/* xmm2 is all-zero here, as the loop requires */
|
||||
|
||||
.p2align 4
|
||||
/* The block ending at %rdi holds both a match and a terminator.
   %rcx = terminator mask, %rax = match mask.  Find the first zero wchar,
   keep only matches at or before it and return the last of those; if none
   remain, fall back to the remembered match via L(return_value).  */
L(find_zero):
|
||||
test $15, %cl	/* bits 0-3: zero in the first wchar */
|
||||
jnz L(find_zero_in_first_wchar)
|
||||
test %cl, %cl	/* bits 4-7: zero in the second wchar */
|
||||
jnz L(find_zero_in_second_wchar)
|
||||
test $15, %ch	/* bits 8-11: zero in the third wchar */
|
||||
jnz L(find_zero_in_third_wchar)
|
||||
|
||||
/* Zero is in the fourth wchar.  GNU as gives << higher precedence than -,
   so the mask below is (1 << 13) - 1 = 0x1fff: bits 0-12.  Bit 12 is kept
   so that a match on the terminator itself (wc == 0) still counts.  */
and $1 << 13 - 1, %rax
|
||||
jz L(return_value)	/* no match at or before the terminator in this block */
|
||||
|
||||
test $15 << 4, %ah	/* bit 12 survived: fourth wchar */
|
||||
jnz L(match_fourth_wchar)
|
||||
test %ah, %ah	/* bits 8-11: third wchar */
|
||||
jnz L(match_third_wchar)
|
||||
test $15 << 4, %al	/* bits 4-7: second wchar */
|
||||
jnz L(match_second_wchar)
|
||||
lea -16(%rdi), %rax	/* otherwise the first wchar of the block */
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Terminator is the first wchar of the block ending at %rdi: only a match
   on that same wchar (bit 0 of %rax, which implies wc == 0) counts.  */
L(find_zero_in_first_wchar):
|
||||
test $1, %rax
|
||||
jz L(return_value)	/* no match here: use the remembered one, if any */
|
||||
lea -16(%rdi), %rax	/* return the first wchar of the block */
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Terminator is the second wchar of the block ending at %rdi.  Under GNU as
   precedence the mask is (1 << 5) - 1 = 0x1f: the first wchar plus the
   first byte of the second (a match on the terminator itself).  */
L(find_zero_in_second_wchar):
|
||||
and $1 << 5 - 1, %rax
|
||||
jz L(return_value)	/* no match at or before the terminator */
|
||||
|
||||
test $15 << 4, %al	/* bit 4 survived: second wchar */
|
||||
jnz L(match_second_wchar)
|
||||
lea -16(%rdi), %rax	/* otherwise the first wchar of the block */
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Terminator is the third wchar of the block ending at %rdi.  Under GNU as
   precedence the mask is (1 << 9) - 1 = 0x1ff: the first two wchars plus
   the first byte of the third (a match on the terminator itself).  */
L(find_zero_in_third_wchar):
|
||||
and $1 << 9 - 1, %rax
|
||||
jz L(return_value)	/* no match at or before the terminator */
|
||||
|
||||
test %ah, %ah	/* bit 8 survived: third wchar */
|
||||
jnz L(match_third_wchar)
|
||||
test $15 << 4, %al	/* bits 4-7: second wchar */
|
||||
jnz L(match_second_wchar)
|
||||
lea -16(%rdi), %rax	/* otherwise the first wchar of the block */
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* The very first block of the string holds both a match and a terminator.
   Entry L(prolog_find_zero) (crosscache path): %rax/%rdx = shifted
   match/terminator masks, %rcx = offset of wcs in its aligned block,
   %rdi = aligned block + 16.
   Entry L(prolog_find_zero_1) (unaligned fast path): %rax/%rcx =
   match/terminator masks, %rdi = wcs + 16.
   No earlier block exists, so when no match survives the result is null.  */
L(prolog_find_zero):
|
||||
add %rcx, %rdi	/* rdi = wcs + 16, consistent with the shifted masks */
|
||||
mov %rdx, %rcx	/* terminator mask into rcx for the shared decode below */
|
||||
L(prolog_find_zero_1):
|
||||
test $15, %cl	/* bits 0-3: zero in the first wchar */
|
||||
jnz L(prolog_find_zero_in_first_wchar)
|
||||
test %cl, %cl	/* bits 4-7: zero in the second wchar */
|
||||
jnz L(prolog_find_zero_in_second_wchar)
|
||||
test $15, %ch	/* bits 8-11: zero in the third wchar */
|
||||
jnz L(prolog_find_zero_in_third_wchar)
|
||||
|
||||
/* Zero is in the fourth wchar.  GNU as gives << higher precedence than -,
   so the mask below is (1 << 13) - 1 = 0x1fff: bits 0-12.  Bit 12 is kept
   so that a match on the terminator itself (wc == 0) still counts.  */
and $1 << 13 - 1, %rax
|
||||
jz L(return_null)	/* no match at or before the terminator */
|
||||
|
||||
test $15 << 4, %ah	/* bit 12 survived: fourth wchar */
|
||||
jnz L(match_fourth_wchar)
|
||||
test %ah, %ah	/* bits 8-11: third wchar */
|
||||
jnz L(match_third_wchar)
|
||||
test $15 << 4, %al	/* bits 4-7: second wchar */
|
||||
jnz L(match_second_wchar)
|
||||
lea -16(%rdi), %rax	/* otherwise the first wchar */
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* First block, terminator in its first wchar: only a match on that same
   wchar (bit 0 of %rax, which implies wc == 0) counts; otherwise null.  */
L(prolog_find_zero_in_first_wchar):
|
||||
test $1, %rax
|
||||
jz L(return_null)
|
||||
lea -16(%rdi), %rax	/* return the first wchar */
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* First block, terminator in its second wchar.  Under GNU as precedence the
   mask is (1 << 5) - 1 = 0x1f: the first wchar plus the first byte of the
   second (a match on the terminator itself).  Null if nothing survives.  */
L(prolog_find_zero_in_second_wchar):
|
||||
and $1 << 5 - 1, %rax
|
||||
jz L(return_null)
|
||||
|
||||
test $15 << 4, %al	/* bit 4 survived: second wchar */
|
||||
jnz L(match_second_wchar)
|
||||
lea -16(%rdi), %rax	/* otherwise the first wchar */
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* First block, terminator in its third wchar.  Under GNU as precedence the
   mask is (1 << 9) - 1 = 0x1ff: the first two wchars plus the first byte
   of the third (a match on the terminator itself).  Null if nothing
   survives.  */
L(prolog_find_zero_in_third_wchar):
|
||||
and $1 << 9 - 1, %rax
|
||||
jz L(return_null)
|
||||
|
||||
test %ah, %ah	/* bit 8 survived: third wchar */
|
||||
jnz L(match_third_wchar)
|
||||
test $15 << 4, %al	/* bits 4-7: second wchar */
|
||||
jnz L(match_second_wchar)
|
||||
lea -16(%rdi), %rax	/* otherwise the first wchar */
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Return the address of the second wchar of the 16-byte block ending at %rdi.  */
L(match_second_wchar):
|
||||
lea -12(%rdi), %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Return the address of the third wchar of the 16-byte block ending at %rdi.  */
L(match_third_wchar):
|
||||
lea -8(%rdi), %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Return the address of the fourth wchar of the 16-byte block ending at %rdi.  */
L(match_fourth_wchar):
|
||||
lea -4(%rdi), %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* wc does not occur in the string: return a null pointer.  */
L(return_null):
|
||||
xor %rax, %rax	/* rax = 0 */
|
||||
ret
|
||||
|
||||
END (wcsrchr)
|
@ -18,8 +18,11 @@
|
||||
|
||||
#include <wchar.h>
|
||||
|
||||
|
||||
/* Find the first occurrence of WC in WCS. */
|
||||
#ifdef WCSCHR
|
||||
# define wcschr WCSCHR
|
||||
#endif
|
||||
|
||||
wchar_t *
|
||||
wcschr (wcs, wc)
|
||||
register const wchar_t *wcs;
|
||||
|
Loading…
Reference in New Issue
Block a user