Optimized memcmp and wmemcmp for x86-64 and x86-32

Liubov Dmitrieva 2011-10-15 11:10:08 -04:00 committed by Ulrich Drepper
parent 556a200797
commit be13f7bff6
19 changed files with 3069 additions and 335 deletions

ChangeLog

@ -1,3 +1,32 @@
2011-09-27  Liubov Dmitrieva  <liubov.dmitrieva@gmail.com>

	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
	memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c.
	* sysdeps/x86_64/multiarch/memcmp-ssse3.S: New file.
	* sysdeps/x86_64/multiarch/memcmp.S: Update.  Add __memcmp_ssse3.
	* sysdeps/x86_64/multiarch/memcmp-sse4.S: Update.
	(USE_AS_WMEMCMP): New macro.
	Fixing indents.
	* sysdeps/x86_64/multiarch/wmemcmp.S: New file.
	* sysdeps/x86_64/multiarch/wmemcmp-ssse3.S: New file.
	* sysdeps/x86_64/multiarch/wmemcmp-sse4.S: New file.
	* sysdeps/x86_64/multiarch/wmemcmp-c.c: New file.
	* sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add
	wmemcmp-ssse3 wmemcmp-sse4 wmemcmp-c.
	* sysdeps/i386/i686/multiarch/wmemcmp.S: New file.
	* sysdeps/i386/i686/multiarch/wmemcmp-c.c: New file.
	* sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S: New file.
	* sysdeps/i386/i686/multiarch/wmemcmp-sse4.S: New file.
	* sysdeps/i386/i686/multiarch/memcmp-sse4.S: Update.
	(USE_AS_WMEMCMP): New macro.
	* sysdeps/i386/i686/multiarch/memcmp-ssse3.S: Likewise.
	* string/test-memcmp.c: Update.
	Fix simple_wmemcmp.
	Add new tests.
	* wcsmbs/wmemcmp.c: Update.
	(WMEMCMP): New macro.
	Fix overflow bug.

2011-10-12  Andreas Jaeger  <aj@suse.de>

	[BZ #13268]

NEWS

@@ -33,7 +33,7 @@ Version 2.15
 * Optimized strchr and strrchr for SSE on x86-32.
   Contributed by Liubov Dmitrieva.
 
-* Optimized memchr, memrchr, rawmemchr for x86-64 and x86-32.
+* Optimized memchr, memrchr, rawmemchr, memcmp, wmemcmp for x86-64 and x86-32.
   Contributed by Liubov Dmitrieva.
 
 * New interfaces: scandirat, scandirat64

string/test-memcmp.c

@@ -29,9 +29,21 @@
 # define MEMCPY wmemcpy
 # define SIMPLE_MEMCMP simple_wmemcmp
 # define CHAR wchar_t
-# define MAX_CHAR 256000
-# define UCHAR uint32_t
+# define UCHAR wchar_t
 # define CHARBYTES 4
+# define CHAR__MIN WCHAR_MIN
+# define CHAR__MAX WCHAR_MAX
+int
+simple_wmemcmp (const wchar_t *s1, const wchar_t *s2, size_t n)
+{
+  int ret = 0;
+  /* Warning!
+     wmemcmp has to use SIGNED comparison for elements.
+     memcmp has to use UNSIGNED comparison for elements.
+  */
+  while (n-- && (ret = *s1 < *s2 ? -1 : *s1 == *s2 ? 0 : 1) == 0) {s1++; s2++;}
+  return ret;
+}
 #else
 # define MEMCMP memcmp
 # define MEMCPY memcpy
@@ -40,18 +52,20 @@
 # define MAX_CHAR 255
 # define UCHAR unsigned char
 # define CHARBYTES 1
-#endif
+# define CHAR__MIN CHAR_MIN
+# define CHAR__MAX CHAR_MAX
 
-typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
-
 int
-SIMPLE_MEMCMP (const CHAR *s1, const CHAR *s2, size_t n)
+simple_memcmp (const char *s1, const char *s2, size_t n)
 {
   int ret = 0;
 
-  while (n-- && (ret = *(UCHAR *) s1++ - *(UCHAR *) s2++) == 0);
+  while (n-- && (ret = *(unsigned char *) s1++ - *(unsigned char *) s2++) == 0);
   return ret;
 }
+#endif
+
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
 
 IMPL (SIMPLE_MEMCMP, 0)
 IMPL (MEMCMP, 1)
@@ -121,7 +135,7 @@ do_test (size_t align1, size_t align2, size_t len, int exp_result)
   s2 = (CHAR *) (buf2 + align2);
 
   for (i = 0; i < len; i++)
-    s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % MAX_CHAR;
+    s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % CHAR__MAX;
 
   s1[len] = align1;
   s2[len] = align2;
@@ -412,8 +426,8 @@ check1 (void)
   s2[99] = 1;
   s1[100] = 116;
   s2[100] = 116;
-  s1[101] = -13;
-  s2[101] = -13;
+  s1[101] = CHAR__MIN;
+  s2[101] = CHAR__MAX;
   s1[102] = -109;
   s2[102] = -109;
   s1[103] = 1;
@@ -434,8 +448,8 @@ check1 (void)
   s2[110] = -109;
   s1[111] = 1;
   s2[111] = 1;
-  s1[112] = 20;
-  s2[112] = 20;
+  s1[112] = CHAR__MAX;
+  s2[112] = CHAR__MIN;
   s1[113] = -13;
   s2[113] = -13;
   s1[114] = -109;
@@ -444,9 +458,12 @@ check1 (void)
   s2[115] = 1;
 
   n = 116;
-  exp_result = SIMPLE_MEMCMP (s1, s2, n);
-  FOR_EACH_IMPL (impl, 0)
-    check_result (impl, s1, s2, n, exp_result);
+  for (size_t i = 0; i < n; i++)
+    {
+      exp_result = SIMPLE_MEMCMP (s1 + i, s2 + i, n - i);
+      FOR_EACH_IMPL (impl, 0)
+        check_result (impl, s1 + i, s2 + i, n - i, exp_result);
+    }
 }
 
 int
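The warning inside simple_wmemcmp above is the core of this change: memcmp orders bytes as unsigned values, while wmemcmp orders wchar_t elements as signed values, so the two reference implementations cannot share a subtraction-based compare. A minimal standalone illustration of the difference (editor's sketch, not part of the commit; it assumes wchar_t is a signed 32-bit type, as on x86 GNU/Linux):

#include <stdio.h>
#include <string.h>
#include <wchar.h>

int
main (void)
{
  unsigned char b1[1] = { 0x80 };       /* compares as 128 for memcmp */
  unsigned char b2[1] = { 0x01 };
  wchar_t w1[1] = { (wchar_t) -128 };   /* a negative wide character */
  wchar_t w2[1] = { 1 };

  /* memcmp uses unsigned bytes: 128 > 1, so the result is positive.  */
  printf ("memcmp positive:  %d\n", memcmp (b1, b2, 1) > 0);
  /* wmemcmp uses signed elements: -128 < 1, so the result is negative.  */
  printf ("wmemcmp negative: %d\n", wmemcmp (w1, w2, 1) < 0);
  return 0;
}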

sysdeps/i386/i686/multiarch/Makefile

@@ -17,7 +17,8 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
 		   strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \
 		   wcscmp-sse2 wcscmp-c memchr-sse2 memchr-sse2-bsf \
 		   memrchr-sse2 memrchr-sse2-bsf memrchr-c \
-		   rawmemchr-sse2 rawmemchr-sse2-bsf
+		   rawmemchr-sse2 rawmemchr-sse2-bsf \
+		   wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
 CFLAGS-varshift.c += -msse4

sysdeps/i386/i686/multiarch/memcmp-sse4.S

@ -1,5 +1,5 @@
/* memcmp with SSE4.2 /* memcmp with SSE4.2, wmemcmp with SSE4.2
Copyright (C) 2010 Free Software Foundation, Inc. Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation. Contributed by Intel Corporation.
This file is part of the GNU C Library. This file is part of the GNU C Library.
@ -20,84 +20,97 @@
#ifndef NOT_IN_libc #ifndef NOT_IN_libc
#include <sysdep.h> # include <sysdep.h>
#include "asm-syntax.h"
#ifndef MEMCMP # ifndef MEMCMP
# define MEMCMP __memcmp_sse4_2 # define MEMCMP __memcmp_sse4_2
#endif # endif
#define CFI_PUSH(REG) \ # define CFI_PUSH(REG) \
cfi_adjust_cfa_offset (4); \ cfi_adjust_cfa_offset (4); \
cfi_rel_offset (REG, 0) cfi_rel_offset (REG, 0)
#define CFI_POP(REG) \ # define CFI_POP(REG) \
cfi_adjust_cfa_offset (-4); \ cfi_adjust_cfa_offset (-4); \
cfi_restore (REG) cfi_restore (REG)
#define PUSH(REG) pushl REG; CFI_PUSH (REG) # define PUSH(REG) pushl REG; CFI_PUSH (REG)
#define POP(REG) popl REG; CFI_POP (REG) # define POP(REG) popl REG; CFI_POP (REG)
#define PARMS 4 # define PARMS 4
#define BLK1 PARMS # define BLK1 PARMS
#define BLK2 BLK1+4 # define BLK2 BLK1 + 4
#define LEN BLK2+4 # define LEN BLK2 + 4
#define RETURN POP (%ebx); ret; CFI_PUSH (%ebx) # define RETURN POP (%ebx); ret; CFI_PUSH (%ebx)
#ifdef SHARED # ifdef SHARED
# define JMPTBL(I, B) I - B # define JMPTBL(I, B) I - B
/* Load an entry in a jump table into EBX and branch to it. TABLE is a /* Load an entry in a jump table into EBX and branch to it. TABLE is a
jump table with relative offsets. INDEX is a register contains the jump table with relative offsets. INDEX is a register contains the
index into the jump table. SCALE is the scale of INDEX. */ index into the jump table. SCALE is the scale of INDEX. */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
/* We first load PC into EBX. */ \
call __i686.get_pc_thunk.bx; \
/* Get the address of the jump table. */ \
addl $(TABLE - .), %ebx; \
/* Get the entry and convert the relative offset to the \
absolute address. */ \
addl (%ebx,INDEX,SCALE), %ebx; \
/* We loaded the jump table and adjusted EDX/ESI. Go. */ \
jmp *%ebx
.section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
.globl __i686.get_pc_thunk.bx /* We first load PC into EBX. */ \
.hidden __i686.get_pc_thunk.bx call __i686.get_pc_thunk.bx; \
ALIGN (4) /* Get the address of the jump table. */ \
.type __i686.get_pc_thunk.bx,@function addl $(TABLE - .), %ebx; \
__i686.get_pc_thunk.bx: /* Get the entry and convert the relative offset to the \
movl (%esp), %ebx absolute address. */ \
ret addl (%ebx,INDEX,SCALE), %ebx; \
#else /* We loaded the jump table and adjusted EDX/ESI. Go. */ \
# define JMPTBL(I, B) I jmp *%ebx
# else
# define JMPTBL(I, B) I
/* Load an entry in a jump table into EBX and branch to it. TABLE is a /* Load an entry in a jump table into EBX and branch to it. TABLE is a
jump table with relative offsets. INDEX is a register contains the jump table with relative offsets. INDEX is a register contains the
index into the jump table. SCALE is the scale of INDEX. */ index into the jump table. SCALE is the scale of INDEX. */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
jmp *TABLE(,INDEX,SCALE) jmp *TABLE(,INDEX,SCALE)
#endif # endif
/* Warning!
wmemcmp has to use SIGNED comparison for elements.
memcmp has to use UNSIGNED comparison for elements.
*/
.section .text.sse4.2,"ax",@progbits .section .text.sse4.2,"ax",@progbits
ENTRY (MEMCMP) ENTRY (MEMCMP)
movl BLK1(%esp), %eax movl BLK1(%esp), %eax
movl BLK2(%esp), %edx movl BLK2(%esp), %edx
movl LEN(%esp), %ecx movl LEN(%esp), %ecx
# ifdef USE_AS_WMEMCMP
shl $2, %ecx
test %ecx, %ecx
jz L(return0)
# else
cmp $1, %ecx cmp $1, %ecx
jbe L(less1bytes) jbe L(less1bytes)
# endif
pxor %xmm0, %xmm0 pxor %xmm0, %xmm0
cmp $64, %ecx cmp $64, %ecx
ja L(64bytesormore) ja L(64bytesormore)
cmp $8, %ecx cmp $8, %ecx
PUSH (%ebx)
# ifndef USE_AS_WMEMCMP
PUSH (%ebx)
jb L(less8bytes) jb L(less8bytes)
# else
jb L(less8bytes)
PUSH (%ebx)
# endif
add %ecx, %edx add %ecx, %edx
add %ecx, %eax add %ecx, %eax
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
ALIGN (4) # ifndef USE_AS_WMEMCMP
.p2align 4
L(less8bytes): L(less8bytes):
mov (%eax), %bl mov (%eax), %bl
cmpb (%edx), %bl cmpb (%edx), %bl
@ -141,22 +154,49 @@ L(less8bytes):
mov 6(%eax), %bl mov 6(%eax), %bl
cmpb 6(%edx), %bl cmpb 6(%edx), %bl
je L(0bytes) je L(0bytes)
L(nonzero): L(nonzero):
POP (%ebx) POP (%ebx)
mov $1, %eax mov $1, %eax
ja L(above) ja L(above)
neg %eax neg %eax
L(above): L(above):
ret ret
CFI_PUSH (%ebx) CFI_PUSH (%ebx)
# endif
ALIGN (4) .p2align 4
L(0bytes): L(0bytes):
POP (%ebx) POP (%ebx)
xor %eax, %eax xor %eax, %eax
ret ret
ALIGN (4) # ifdef USE_AS_WMEMCMP
/* for wmemcmp, case N == 1 */
.p2align 4
L(less8bytes):
mov (%eax), %ecx
cmp (%edx), %ecx
je L(return0)
mov $1, %eax
jg L(find_diff_bigger)
neg %eax
ret
.p2align 4
L(find_diff_bigger):
ret
.p2align 4
L(return0):
xor %eax, %eax
ret
# endif
# ifndef USE_AS_WMEMCMP
.p2align 4
L(less1bytes): L(less1bytes):
jb L(0bytesend) jb L(0bytesend)
movzbl (%eax), %eax movzbl (%eax), %eax
@ -164,14 +204,14 @@ L(less1bytes):
sub %edx, %eax sub %edx, %eax
ret ret
ALIGN (4) .p2align 4
L(0bytesend): L(0bytesend):
xor %eax, %eax xor %eax, %eax
ret ret
# endif
ALIGN (4) .p2align 4
L(64bytesormore): L(64bytesormore):
PUSH (%ebx) PUSH (%ebx)
mov %ecx, %ebx mov %ecx, %ebx
mov $64, %ecx mov $64, %ecx
sub $64, %ebx sub $64, %ebx
@ -208,7 +248,14 @@ L(64bytesormore_loop):
add %ecx, %eax add %ecx, %eax
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
ALIGN (4) # ifdef USE_AS_WMEMCMP
/* Label needed only for filling table_64bytes.  */
L(unreal_case):
/* no code here */
# endif
.p2align 4
L(find_16diff): L(find_16diff):
sub $16, %ecx sub $16, %ecx
L(find_32diff): L(find_32diff):
@ -218,9 +265,9 @@ L(find_48diff):
L(find_64diff): L(find_64diff):
add %ecx, %edx add %ecx, %edx
add %ecx, %eax add %ecx, %eax
jmp L(16bytes)
ALIGN (4) # ifndef USE_AS_WMEMCMP
.p2align 4
L(16bytes): L(16bytes):
mov -16(%eax), %ecx mov -16(%eax), %ecx
mov -16(%edx), %ebx mov -16(%edx), %ebx
@ -243,8 +290,30 @@ L(4bytes):
mov $0, %eax mov $0, %eax
jne L(find_diff) jne L(find_diff)
RETURN RETURN
# else
.p2align 4
L(16bytes):
mov -16(%eax), %ecx
cmp -16(%edx), %ecx
jne L(find_diff)
L(12bytes):
mov -12(%eax), %ecx
cmp -12(%edx), %ecx
jne L(find_diff)
L(8bytes):
mov -8(%eax), %ecx
cmp -8(%edx), %ecx
jne L(find_diff)
L(4bytes):
mov -4(%eax), %ecx
cmp -4(%edx), %ecx
mov $0, %eax
jne L(find_diff)
RETURN
# endif
ALIGN (4) # ifndef USE_AS_WMEMCMP
.p2align 4
L(49bytes): L(49bytes):
movdqu -49(%eax), %xmm1 movdqu -49(%eax), %xmm1
movdqu -49(%edx), %xmm2 movdqu -49(%edx), %xmm2
@ -285,7 +354,7 @@ L(5bytes):
jne L(end) jne L(end)
RETURN RETURN
ALIGN (4) .p2align 4
L(50bytes): L(50bytes):
mov $-50, %ebx mov $-50, %ebx
movdqu -50(%eax), %xmm1 movdqu -50(%eax), %xmm1
@ -330,7 +399,7 @@ L(2bytes):
jne L(end) jne L(end)
RETURN RETURN
ALIGN (4) .p2align 4
L(51bytes): L(51bytes):
mov $-51, %ebx mov $-51, %ebx
movdqu -51(%eax), %xmm1 movdqu -51(%eax), %xmm1
@ -378,8 +447,8 @@ L(1bytes):
mov $0, %eax mov $0, %eax
jne L(end) jne L(end)
RETURN RETURN
# endif
ALIGN (4) .p2align 4
L(52bytes): L(52bytes):
movdqu -52(%eax), %xmm1 movdqu -52(%eax), %xmm1
movdqu -52(%edx), %xmm2 movdqu -52(%edx), %xmm2
@ -402,13 +471,18 @@ L(20bytes):
ptest %xmm2, %xmm0 ptest %xmm2, %xmm0
jnc L(less16bytes) jnc L(less16bytes)
mov -4(%eax), %ecx mov -4(%eax), %ecx
# ifndef USE_AS_WMEMCMP
mov -4(%edx), %ebx mov -4(%edx), %ebx
cmp %ebx, %ecx cmp %ebx, %ecx
# else
cmp -4(%edx), %ecx
# endif
mov $0, %eax mov $0, %eax
jne L(find_diff) jne L(find_diff)
RETURN RETURN
ALIGN (4) # ifndef USE_AS_WMEMCMP
.p2align 4
L(53bytes): L(53bytes):
movdqu -53(%eax), %xmm1 movdqu -53(%eax), %xmm1
movdqu -53(%edx), %xmm2 movdqu -53(%edx), %xmm2
@ -440,7 +514,7 @@ L(21bytes):
jne L(end) jne L(end)
RETURN RETURN
ALIGN (4) .p2align 4
L(54bytes): L(54bytes):
movdqu -54(%eax), %xmm1 movdqu -54(%eax), %xmm1
movdqu -54(%edx), %xmm2 movdqu -54(%edx), %xmm2
@ -476,7 +550,7 @@ L(22bytes):
jne L(end) jne L(end)
RETURN RETURN
ALIGN (4) .p2align 4
L(55bytes): L(55bytes):
movdqu -55(%eax), %xmm1 movdqu -55(%eax), %xmm1
movdqu -55(%edx), %xmm2 movdqu -55(%edx), %xmm2
@ -513,8 +587,8 @@ L(23bytes):
mov $0, %eax mov $0, %eax
jne L(end) jne L(end)
RETURN RETURN
# endif
ALIGN (4) .p2align 4
L(56bytes): L(56bytes):
movdqu -56(%eax), %xmm1 movdqu -56(%eax), %xmm1
movdqu -56(%edx), %xmm2 movdqu -56(%edx), %xmm2
@ -538,18 +612,27 @@ L(24bytes):
jnc L(less16bytes) jnc L(less16bytes)
mov -8(%eax), %ecx mov -8(%eax), %ecx
# ifndef USE_AS_WMEMCMP
mov -8(%edx), %ebx mov -8(%edx), %ebx
cmp %ebx, %ecx cmp %ebx, %ecx
# else
cmp -8(%edx), %ecx
# endif
jne L(find_diff) jne L(find_diff)
mov -4(%eax), %ecx mov -4(%eax), %ecx
# ifndef USE_AS_WMEMCMP
mov -4(%edx), %ebx mov -4(%edx), %ebx
cmp %ebx, %ecx cmp %ebx, %ecx
# else
cmp -4(%edx), %ecx
# endif
mov $0, %eax mov $0, %eax
jne L(find_diff) jne L(find_diff)
RETURN RETURN
ALIGN (4) # ifndef USE_AS_WMEMCMP
.p2align 4
L(57bytes): L(57bytes):
movdqu -57(%eax), %xmm1 movdqu -57(%eax), %xmm1
movdqu -57(%edx), %xmm2 movdqu -57(%edx), %xmm2
@ -585,7 +668,7 @@ L(25bytes):
jne L(end) jne L(end)
RETURN RETURN
ALIGN (4) .p2align 4
L(58bytes): L(58bytes):
movdqu -58(%eax), %xmm1 movdqu -58(%eax), %xmm1
movdqu -58(%edx), %xmm2 movdqu -58(%edx), %xmm2
@ -627,7 +710,7 @@ L(26bytes):
jne L(end) jne L(end)
RETURN RETURN
ALIGN (4) .p2align 4
L(59bytes): L(59bytes):
movdqu -59(%eax), %xmm1 movdqu -59(%eax), %xmm1
movdqu -59(%edx), %xmm2 movdqu -59(%edx), %xmm2
@ -668,8 +751,8 @@ L(27bytes):
mov $0, %eax mov $0, %eax
jne L(end) jne L(end)
RETURN RETURN
# endif
ALIGN (4) .p2align 4
L(60bytes): L(60bytes):
movdqu -60(%eax), %xmm1 movdqu -60(%eax), %xmm1
movdqu -60(%edx), %xmm2 movdqu -60(%edx), %xmm2
@ -691,22 +774,38 @@ L(28bytes):
pxor %xmm1, %xmm2 pxor %xmm1, %xmm2
ptest %xmm2, %xmm0 ptest %xmm2, %xmm0
jnc L(less16bytes) jnc L(less16bytes)
mov -12(%eax), %ecx mov -12(%eax), %ecx
# ifndef USE_AS_WMEMCMP
mov -12(%edx), %ebx mov -12(%edx), %ebx
cmp %ebx, %ecx cmp %ebx, %ecx
# else
cmp -12(%edx), %ecx
# endif
jne L(find_diff) jne L(find_diff)
mov -8(%eax), %ecx mov -8(%eax), %ecx
# ifndef USE_AS_WMEMCMP
mov -8(%edx), %ebx mov -8(%edx), %ebx
cmp %ebx, %ecx cmp %ebx, %ecx
# else
cmp -8(%edx), %ecx
# endif
jne L(find_diff) jne L(find_diff)
mov -4(%eax), %ecx mov -4(%eax), %ecx
# ifndef USE_AS_WMEMCMP
mov -4(%edx), %ebx mov -4(%edx), %ebx
cmp %ebx, %ecx cmp %ebx, %ecx
# else
cmp -4(%edx), %ecx
# endif
mov $0, %eax mov $0, %eax
jne L(find_diff) jne L(find_diff)
RETURN RETURN
ALIGN (4) # ifndef USE_AS_WMEMCMP
.p2align 4
L(61bytes): L(61bytes):
movdqu -61(%eax), %xmm1 movdqu -61(%eax), %xmm1
movdqu -61(%edx), %xmm2 movdqu -61(%edx), %xmm2
@ -749,7 +848,7 @@ L(29bytes):
jne L(end) jne L(end)
RETURN RETURN
ALIGN (4) .p2align 4
L(62bytes): L(62bytes):
movdqu -62(%eax), %xmm1 movdqu -62(%eax), %xmm1
movdqu -62(%edx), %xmm2 movdqu -62(%edx), %xmm2
@ -792,7 +891,7 @@ L(30bytes):
jne L(end) jne L(end)
RETURN RETURN
ALIGN (4) .p2align 4
L(63bytes): L(63bytes):
movdqu -63(%eax), %xmm1 movdqu -63(%eax), %xmm1
movdqu -63(%edx), %xmm2 movdqu -63(%edx), %xmm2
@ -838,8 +937,9 @@ L(31bytes):
mov $0, %eax mov $0, %eax
jne L(end) jne L(end)
RETURN RETURN
# endif
ALIGN (4) .p2align 4
L(64bytes): L(64bytes):
movdqu -64(%eax), %xmm1 movdqu -64(%eax), %xmm1
movdqu -64(%edx), %xmm2 movdqu -64(%edx), %xmm2
@ -863,28 +963,45 @@ L(32bytes):
jnc L(less16bytes) jnc L(less16bytes)
mov -16(%eax), %ecx mov -16(%eax), %ecx
# ifndef USE_AS_WMEMCMP
mov -16(%edx), %ebx mov -16(%edx), %ebx
cmp %ebx, %ecx cmp %ebx, %ecx
# else
cmp -16(%edx), %ecx
# endif
jne L(find_diff) jne L(find_diff)
mov -12(%eax), %ecx mov -12(%eax), %ecx
# ifndef USE_AS_WMEMCMP
mov -12(%edx), %ebx mov -12(%edx), %ebx
cmp %ebx, %ecx cmp %ebx, %ecx
# else
cmp -12(%edx), %ecx
# endif
jne L(find_diff) jne L(find_diff)
mov -8(%eax), %ecx mov -8(%eax), %ecx
# ifndef USE_AS_WMEMCMP
mov -8(%edx), %ebx mov -8(%edx), %ebx
cmp %ebx, %ecx cmp %ebx, %ecx
# else
cmp -8(%edx), %ecx
# endif
jne L(find_diff) jne L(find_diff)
mov -4(%eax), %ecx mov -4(%eax), %ecx
# ifndef USE_AS_WMEMCMP
mov -4(%edx), %ebx mov -4(%edx), %ebx
cmp %ebx, %ecx cmp %ebx, %ecx
# else
cmp -4(%edx), %ecx
# endif
mov $0, %eax mov $0, %eax
jne L(find_diff) jne L(find_diff)
RETURN RETURN
ALIGN (4) # ifndef USE_AS_WMEMCMP
.p2align 4
L(less16bytes): L(less16bytes):
add %ebx, %eax add %ebx, %eax
add %ebx, %edx add %ebx, %edx
@ -910,9 +1027,35 @@ L(less16bytes):
mov $0, %eax mov $0, %eax
jne L(find_diff) jne L(find_diff)
RETURN RETURN
# else
.p2align 4
L(less16bytes):
add %ebx, %eax
add %ebx, %edx
ALIGN (4) mov (%eax), %ecx
cmp (%edx), %ecx
jne L(find_diff)
mov 4(%eax), %ecx
cmp 4(%edx), %ecx
jne L(find_diff)
mov 8(%eax), %ecx
cmp 8(%edx), %ecx
jne L(find_diff)
mov 12(%eax), %ecx
cmp 12(%edx), %ecx
mov $0, %eax
jne L(find_diff)
RETURN
# endif
.p2align 4
L(find_diff): L(find_diff):
# ifndef USE_AS_WMEMCMP
cmpb %bl, %cl cmpb %bl, %cl
jne L(end) jne L(end)
cmp %bx, %cx cmp %bx, %cx
@ -923,17 +1066,29 @@ L(find_diff):
jne L(end) jne L(end)
cmp %bx, %cx cmp %bx, %cx
L(end): L(end):
POP (%ebx) POP (%ebx)
mov $1, %eax mov $1, %eax
ja L(bigger) ja L(bigger)
neg %eax neg %eax
L(bigger): L(bigger):
ret ret
# else
POP (%ebx)
mov $1, %eax
jg L(bigger)
neg %eax
ret
.p2align 4
L(bigger):
ret
# endif
END (MEMCMP) END (MEMCMP)
.section .rodata.sse4.2,"a",@progbits .section .rodata.sse4.2,"a",@progbits
ALIGN (2) .p2align 2
.type L(table_64bytes), @object .type L(table_64bytes), @object
# ifndef USE_AS_WMEMCMP
L(table_64bytes): L(table_64bytes):
.int JMPTBL (L(0bytes), L(table_64bytes)) .int JMPTBL (L(0bytes), L(table_64bytes))
.int JMPTBL (L(1bytes), L(table_64bytes)) .int JMPTBL (L(1bytes), L(table_64bytes))
@ -1000,5 +1155,72 @@ L(table_64bytes):
.int JMPTBL (L(62bytes), L(table_64bytes)) .int JMPTBL (L(62bytes), L(table_64bytes))
.int JMPTBL (L(63bytes), L(table_64bytes)) .int JMPTBL (L(63bytes), L(table_64bytes))
.int JMPTBL (L(64bytes), L(table_64bytes)) .int JMPTBL (L(64bytes), L(table_64bytes))
.size L(table_64bytes), .-L(table_64bytes) # else
L(table_64bytes):
.int JMPTBL (L(0bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(4bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(8bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(12bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(16bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(20bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(24bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(28bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(32bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(36bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(40bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(44bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(48bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(52bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(56bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(60bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(64bytes), L(table_64bytes))
# endif
#endif #endif
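The BRANCH_TO_JMPTBL_ENTRY macro above, together with the L(table_64bytes) table at the end of the file, implements a computed jump on the number of remaining bytes: once fewer than 64 bytes are left, the remaining length indexes a table of tail handlers, and each handler compares backwards from the end of both buffers so no second loop is needed. In the wmemcmp build the length is always a multiple of four, which is why the other slots point at the empty L(unreal_case) label. A rough C analogue (editor's sketch with illustrative names, not the committed code):

#include <stddef.h>
#include <string.h>

typedef int (*tail_fn) (const unsigned char *end1, const unsigned char *end2);

static int tail_0 (const unsigned char *e1, const unsigned char *e2)
{ (void) e1; (void) e2; return 0; }
static int tail_4 (const unsigned char *e1, const unsigned char *e2)
{ return memcmp (e1 - 4, e2 - 4, 4); }
static int tail_8 (const unsigned char *e1, const unsigned char *e2)
{ return memcmp (e1 - 8, e2 - 8, 8); }

/* Only a few slots are shown; the real table has one entry per remainder
   0..64, with impossible remainders mapped to a dummy handler.  */
static const tail_fn table[9] =
  { tail_0, 0, 0, 0, tail_4, 0, 0, 0, tail_8 };

static int
tail_dispatch (const unsigned char *s1, const unsigned char *s2, size_t rem)
{
  /* Like the assembly: advance both pointers past the tail first, then
     dispatch on REM; handlers read backwards from the end pointers.  */
  return table[rem] (s1 + rem, s2 + rem);
}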

sysdeps/i386/i686/multiarch/memcmp-ssse3.S (diff suppressed because it is too large)

sysdeps/i386/i686/multiarch/wmemcmp-c.c

@ -0,0 +1,5 @@
#ifndef NOT_IN_libc
# define WMEMCMP __wmemcmp_ia32
#endif
#include "wcsmbs/wmemcmp.c"

sysdeps/i386/i686/multiarch/wmemcmp-sse4.S

@ -0,0 +1,4 @@
#define USE_AS_WMEMCMP 1
#define MEMCMP __wmemcmp_sse4_2
#include "memcmp-sse4.S"

sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S

@ -0,0 +1,4 @@
#define USE_AS_WMEMCMP 1
#define MEMCMP __wmemcmp_ssse3
#include "memcmp-ssse3.S"

sysdeps/i386/i686/multiarch/wmemcmp.S

@ -0,0 +1,59 @@
/* Multiple versions of wmemcmp
Copyright (C) 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <sysdep.h>
#include <init-arch.h>
/* Define multiple versions only for the definition in libc. */
#ifndef NOT_IN_libc
.section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
.globl __i686.get_pc_thunk.bx
.hidden __i686.get_pc_thunk.bx
.p2align 4
.type __i686.get_pc_thunk.bx,@function
__i686.get_pc_thunk.bx:
movl (%esp), %ebx
ret
.text
ENTRY(wmemcmp)
.type wmemcmp, @gnu_indirect_function
pushl %ebx
cfi_adjust_cfa_offset (4)
cfi_rel_offset (ebx, 0)
call __i686.get_pc_thunk.bx
addl $_GLOBAL_OFFSET_TABLE_, %ebx
cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
jne 1f
call __init_cpu_features
1: leal __wmemcmp_ia32@GOTOFF(%ebx), %eax
testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
jz 2f
leal __wmemcmp_ssse3@GOTOFF(%ebx), %eax
testl $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
jz 2f
leal __wmemcmp_sse4_2@GOTOFF(%ebx), %eax
2: popl %ebx
cfi_adjust_cfa_offset (-4)
cfi_restore (ebx)
ret
END(wmemcmp)
#endif
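The resolver above selects an implementation once, when the IFUNC relocation for wmemcmp is processed: __wmemcmp_ia32 is the default, upgraded to __wmemcmp_ssse3 when SSSE3 is reported, and to __wmemcmp_sse4_2 when SSE4.2 is reported as well. Restated as a C sketch (editor's addition; the __wmemcmp_* names are the real symbols above, while the helper and its flag parameters merely stand in for the __cpu_features bit tests):

#include <stddef.h>
#include <wchar.h>

extern int __wmemcmp_ia32 (const wchar_t *, const wchar_t *, size_t);
extern int __wmemcmp_ssse3 (const wchar_t *, const wchar_t *, size_t);
extern int __wmemcmp_sse4_2 (const wchar_t *, const wchar_t *, size_t);

typedef int (*wmemcmp_fn) (const wchar_t *, const wchar_t *, size_t);

static wmemcmp_fn
select_wmemcmp (int has_ssse3, int has_sse4_2)
{
  wmemcmp_fn fn = __wmemcmp_ia32;       /* baseline C version */
  if (has_ssse3)
    {
      fn = __wmemcmp_ssse3;             /* SSSE3 implementation */
      if (has_sse4_2)
        fn = __wmemcmp_sse4_2;          /* SSE4.2 implementation */
    }
  return fn;
}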

sysdeps/x86_64/multiarch/Makefile

@@ -15,7 +15,8 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
 		   strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
-		   strrchr-sse2-no-bsf strchr-sse2-no-bsf
+		   strrchr-sse2-no-bsf strchr-sse2-no-bsf \
+		   memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
 CFLAGS-varshift.c += -msse4

sysdeps/x86_64/multiarch/memcmp-sse4.S

@ -1,5 +1,5 @@
/* memcmp with SSE4.1 /* memcmp with SSE4.1, wmemcmp with SSE4.1
Copyright (C) 2010 Free Software Foundation, Inc. Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation. Contributed by Intel Corporation.
This file is part of the GNU C Library. This file is part of the GNU C Library.
@ -20,43 +20,54 @@
#ifndef NOT_IN_libc #ifndef NOT_IN_libc
#include <sysdep.h> # include <sysdep.h>
#include "asm-syntax.h"
#ifndef MEMCMP # ifndef MEMCMP
# define MEMCMP __memcmp_sse4_1 # define MEMCMP __memcmp_sse4_1
#endif # endif
#ifndef ALIGN # ifndef ALIGN
# define ALIGN(n) .p2align n # define ALIGN(n) .p2align n
#endif # endif
#define JMPTBL(I, B) (I - B) # define JMPTBL(I, B) (I - B)
#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
lea TABLE(%rip), %r11; \ lea TABLE(%rip), %r11; \
movslq (%r11, INDEX, SCALE), %rcx; \ movslq (%r11, INDEX, SCALE), %rcx; \
add %r11, %rcx; \ add %r11, %rcx; \
jmp *%rcx; \ jmp *%rcx; \
ud2 ud2
/* Warning!
wmemcmp has to use SIGNED comparison for elements.
memcmp has to use UNSIGNED comparison for elements.
*/
.section .text.sse4.1,"ax",@progbits .section .text.sse4.1,"ax",@progbits
ENTRY (MEMCMP) ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
shl $2, %rdx
# endif
pxor %xmm0, %xmm0 pxor %xmm0, %xmm0
cmp $79, %rdx cmp $79, %rdx
ja L(79bytesormore) ja L(79bytesormore)
# ifndef USE_AS_WMEMCMP
cmp $1, %rdx cmp $1, %rdx
je L(firstbyte) je L(firstbyte)
# endif
add %rdx, %rsi add %rdx, %rsi
add %rdx, %rdi add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
# ifndef USE_AS_WMEMCMP
ALIGN (4) ALIGN (4)
L(firstbyte): L(firstbyte):
movzbl (%rdi), %eax movzbl (%rdi), %eax
movzbl (%rsi), %ecx movzbl (%rsi), %ecx
sub %ecx, %eax sub %ecx, %eax
ret ret
# endif
ALIGN (4) ALIGN (4)
L(79bytesormore): L(79bytesormore):
@ -308,11 +319,11 @@ L(less32bytesin256):
ALIGN (4) ALIGN (4)
L(512bytesormore): L(512bytesormore):
#ifdef DATA_CACHE_SIZE_HALF # ifdef DATA_CACHE_SIZE_HALF
mov $DATA_CACHE_SIZE_HALF, %r8 mov $DATA_CACHE_SIZE_HALF, %r8
#else # else
mov __x86_64_data_cache_size_half(%rip), %r8 mov __x86_64_data_cache_size_half(%rip), %r8
#endif # endif
mov %r8, %r9 mov %r8, %r9
shr $1, %r8 shr $1, %r8
add %r9, %r8 add %r9, %r8
@ -624,11 +635,11 @@ L(less32bytesin256in2alinged):
ALIGN (4) ALIGN (4)
L(512bytesormorein2aligned): L(512bytesormorein2aligned):
#ifdef DATA_CACHE_SIZE_HALF # ifdef DATA_CACHE_SIZE_HALF
mov $DATA_CACHE_SIZE_HALF, %r8 mov $DATA_CACHE_SIZE_HALF, %r8
#else # else
mov __x86_64_data_cache_size_half(%rip), %r8 mov __x86_64_data_cache_size_half(%rip), %r8
#endif # endif
mov %r8, %r9 mov %r8, %r9
shr $1, %r8 shr $1, %r8
add %r9, %r8 add %r9, %r8
@ -667,6 +678,7 @@ L(64bytesormore_loopin2aligned):
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
L(L2_L3_cache_aglined): L(L2_L3_cache_aglined):
sub $64, %rdx sub $64, %rdx
ALIGN (4) ALIGN (4)
L(L2_L3_aligned_128bytes_loop): L(L2_L3_aligned_128bytes_loop):
prefetchnta 0x1c0(%rdi) prefetchnta 0x1c0(%rdi)
@ -803,13 +815,19 @@ L(12bytes):
jne L(diffin8bytes) jne L(diffin8bytes)
L(4bytes): L(4bytes):
mov -4(%rsi), %ecx mov -4(%rsi), %ecx
# ifndef USE_AS_WMEMCMP
mov -4(%rdi), %eax mov -4(%rdi), %eax
cmp %eax, %ecx cmp %eax, %ecx
# else
cmp -4(%rdi), %ecx
# endif
jne L(diffin4bytes) jne L(diffin4bytes)
L(0bytes): L(0bytes):
xor %eax, %eax xor %eax, %eax
ret ret
# ifndef USE_AS_WMEMCMP
/* unreal case for wmemcmp */
ALIGN (4) ALIGN (4)
L(65bytes): L(65bytes):
movdqu -65(%rdi), %xmm1 movdqu -65(%rdi), %xmm1
@ -1017,6 +1035,7 @@ L(1bytes):
movzbl -1(%rsi), %ecx movzbl -1(%rsi), %ecx
sub %ecx, %eax sub %ecx, %eax
ret ret
# endif
ALIGN (4) ALIGN (4)
L(68bytes): L(68bytes):
@ -1047,13 +1066,20 @@ L(20bytes):
pxor %xmm1, %xmm2 pxor %xmm1, %xmm2
ptest %xmm2, %xmm0 ptest %xmm2, %xmm0
jnc L(less16bytes) jnc L(less16bytes)
mov -4(%rdi), %eax
mov -4(%rsi), %ecx mov -4(%rsi), %ecx
# ifndef USE_AS_WMEMCMP
mov -4(%rdi), %eax
cmp %eax, %ecx cmp %eax, %ecx
# else
cmp -4(%rdi), %ecx
# endif
jne L(diffin4bytes) jne L(diffin4bytes)
xor %eax, %eax xor %eax, %eax
ret ret
# ifndef USE_AS_WMEMCMP
/* unreal cases for wmemcmp */
ALIGN (4) ALIGN (4)
L(69bytes): L(69bytes):
movdqu -69(%rsi), %xmm1 movdqu -69(%rsi), %xmm1
@ -1161,6 +1187,7 @@ L(23bytes):
jne L(diffin8bytes) jne L(diffin8bytes)
xor %eax, %eax xor %eax, %eax
ret ret
# endif
ALIGN (4) ALIGN (4)
L(72bytes): L(72bytes):
@ -1191,13 +1218,16 @@ L(24bytes):
pxor %xmm1, %xmm2 pxor %xmm1, %xmm2
ptest %xmm2, %xmm0 ptest %xmm2, %xmm0
jnc L(less16bytes) jnc L(less16bytes)
mov -8(%rdi), %rax
mov -8(%rsi), %rcx mov -8(%rsi), %rcx
mov -8(%rdi), %rax
cmp %rax, %rcx cmp %rax, %rcx
jne L(diffin8bytes) jne L(diffin8bytes)
xor %eax, %eax xor %eax, %eax
ret ret
# ifndef USE_AS_WMEMCMP
/* unreal cases for wmemcmp */
ALIGN (4) ALIGN (4)
L(73bytes): L(73bytes):
movdqu -73(%rsi), %xmm1 movdqu -73(%rsi), %xmm1
@ -1312,7 +1342,7 @@ L(27bytes):
jne L(diffin4bytes) jne L(diffin4bytes)
xor %eax, %eax xor %eax, %eax
ret ret
# endif
ALIGN (4) ALIGN (4)
L(76bytes): L(76bytes):
movdqu -76(%rsi), %xmm1 movdqu -76(%rsi), %xmm1
@ -1346,13 +1376,19 @@ L(28bytes):
mov -12(%rsi), %rcx mov -12(%rsi), %rcx
cmp %rax, %rcx cmp %rax, %rcx
jne L(diffin8bytes) jne L(diffin8bytes)
mov -4(%rdi), %eax
mov -4(%rsi), %ecx mov -4(%rsi), %ecx
# ifndef USE_AS_WMEMCMP
mov -4(%rdi), %eax
cmp %eax, %ecx cmp %eax, %ecx
# else
cmp -4(%rdi), %ecx
# endif
jne L(diffin4bytes) jne L(diffin4bytes)
xor %eax, %eax xor %eax, %eax
ret ret
# ifndef USE_AS_WMEMCMP
/* unreal cases for wmemcmp */
ALIGN (4) ALIGN (4)
L(77bytes): L(77bytes):
movdqu -77(%rsi), %xmm1 movdqu -77(%rsi), %xmm1
@ -1474,7 +1510,7 @@ L(31bytes):
jne L(diffin8bytes) jne L(diffin8bytes)
xor %eax, %eax xor %eax, %eax
ret ret
# endif
ALIGN (4) ALIGN (4)
L(64bytes): L(64bytes):
movdqu -64(%rdi), %xmm2 movdqu -64(%rdi), %xmm2
@ -1527,7 +1563,17 @@ L(diffin8bytes):
jne L(diffin4bytes) jne L(diffin4bytes)
shr $32, %rcx shr $32, %rcx
shr $32, %rax shr $32, %rax
# ifdef USE_AS_WMEMCMP
/* for wmemcmp */
cmp %eax, %ecx
jne L(diffin4bytes)
xor %eax, %eax
ret
# endif
L(diffin4bytes): L(diffin4bytes):
# ifndef USE_AS_WMEMCMP
cmp %cx, %ax cmp %cx, %ax
jne L(diffin2bytes) jne L(diffin2bytes)
shr $16, %ecx shr $16, %ecx
@ -1546,11 +1592,28 @@ L(end):
and $0xff, %ecx and $0xff, %ecx
sub %ecx, %eax sub %ecx, %eax
ret ret
# else
/* for wmemcmp */
mov $1, %eax
jl L(nequal_bigger)
neg %eax
ret
ALIGN (4)
L(nequal_bigger):
ret
L(unreal_case):
xor %eax, %eax
ret
# endif
END (MEMCMP) END (MEMCMP)
.section .rodata.sse4.1,"a",@progbits .section .rodata.sse4.1,"a",@progbits
ALIGN (3) ALIGN (3)
# ifndef USE_AS_WMEMCMP
L(table_64bytes): L(table_64bytes):
.int JMPTBL (L(0bytes), L(table_64bytes)) .int JMPTBL (L(0bytes), L(table_64bytes))
.int JMPTBL (L(1bytes), L(table_64bytes)) .int JMPTBL (L(1bytes), L(table_64bytes))
@ -1632,4 +1695,87 @@ L(table_64bytes):
.int JMPTBL (L(77bytes), L(table_64bytes)) .int JMPTBL (L(77bytes), L(table_64bytes))
.int JMPTBL (L(78bytes), L(table_64bytes)) .int JMPTBL (L(78bytes), L(table_64bytes))
.int JMPTBL (L(79bytes), L(table_64bytes)) .int JMPTBL (L(79bytes), L(table_64bytes))
# else
L(table_64bytes):
.int JMPTBL (L(0bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(4bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(8bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(12bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(16bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(20bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(24bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(28bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(32bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(36bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(40bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(44bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(48bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(52bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(56bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(60bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(64bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(68bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(72bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(76bytes), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
.int JMPTBL (L(unreal_case), L(table_64bytes))
# endif
#endif #endif
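In the wmemcmp build, the L(diffin8bytes)/L(diffin4bytes) changes above locate the first differing 4-byte element inside a mismatching 8-byte chunk and return a sign from a signed comparison, instead of the byte-by-byte unsigned narrowing used for memcmp. Conceptually (editor's sketch assuming little-endian loads; it restates the idea, not the exact label flow):

#include <stdint.h>

/* c1 and c2 are the mismatching 8-byte chunks; on a little-endian target
   the low 32 bits hold the earlier wchar_t.  Assumes c1 != c2.  */
static int
wmemcmp_diff_in_8bytes (uint64_t c1, uint64_t c2)
{
  int32_t e1, e2;

  if ((uint32_t) c1 != (uint32_t) c2)
    {
      e1 = (int32_t) c1;              /* first element already differs */
      e2 = (int32_t) c2;
    }
  else
    {
      e1 = (int32_t) (c1 >> 32);      /* shr $32 in the assembly */
      e2 = (int32_t) (c2 >> 32);
    }
  return e1 < e2 ? -1 : 1;            /* signed element ordering */
}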

sysdeps/x86_64/multiarch/memcmp-ssse3.S (diff suppressed because it is too large)

sysdeps/x86_64/multiarch/memcmp.S

@@ -1,5 +1,5 @@
 /* Multiple versions of memcmp
-   Copyright (C) 2010 Free Software Foundation, Inc.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
@@ -29,11 +29,20 @@ ENTRY(memcmp)
 	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
 	jne	1f
 	call	__init_cpu_features
-1:	leaq	__memcmp_sse2(%rip), %rax
-	testl	$bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
-	jz	2f
+
+1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	jnz	2f
+	leaq	__memcmp_sse2(%rip), %rax
+	ret
+
+2:	testl	$bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	jz	3f
 	leaq	__memcmp_sse4_1(%rip), %rax
-2:	ret
+	ret
+
+3:	leaq	__memcmp_ssse3(%rip), %rax
+	ret
+
 END(memcmp)
 # undef ENTRY
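The updated resolver checks SSSE3 first: without it the SSE2 version is the only choice, with SSSE3 and SSE4.1 the SSE4.1 version wins, and with SSSE3 alone the new __memcmp_ssse3 is used. As a C sketch (editor's addition; the __memcmp_* names are the real symbols, and the flag parameters stand in for the __cpu_features bit tests):

#include <stddef.h>

extern int __memcmp_sse2 (const void *, const void *, size_t);
extern int __memcmp_ssse3 (const void *, const void *, size_t);
extern int __memcmp_sse4_1 (const void *, const void *, size_t);

typedef int (*memcmp_fn) (const void *, const void *, size_t);

static memcmp_fn
select_memcmp (int has_ssse3, int has_sse4_1)
{
  if (!has_ssse3)
    return __memcmp_sse2;       /* label 1: no SSSE3, keep SSE2 */
  if (has_sse4_1)
    return __memcmp_sse4_1;     /* label 2: SSE4.1 available */
  return __memcmp_ssse3;        /* label 3: SSSE3 only */
}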

sysdeps/x86_64/multiarch/wmemcmp-c.c

@ -0,0 +1,5 @@
#ifndef NOT_IN_libc
# define WMEMCMP __wmemcmp_sse2
#endif
#include "wcsmbs/wmemcmp.c"

sysdeps/x86_64/multiarch/wmemcmp-sse4.S

@ -0,0 +1,4 @@
#define USE_AS_WMEMCMP 1
#define MEMCMP __wmemcmp_sse4_1
#include "memcmp-sse4.S"

sysdeps/x86_64/multiarch/wmemcmp-ssse3.S

@ -0,0 +1,4 @@
#define USE_AS_WMEMCMP 1
#define MEMCMP __wmemcmp_ssse3
#include "memcmp-ssse3.S"

sysdeps/x86_64/multiarch/wmemcmp.S

@ -0,0 +1,47 @@
/* Multiple versions of wmemcmp
Copyright (C) 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <sysdep.h>
#include <init-arch.h>
/* Define multiple versions only for the definition in libc. */
#ifndef NOT_IN_libc
.text
ENTRY(wmemcmp)
.type wmemcmp, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
jne 1f
call __init_cpu_features
1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
jnz 2f
leaq __wmemcmp_sse2(%rip), %rax
ret
2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
jz 3f
leaq __wmemcmp_sse4_1(%rip), %rax
ret
3: leaq __wmemcmp_ssse3(%rip), %rax
ret
END(wmemcmp)
#endif

wcsmbs/wmemcmp.c

@@ -1,4 +1,4 @@
-/* Copyright (C) 1996, 1997 Free Software Foundation, Inc.
+/* Copyright (C) 1996, 1997, 2011 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
 
@@ -19,9 +19,12 @@
 
 #include <wchar.h>
 
+#ifndef WMEMCMP
+# define WMEMCMP wmemcmp
+#endif
+
 int
-wmemcmp (s1, s2, n)
+WMEMCMP (s1, s2, n)
      const wchar_t *s1;
      const wchar_t *s2;
      size_t n;
@@ -34,19 +37,19 @@ wmemcmp (s1, s2, n)
       c1 = (wint_t) s1[0];
       c2 = (wint_t) s2[0];
       if (c1 - c2 != 0)
-	return c1 - c2;
+	return c1 > c2 ? 1 : -1;
       c1 = (wint_t) s1[1];
       c2 = (wint_t) s2[1];
       if (c1 - c2 != 0)
-	return c1 - c2;
+	return c1 > c2 ? 1 : -1;
       c1 = (wint_t) s1[2];
       c2 = (wint_t) s2[2];
       if (c1 - c2 != 0)
-	return c1 - c2;
+	return c1 > c2 ? 1 : -1;
       c1 = (wint_t) s1[3];
       c2 = (wint_t) s2[3];
       if (c1 - c2 != 0)
-	return c1 - c2;
+	return c1 > c2 ? 1 : -1;
       s1 += 4;
       s2 += 4;
       n -= 4;
@@ -57,7 +60,7 @@ wmemcmp (s1, s2, n)
       c1 = (wint_t) s1[0];
       c2 = (wint_t) s2[0];
       if (c1 - c2 != 0)
-	return c1 - c2;
+	return c1 > c2 ? 1 : -1;
       ++s1;
       ++s2;
       --n;
@@ -67,7 +70,7 @@ wmemcmp (s1, s2, n)
       c1 = (wint_t) s1[0];
       c2 = (wint_t) s2[0];
       if (c1 - c2 != 0)
-	return c1 - c2;
+	return c1 > c2 ? 1 : -1;
       ++s1;
       ++s2;
       --n;
@@ -77,7 +80,7 @@ wmemcmp (s1, s2, n)
       c1 = (wint_t) s1[0];
       c2 = (wint_t) s2[0];
       if (c1 - c2 != 0)
-	return c1 - c2;
+	return c1 > c2 ? 1 : -1;
     }
 
   return 0;
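The overflow bug named in the ChangeLog is the old "return c1 - c2": the difference of two wide characters need not fit in int, so the truncated return value can carry the wrong sign, which is why every return now yields an explicit 1 or -1. A standalone demonstration (editor's sketch, not part of the commit; it assumes the usual 32-bit signed wchar_t, as on x86 GNU/Linux):

#include <stdio.h>
#include <wchar.h>

int
main (void)
{
  wchar_t a = WCHAR_MAX;        /* 2147483647 with 32-bit signed wchar_t */
  wchar_t b = -1;

  /* The mathematically correct difference does not fit in int ...  */
  long long true_diff = (long long) a - (long long) b;
  /* ... so returning "c1 - c2" truncates it, and the sign comes out wrong.  */
  int truncated = (int) true_diff;
  int fixed = a > b ? 1 : -1;

  printf ("true %lld, truncated %d, fixed %d\n", true_diff, truncated, fixed);
  return 0;
}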