Add strstr with unaligned loads. Fixes bug 12100.

The sse42 version of strstr used the pcmpistr instruction, which is quite
inefficient. A faster way is to look for pairs of characters: this uses only
sse2, is faster than pcmpistr, and for real strings the pairs we look for
are relatively rare.

For linear time complexity we use a buy-or-rent technique which switches
to the two-way algorithm when superlinear behaviour is detected.
This commit is contained in:
Ondřej Bílka 2013-12-14 19:33:56 +01:00
parent 8a5c7897dd
commit 584b18eb4d
10 changed files with 440 additions and 505 deletions

View File

@ -1,3 +1,17 @@
2013-12-14 Ondřej Bílka <neleai@seznam.cz>
[BZ #12100]
* sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S: New file.
* sysdeps/x86_64/multiarch/strstr-c.c: Moved to ...
* sysdeps/x86_64/multiarch/strstr.c: ... here.
(strstr): Add __strstr_sse2_unaligned ifunc.
* sysdeps/x86_64/multiarch/strcasestr-c.c: Moved to ...
* sysdeps/x86_64/multiarch/strcasestr.c: ... here.
(strcasestr): Remove __strcasestr_sse42 ifunc.
* sysdeps/x86_64/multiarch/strcasestr-nonascii.c: Remove.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Update.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Update.
2013-12-14 Kaz Kojima <kkojima@rr.iij4u.or.jp>
* sysdeps/sh/sh4/fpu/bits/fenv.h: Move to ...

24
NEWS
View File

@ -10,18 +10,18 @@ Version 2.19
* The following bugs are resolved with this release:
156, 387, 431, 832, 926, 2801, 4772, 6786, 6787, 6807, 6810, 7003, 9954,
10253, 10278, 11087, 11157, 11214, 12486, 13028, 13982, 13985, 14029,
14032, 14143, 14155, 14547, 14699, 14752, 14876, 14910, 15004, 15048,
15089, 15218, 15268, 15277, 15308, 15362, 15374, 15400, 15425, 15427,
15483, 15522, 15531, 15532, 15593, 15601, 15608, 15609, 15610, 15632,
15640, 15670, 15672, 15680, 15681, 15723, 15734, 15735, 15736, 15748,
15749, 15754, 15760, 15763, 15764, 15797, 15799, 15825, 15843, 15844,
15847, 15849, 15855, 15856, 15857, 15859, 15867, 15886, 15887, 15890,
15892, 15893, 15895, 15897, 15901, 15905, 15909, 15915, 15917, 15919,
15921, 15923, 15939, 15941, 15948, 15963, 15966, 15985, 15988, 15997,
16032, 16034, 16036, 16037, 16038, 16041, 16055, 16071, 16072, 16074,
16077, 16078, 16103, 16112, 16143, 16144, 16146, 16150, 16151, 16153,
16167, 16172, 16195, 16214, 16245, 16271, 16274, 16283, 16289.
10253, 10278, 11087, 11157, 11214, 12100, 12486, 13028, 13982, 13985,
14029, 14032, 14143, 14155, 14547, 14699, 14752, 14876, 14910, 15004,
15048, 15089, 15218, 15268, 15277, 15308, 15362, 15374, 15400, 15425,
15427, 15483, 15522, 15531, 15532, 15593, 15601, 15608, 15609, 15610,
15632, 15640, 15670, 15672, 15680, 15681, 15723, 15734, 15735, 15736,
15748, 15749, 15754, 15760, 15763, 15764, 15797, 15799, 15825, 15843,
15844, 15847, 15849, 15855, 15856, 15857, 15859, 15867, 15886, 15887,
15890, 15892, 15893, 15895, 15897, 15901, 15905, 15909, 15915, 15917,
15919, 15921, 15923, 15939, 15941, 15948, 15963, 15966, 15985, 15988,
15997, 16032, 16034, 16036, 16037, 16038, 16041, 16055, 16071, 16072,
16074, 16077, 16078, 16103, 16112, 16143, 16144, 16146, 16150, 16151,
16153, 16167, 16172, 16195, 16214, 16245, 16271, 16274, 16283, 16289.
* The public headers no longer use __unused nor __block. This change is to
support compiling programs that are derived from BSD sources and use

View File

@ -11,22 +11,19 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memcmp-sse4 memcpy-ssse3 \
memcpy-sse2-unaligned mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
memmove-ssse3-back strcasecmp_l-ssse3 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
strchr-sse2-no-bsf memcmp-ssse3
strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
sysdep_routines += strcspn-c strpbrk-c strspn-c varshift
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
CFLAGS-strspn-c.c += -msse4
CFLAGS-strstr.c += -msse4
CFLAGS-strcasestr.c += -msse4
CFLAGS-strcasestr-nonascii.c += -msse4
endif
endif

View File

@ -98,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasestr.c. */
IFUNC_IMPL (i, name, strcasestr,
IFUNC_IMPL_ADD (array, i, strcasestr, HAS_SSE4_2,
__strcasestr_sse42)
IFUNC_IMPL_ADD (array, i, strcasestr, 1, __strcasestr_sse2))
/* Support sysdeps/x86_64/multiarch/strcat.S. */
@ -184,7 +182,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strstr-c.c. */
IFUNC_IMPL (i, name, strstr,
IFUNC_IMPL_ADD (array, i, strstr, HAS_SSE4_2, __strstr_sse42)
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
/* Support sysdeps/x86_64/multiarch/wcscpy.S. */

View File

@ -1,19 +0,0 @@
/* Multiple versions of strcasestr
All versions must be listed in ifunc-impl-list.c. */
#include "init-arch.h"
/* STRCASESTR is defined before string/strcasestr.c is included; presumably
that file uses it as the name of the function it defines, which would make
this translation unit provide __strcasestr_sse2 -- TODO confirm against
string/strcasestr.c. */
#define STRCASESTR __strcasestr_sse2
#include "string/strcasestr.c"
/* The two candidate implementations; both are declared hidden. */
extern char *__strcasestr_sse42 (const char *, const char *) attribute_hidden;
extern __typeof (__strcasestr_sse2) __strcasestr_sse2 attribute_hidden;
/* Select the implementation behind __strcasestr: __strcasestr_sse42 when
HAS_SSE4_2 is true, __strcasestr_sse2 otherwise.  The condition below is
the constant 1, so the #else branch (which would always pick
__strcasestr_sse2) is never compiled. */
#if 1
libc_ifunc (__strcasestr,
HAS_SSE4_2 ? __strcasestr_sse42 : __strcasestr_sse2);
#else
libc_ifunc (__strcasestr,
0 ? __strcasestr_sse42 : __strcasestr_sse2);
#endif

View File

@ -1,50 +0,0 @@
/* strstr with SSE4.2 intrinsics
Copyright (C) 2010-2013 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <ctype.h>
#include <xmmintrin.h>
/* Similar to __m128i_strloadu.  Convert to lower case for non-POSIX/C
   locale.

   Copies at most 16 bytes of the NUL-terminated string at P into a
   vector, passing every byte through tolower.  Copying stops at the first
   NUL, so nothing after the terminator is read from P.  The vector starts
   out zeroed, so the terminator lane and every lane after it are zero
   rather than indeterminate stack contents.  */
static __m128i
__m128i_strloadu_tolower (const unsigned char *p)
{
  union
    {
      char b[16];
      __m128i x;
    } u = { .b = { 0 } };

  /* Stop at the terminator; the remaining lanes keep their zero value.  */
  for (int i = 0; i < 16 && p[i] != 0; ++i)
    u.b[i] = tolower (p[i]);

  return u.x;
}
#define STRCASESTR_NONASCII
#define USE_AS_STRCASESTR
#define STRSTR_SSE42 __strcasestr_sse42_nonascii
#include "strstr.c"

View File

@ -1,7 +1,13 @@
extern char *__strcasestr_sse42_nonascii (const unsigned char *s1,
const unsigned char *s2)
attribute_hidden;
/* Multiple versions of strcasestr
All versions must be listed in ifunc-impl-list.c. */
#define USE_AS_STRCASESTR
#define STRSTR_SSE42 __strcasestr_sse42
#include "strstr.c"
#include "init-arch.h"
#define STRCASESTR __strcasestr_sse2
#include "string/strcasestr.c"
extern __typeof (__strcasestr_sse2) __strcasestr_sse2 attribute_hidden;
libc_ifunc (__strcasestr,
__strcasestr_sse2);

View File

@ -1,47 +0,0 @@
/* Multiple versions of strstr.
All versions must be listed in ifunc-impl-list.c.
Copyright (C) 2012-2013 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
/* Redefine strstr so that the compiler won't complain about the type
mismatch with the IFUNC selector in strong_alias, below. */
#undef strstr
#define strstr __redirect_strstr
#include <string.h>
#undef strstr
/* STRSTR is defined before string/strstr.c is included; presumably that
file uses it as the name of the function it defines, which would make this
translation unit provide __strstr_sse2 -- TODO confirm against
string/strstr.c. */
#define STRSTR __strstr_sse2
/* In shared builds, replace libc_hidden_builtin_def so that it expands to a
__hidden_ver1 invocation naming __strstr_sse2 and __GI_strstr. */
#ifdef SHARED
# undef libc_hidden_builtin_def
# define libc_hidden_builtin_def(name) \
__hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
#endif
#include "string/strstr.c"
/* The two candidate implementations, typed like the redirected strstr
prototype and declared hidden. */
extern __typeof (__redirect_strstr) __strstr_sse42 attribute_hidden;
extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
#include "init-arch.h"
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
extern __typeof (__redirect_strstr) __libc_strstr;
/* Select __strstr_sse42 when HAS_SSE4_2 is true, __strstr_sse2 otherwise. */
libc_ifunc (__libc_strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_sse2)
/* Drop the redirect macro so the public name strstr can be made an alias of
__libc_strstr. */
#undef strstr
strong_alias (__libc_strstr, strstr)

View File

@ -0,0 +1,374 @@
/* strstr with unaligned loads
Copyright (C) 2009-2013 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
/* __strstr_sse2_unaligned: %rdi = string to search (haystack),
   %rsi = string to look for (needle); the result is returned in %rax
   (pointer to the match, or 0 when there is none).

   Outline:
   - An empty needle returns the haystack; a one-byte needle is handed to
     __strchr_sse2.
   - needle[0] and needle[1] are broadcast into %xmm1 and %xmm2.  The
     haystack is then scanned for positions where that byte pair occurs or
     where a NUL byte occurs; those positions are collected in a 64-bit
     mask in %r8, one bit per haystack byte.
   - The first 64 bytes are scanned with unaligned loads (or, when the
     haystack starts within the last 64 bytes of a 4096-byte page, by
     scanning the enclosing 64-byte-aligned block).  After that, L(loop)
     scans 64-byte-aligned blocks.
   - Each set bit is visited lowest first.  A NUL there ends the search
     with 0; otherwise the rest of the needle is compared byte by byte.
   - In the main loop the bytes spent on failed comparisons are accumulated
     in %r11; when that exceeds the distance scanned, control is handed to
     __strstr_sse2.  */
ENTRY(__strstr_sse2_unaligned)
/* %al = needle[0]; an empty needle matches at the start of the haystack.  */
movzbl (%rsi), %eax
testb %al, %al
je L(empty)
/* %dl = needle[1]; a needle of exactly one byte is delegated to strchr.  */
movzbl 1(%rsi), %edx
testb %dl, %dl
je L(strchr)
/* Two things are interleaved below:
   (a) needle[0] is replicated into all 16 bytes of %xmm1 and needle[1] into
       all 16 bytes of %xmm2 (movd, punpcklbw, punpcklwd, pshufd $0);
   (b) %rax = haystack address & 4095 is compared with 4031.  The ja at the
       end consumes that comparison.  The unaligned loads that follow touch
       haystack bytes 0..64, so an offset of at most 4031 keeps the last
       byte touched at offset 4095 of the same 4096-byte span.  */
movd %eax, %xmm1
movd %edx, %xmm2
movq %rdi, %rax
andl $4095, %eax
punpcklbw %xmm1, %xmm1
cmpq $4031, %rax
punpcklbw %xmm2, %xmm2
punpcklwd %xmm1, %xmm1
punpcklwd %xmm2, %xmm2
pshufd $0, %xmm1, %xmm1
pshufd $0, %xmm2, %xmm2
ja L(cross_page)
/* Haystack bytes 0..31.  For each 16-byte half:
     (bytes at i == needle[0]) AND (bytes at i + 1 == needle[1])
   is formed with pcmpeqb + pminub (the operands are 0x00/0xff masks, so
   the minimum acts as AND) and then ORed with (bytes at i == 0).
   Bit i of %r8 is therefore set when the pair starts at haystack[i] or when
   haystack[i] is NUL.  */
movdqu (%rdi), %xmm3
pxor %xmm5, %xmm5
movdqu 1(%rdi), %xmm4
movdqa %xmm3, %xmm6
pcmpeqb %xmm1, %xmm3
pcmpeqb %xmm2, %xmm4
movdqu 16(%rdi), %xmm0
pcmpeqb %xmm5, %xmm6
pminub %xmm4, %xmm3
movdqa %xmm3, %xmm4
movdqu 17(%rdi), %xmm3
pcmpeqb %xmm0, %xmm5
pcmpeqb %xmm2, %xmm3
por %xmm6, %xmm4
pcmpeqb %xmm1, %xmm0
pminub %xmm3, %xmm0
por %xmm5, %xmm0
/* Low 16 bits from bytes 0..15, next 16 bits from bytes 16..31.  */
pmovmskb %xmm4, %r8d
pmovmskb %xmm0, %eax
salq $16, %rax
orq %rax, %r8
je L(next_32_bytes)
/* Visit the lowest set bit of %r8: %rax = address of that haystack byte,
   which here is the would-be start of the match.  */
L(next_pair_index):
bsf %r8, %rax
addq %rdi, %rax
/* A NUL at this position means the haystack ended without a match.  */
cmpb $0, (%rax)
je L(zero1)
/* The first two needle bytes already match.  If needle[2] is NUL the needle
   is two bytes long and %rax is the answer.  */
movzbl 2(%rsi), %edx
testb %dl, %dl
je L(found1)
cmpb 2(%rax), %dl
jne L(next_pair)
xorl %edx, %edx
jmp L(pair_loop_start)
.p2align 4
/* One-byte needle: tail-call __strchr_sse2 with the haystack still in %rdi
   and needle[0] in %esi.  */
L(strchr):
movzbl %al, %esi
jmp __strchr_sse2
.p2align 4
/* Compare the remaining needle bytes: %cl holds needle[3 + %rdx] before the
   increment and is compared with the haystack byte at the same distance
   from the candidate start.  */
L(pair_loop):
addq $1, %rdx
cmpb 2(%rax,%rdx), %cl
jne L(next_pair)
L(pair_loop_start):
movzbl 3(%rsi,%rdx), %ecx
testb %cl, %cl
jne L(pair_loop)
/* End of needle reached with every byte equal: return %rax.  */
L(found1):
ret
/* Return 0 (no match).  */
L(zero1):
xorl %eax, %eax
ret
.p2align 4
/* Clear the lowest set bit of %r8 (r8 &= r8 - 1) and try the next
   candidate; when none is left, execution continues into L(next_32_bytes).  */
L(next_pair):
leaq -1(%r8), %rax
andq %rax, %r8
jne L(next_pair_index)
.p2align 4
/* Haystack bytes 32..63, same computation as above.  The two 16-bit masks
   are placed in bits 32..47 and 48..63 of %r8 so that a bit index is still
   the byte offset from %rdi.  */
L(next_32_bytes):
movdqu 32(%rdi), %xmm3
pxor %xmm5, %xmm5
movdqu 33(%rdi), %xmm4
movdqa %xmm3, %xmm6
pcmpeqb %xmm1, %xmm3
pcmpeqb %xmm2, %xmm4
movdqu 48(%rdi), %xmm0
pcmpeqb %xmm5, %xmm6
pminub %xmm4, %xmm3
movdqa %xmm3, %xmm4
movdqu 49(%rdi), %xmm3
pcmpeqb %xmm0, %xmm5
pcmpeqb %xmm2, %xmm3
por %xmm6, %xmm4
pcmpeqb %xmm1, %xmm0
pminub %xmm3, %xmm0
por %xmm5, %xmm0
pmovmskb %xmm4, %eax
salq $32, %rax
pmovmskb %xmm0, %r8d
salq $48, %r8
orq %rax, %r8
je L(loop_header)
/* Candidate handling for bytes 32..63; identical in structure to
   L(next_pair_index) / L(pair_loop) above.  */
L(next_pair2_index):
bsfq %r8, %rax
addq %rdi, %rax
cmpb $0, (%rax)
je L(zero2)
movzbl 2(%rsi), %edx
testb %dl, %dl
je L(found2)
cmpb 2(%rax), %dl
jne L(next_pair2)
xorl %edx, %edx
jmp L(pair_loop2_start)
.p2align 4
L(pair_loop2):
addq $1, %rdx
cmpb 2(%rax,%rdx), %cl
jne L(next_pair2)
L(pair_loop2_start):
movzbl 3(%rsi,%rdx), %ecx
testb %cl, %cl
jne L(pair_loop2)
/* Match: return %rax.  */
L(found2):
ret
/* Return 0 (no match).  */
L(zero2):
xorl %eax, %eax
ret
/* Empty needle: return the haystack pointer.  */
L(empty):
mov %rdi, %rax
ret
.p2align 4
/* Clear the lowest set bit and continue; when no candidate is left,
   execution continues into L(loop_header).  */
L(next_pair2):
leaq -1(%r8), %rax
andq %rax, %r8
jne L(next_pair2_index)
/* Set-up for the block loop:
     %r11 = -512, the starting value of the comparison-work counter used in
            L(next_pair3);
     %r9  = the haystack pointer as it is on arrival here;
     %xmm7 = 0;
     %rdi rounded down to a multiple of 64.  */
L(loop_header):
movq $-512, %r11
movq %rdi, %r9
pxor %xmm7, %xmm7
andq $-64, %rdi
.p2align 4
/* Scan the 64-byte-aligned block at %rdi + 64 (%rdi is advanced by 64 in
   the middle of the body, which is why the later offsets are 31 and 47).
   For each 16-byte chunk:
     (chunk XOR needle[1]) OR (bytes one position earlier XOR needle[0])
   is zero exactly in lanes where the pair ends at that byte.  %xmm0
   collects the byte-wise minimum of the four raw chunks (zero where some
   chunk has a NUL in that lane); %xmm3 collects the minimum of the four
   pair indicators.  If the combined minimum has no zero lane the block
   holds neither a pair nor a NUL and the loop repeats.  */
L(loop):
movdqa 64(%rdi), %xmm3
movdqu 63(%rdi), %xmm6
movdqa %xmm3, %xmm0
pxor %xmm2, %xmm3
pxor %xmm1, %xmm6
movdqa 80(%rdi), %xmm10
por %xmm3, %xmm6
pminub %xmm10, %xmm0
movdqu 79(%rdi), %xmm3
pxor %xmm2, %xmm10
pxor %xmm1, %xmm3
movdqa 96(%rdi), %xmm9
por %xmm10, %xmm3
pminub %xmm9, %xmm0
pxor %xmm2, %xmm9
movdqa 112(%rdi), %xmm8
addq $64, %rdi
pminub %xmm6, %xmm3
movdqu 31(%rdi), %xmm4
pminub %xmm8, %xmm0
pxor %xmm2, %xmm8
pxor %xmm1, %xmm4
por %xmm9, %xmm4
pminub %xmm4, %xmm3
movdqu 47(%rdi), %xmm5
pxor %xmm1, %xmm5
por %xmm8, %xmm5
pminub %xmm5, %xmm3
pminub %xmm3, %xmm0
pcmpeqb %xmm7, %xmm0
pmovmskb %xmm0, %eax
testl %eax, %eax
je L(loop)
/* Something was found in this block (now at %rdi).  Build the exact 64-bit
   mask in %r8.  Chunks 0, 2 and 3 reuse the pair indicators still held in
   %xmm6, %xmm4 and %xmm5, folded with the raw chunk via pminub so that a
   NUL also yields a zero lane.  Chunk 1 is recomputed from memory with
   pcmpeqb because its indicator was not kept in a separate register.
   Bit i set: haystack byte %rdi + i is NUL, or it equals needle[1] and the
   byte before it equals needle[0].  */
pminub (%rdi), %xmm6
pminub 32(%rdi),%xmm4
pminub 48(%rdi),%xmm5
pcmpeqb %xmm7, %xmm6
pcmpeqb %xmm7, %xmm5
pmovmskb %xmm6, %edx
movdqa 16(%rdi), %xmm8
pcmpeqb %xmm7, %xmm4
movdqu 15(%rdi), %xmm0
pmovmskb %xmm5, %r8d
movdqa %xmm8, %xmm3
pmovmskb %xmm4, %ecx
pcmpeqb %xmm1,%xmm0
pcmpeqb %xmm2,%xmm3
salq $32, %rcx
pcmpeqb %xmm7,%xmm8
salq $48, %r8
pminub %xmm0,%xmm3
orq %rcx, %rdx
por %xmm3,%xmm8
orq %rdx, %r8
pmovmskb %xmm8, %eax
salq $16, %rax
orq %rax, %r8
/* Continue scanning if the combined mask turned out empty.  */
je L(loop)
/* Visit the lowest set bit: %rcx = address of that byte.  In this loop the
   bit marks the SECOND byte of the pair, so the candidate match starts at
   %rcx - 1 and needle[2] is compared with 1(%rcx).  */
L(next_pair_index3):
bsfq %r8, %rcx
addq %rdi, %rcx
/* A NUL here means the haystack ended without a match.  */
cmpb $0, (%rcx)
je L(zero)
/* %rax counts the additional needle bytes compared for this candidate.  */
xorl %eax, %eax
movzbl 2(%rsi), %edx
testb %dl, %dl
je L(success3)
cmpb 1(%rcx), %dl
jne L(next_pair3)
jmp L(pair_loop_start3)
.p2align 4
L(pair_loop3):
addq $1, %rax
cmpb 1(%rcx,%rax), %dl
jne L(next_pair3)
L(pair_loop_start3):
movzbl 3(%rsi,%rax), %edx
testb %dl, %dl
jne L(pair_loop3)
/* Match: the candidate started one byte before %rcx.  */
L(success3):
lea -1(%rcx), %rax
ret
.p2align 4
/* Failed candidate.  Add the number of bytes compared (%rax) to the work
   counter %r11, then compare the distance from the saved haystack pointer
   (%r9) to the current block (%rdi) against it.  If the distance is smaller
   (signed) than the accumulated work, give up on this strategy and switch
   to __strstr_sse2.  Otherwise clear the lowest set bit of %r8 and go on
   with the next candidate or the next block.  */
L(next_pair3):
addq %rax, %r11
movq %rdi, %rax
subq %r9, %rax
cmpq %r11, %rax
jl L(switch_strstr)
leaq -1(%r8), %rax
andq %rax, %r8
jne L(next_pair_index3)
jmp L(loop)
.p2align 4
/* Tail-call __strstr_sse2 with the needle still in %rsi.
   NOTE(review): the move below has the same source and destination, so it
   changes nothing; %rdi is the current 64-byte-aligned block address, not
   the original haystack saved in %r9.  Candidates are visited in ascending
   order and all earlier ones have already failed, so resuming from the
   current block looks intentional -- confirm.  */
L(switch_strstr):
movq %rdi, %rdi
jmp __strstr_sse2
.p2align 4
/* The haystack starts within the last 64 bytes of a 4096-byte page (page
   offset above 4031).  Instead of unaligned loads starting at %rdi, scan
   the enclosing 64-byte-aligned block at %rax = %rdi & -64.  For each
   16-byte chunk the aligned load is compared with needle[1] and the load
   from one byte earlier with needle[0]; ANDed (pminub) and ORed with the
   NUL mask.  As in L(loop), a set bit marks the SECOND byte of a pair or a
   NUL.  */
L(cross_page):
movq %rdi, %rax
pxor %xmm0, %xmm0
andq $-64, %rax
movdqa (%rax), %xmm3
movdqu -1(%rax), %xmm4
movdqa %xmm3, %xmm8
movdqa 16(%rax), %xmm5
pcmpeqb %xmm1, %xmm4
pcmpeqb %xmm0, %xmm8
pcmpeqb %xmm2, %xmm3
movdqa %xmm5, %xmm7
pminub %xmm4, %xmm3
movdqu 15(%rax), %xmm4
pcmpeqb %xmm0, %xmm7
por %xmm3, %xmm8
movdqa %xmm5, %xmm3
movdqa 32(%rax), %xmm5
pcmpeqb %xmm1, %xmm4
pcmpeqb %xmm2, %xmm3
movdqa %xmm5, %xmm6
pmovmskb %xmm8, %ecx
pminub %xmm4, %xmm3
movdqu 31(%rax), %xmm4
por %xmm3, %xmm7
movdqa %xmm5, %xmm3
pcmpeqb %xmm0, %xmm6
movdqa 48(%rax), %xmm5
pcmpeqb %xmm1, %xmm4
pmovmskb %xmm7, %r8d
pcmpeqb %xmm2, %xmm3
pcmpeqb %xmm5, %xmm0
pminub %xmm4, %xmm3
movdqu 47(%rax), %xmm4
por %xmm3, %xmm6
movdqa %xmm5, %xmm3
salq $16, %r8
pcmpeqb %xmm1, %xmm4
pcmpeqb %xmm2, %xmm3
pmovmskb %xmm6, %r10d
pminub %xmm4, %xmm3
por %xmm3, %xmm0
salq $32, %r10
orq %r10, %r8
orq %rcx, %r8
/* %ecx = offset of the haystack start inside the aligned block.  Shifting
   the mask right by it discards the bits for bytes before the haystack and
   makes bit indices relative to %rdi.  */
movl %edi, %ecx
pmovmskb %xmm0, %edx
subl %eax, %ecx
salq $48, %rdx
orq %rdx, %r8
shrq %cl, %r8
je L(loop_header)
/* Visit the lowest set bit: %rax = address of that byte (second byte of the
   pair, or a NUL).  */
L(next_pair_index4):
bsfq %r8, %rax
addq %rdi, %rax
cmpb $0, (%rax)
je L(zero)
/* If the pair's second byte is the very first haystack byte, its first
   byte would lie before the haystack; skip this candidate.  */
cmpq %rax,%rdi
je L(next_pair4)
movzbl 2(%rsi), %edx
testb %dl, %dl
je L(found3)
cmpb 1(%rax), %dl
jne L(next_pair4)
xorl %edx, %edx
jmp L(pair_loop_start4)
.p2align 4
L(pair_loop4):
addq $1, %rdx
cmpb 1(%rax,%rdx), %cl
jne L(next_pair4)
L(pair_loop_start4):
movzbl 3(%rsi,%rdx), %ecx
testb %cl, %cl
jne L(pair_loop4)
/* Match: the candidate started one byte before %rax.  */
L(found3):
subq $1, %rax
ret
.p2align 4
/* Clear the lowest set bit and continue; when no candidate is left, enter
   the aligned block loop.  */
L(next_pair4):
leaq -1(%r8), %rax
andq %rax, %r8
jne L(next_pair_index4)
jmp L(loop_header)
.p2align 4
/* Returns with %rax unchanged.  No jump to this label appears in the code
   shown here.  */
L(found):
rep
ret
.p2align 4
/* Return 0 (no match).  */
L(zero):
xorl %eax, %eax
ret
END(__strstr_sse2_unaligned)

View File

@ -1,6 +1,6 @@
/* strstr with SSE4.2 intrinsics
Copyright (C) 2009-2013 Free Software Foundation, Inc.
Contributed by Intel Corporation.
/* Multiple versions of strstr.
All versions must be listed in ifunc-impl-list.c.
Copyright (C) 2012-2013 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@ -17,369 +17,31 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <nmmintrin.h>
#include "varshift.h"
/* Redefine strstr so that the compiler won't complain about the type
mismatch with the IFUNC selector in strong_alias, below. */
#undef strstr
#define strstr __redirect_strstr
#include <string.h>
#undef strstr
#ifndef STRSTR_SSE42
# define STRSTR_SSE42 __strstr_sse42
#define STRSTR __strstr_sse2
#ifdef SHARED
# undef libc_hidden_builtin_def
# define libc_hidden_builtin_def(name) \
__hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
#endif
#ifdef USE_AS_STRCASESTR
# include <ctype.h>
# include <locale/localeinfo.h>
#include "string/strstr.c"
# define LOADBYTE(C) tolower (C)
# define CMPBYTE(C1, C2) (tolower (C1) == tolower (C2))
#else
# define LOADBYTE(C) (C)
# define CMPBYTE(C1, C2) ((C1) == (C2))
#endif
extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
/* We use 0xe ordered-compare:
_SIDD_SBYTE_OPS
| _SIDD_CMP_EQUAL_ORDER
| _SIDD_LEAST_SIGNIFICANT
on pcmpistri to do the scanning and string comparison requirements of
sub-string match. In the scanning phase, we process Cflag and ECX
index to locate the first fragment match; once the first fragment
match position has been identified, we do comparison of subsequent
string fragments until we can conclude false or true match; when
concluding a false match, we may need to repeat the scanning process
from the next relevant offset in the target string.
#include "init-arch.h"
In the scanning phase we have 4 cases:
case ECX CFlag ZFlag SFlag
1 16 0 0 0
2a 16 0 0 1
2b 16 0 1 0
2c 16 0 1 1
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
extern __typeof (__redirect_strstr) __libc_strstr;
libc_ifunc (__libc_strstr, HAS_FAST_UNALIGNED_LOAD ? __strstr_sse2_unaligned : __strstr_sse2)
1. No ordered-comparison match, both 16B fragments are valid, so
continue to next fragment.
2. No ordered-comparison match, there is EOS in either fragment,
2a. Zflg = 0, Sflg = 1, we continue
2b. Zflg = 1, Sflg = 0, we conclude no match and return.
2c. Zflg = 1, Sflg = 1, length determines match or no match
In the string comparison phase, the 1st fragment match is fixed up
to produce ECX = 0. Subsequent fragment compare of nonzero index
and no match conclude a false match.
case ECX CFlag ZFlag SFlag
3 X 1 0 0/1
4a 0 1 0 0
4b 0 1 0 1
4c 0 < X 1 0 0/1
5 16 0 1 0
3. An initial ordered-comparison fragment match, we fix up to do
subsequent string comparison
4a. Continuation of fragment comparison of a string compare.
4b. EOS reached in the reference string, we conclude true match and
return
4c. String compare failed if index is nonzero, we need to go back to
scanning
5. failed string compare, go back to scanning
*/
#if !(defined USE_AS_STRCASESTR && defined STRCASESTR_NONASCII)
/* Page-safe stand-in for an unaligned 16-byte load (movdqu) from P.
   When P lies in the last 15 bytes of a 4KB page, the enclosing aligned
   16-byte chunk is loaded instead.  If that chunk holds a byte equal to
   the corresponding lane of ZERO at or after P, the chunk is returned
   shifted right by P's misalignment, so the next page is never touched.
   In every other case a plain unaligned load is performed.  */
static __m128i
__m128i_strloadu (const unsigned char * p, __m128i zero)
{
  const size_t addr = (size_t) p;
  const int near_page_end = (int) (addr & 0xfff) > 0xff0;

  /* Common case: the 16 bytes at P cannot straddle a 4KB boundary.  */
  if (!__builtin_expect (near_page_end, 0))
    return _mm_loadu_si128 ((__m128i *) p);

  const size_t misalign = addr & (16 - 1);
  const __m128i chunk = _mm_load_si128 ((__m128i *) (p - misalign));
  const int nul_bits = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, zero));

  /* Only bits at or above MISALIGN describe bytes that belong to P.  */
  if ((nul_bits >> misalign) == 0)
    return _mm_loadu_si128 ((__m128i *) p);

  return __m128i_shift_right (chunk, misalign);
}
#endif
#if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII
/* Like __m128i_strloadu, but additionally folds upper-case letters to lower
   case.  Intended for the POSIX/C locale and others whose single-byte
   letters all lie in the ASCII range.

   A lane is treated as upper case when, as a signed byte, it is greater
   than the matching lane of UCLOW and less than the matching lane of
   UCHIGH; LCQWORD is ORed into exactly those lanes.  */
static __m128i
__m128i_strloadu_tolower (const unsigned char *p, __m128i zero, __m128i uclow,
                          __m128i uchigh, __m128i lcqword)
{
  const __m128i chars = __m128i_strloadu (p, zero);

  /* 0xff in lanes with UCLOW < byte < UCHIGH, 0x00 elsewhere.  */
  const __m128i below_high = _mm_cmpgt_epi8 (uchigh, chars);
  const __m128i above_low = _mm_cmpgt_epi8 (chars, uclow);
  const __m128i is_upper = _mm_and_si128 (below_high, above_low);

  /* Set the case bit(s) only in the selected lanes.  */
  const __m128i case_bits = _mm_and_si128 (is_upper, lcqword);
  return _mm_or_si128 (chars, case_bits);
}
#endif
/* Compute the Knuth-Morris-Pratt (KMP) restart distance for a fully
   populated 16-byte vector.
   S2 is the first 16 bytes of the reference string of a strstr call; the
   KMP step is not used when the reference string is shorter than 16 bytes.

   Returns 16 if byte 0 does not recur in bytes 1..15, 1 if bytes 1..15 all
   equal byte 0, 3 if byte 1 equals byte 0 but some later byte differs, and
   otherwise the index of the first recurrence of byte 0.  */
static int
__inline__ __attribute__ ((__always_inline__,))
KMP16Bovrlap (__m128i s2)
{
  /* Replicate byte 0 of S2 into all sixteen lanes.  */
  __m128i doubled = _mm_unpacklo_epi8 (s2, s2);
  __m128i first = _mm_shuffle_epi32 (_mm_unpacklo_epi8 (doubled, doubled), 0);

  /* Lane i of TAIL is byte i + 1 of S2; zero is shifted into lane 15.  */
  __m128i tail = _mm_srli_si128 (s2, 1);

  /* Bit i set: byte i + 1 of S2 equals byte 0.  */
  int repeats = _mm_movemask_epi8 (_mm_cmpeq_epi8 (tail, first));

  if (repeats == 0)
    return 16;
  if (repeats == 0x7fff)
    return 1;

  /* REPEATS is nonzero here, so its lowest set bit is well defined.  */
  int lowest = __builtin_ctz ((unsigned int) repeats);

  /* There are at least two distinct bytes in S2.  When bytes 0 and 1 are
     identical and the distinct value lies farther down, the next offset
     at which a full compare can restart is no earlier than byte 3.
     Otherwise byte 1 differs from byte 0 and the first recurrence
     position is returned.  */
  return lowest == 0 ? 3 : lowest + 1;
}
/* Sub-string search built on the SSE4.2 implicit-length string compare
   intrinsics (_mm_cmpistr*, all invoked with control byte 0x0c).

   S1 is the string searched in, S2 the string searched for.  Returns S1
   when S2 is empty, a pointer into S1 at the match when one is found, and
   NULL otherwise.  Built as strstr, or as strcasestr when
   USE_AS_STRCASESTR is defined (fragments are then loaded through a
   lower-casing loader and single bytes compared through tolower).

   Flag values used throughout, each taken from comparing the current
   fragment of S2 (frag2) against the current fragment of S1 (frag1):
     cmp    index result (_mm_cmpistri);
     cmp_c  carry flag: a partial match was found;
     cmp_z  zero flag: frag1 contains a NUL;
     cmp_s  sign flag: frag2 contains a NUL.  */
char *
__attribute__ ((section (".text.sse4.2")))
STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
{
/* p1 is simply another name for the parameter s1 and is advanced through
the searched string; p2 is a separate cursor into s2. */
#define p1 s1
const unsigned char *p2 = s2;
#ifndef STRCASESTR_NONASCII
/* Trivial cases: empty s2 matches at s1; empty s1 cannot contain a
non-empty s2. */
if (__builtin_expect (p2[0] == '\0', 0))
return (char *) p1;
if (__builtin_expect (p1[0] == '\0', 0))
return NULL;
/* Check if p1 length is 1 byte long. */
if (__builtin_expect (p1[1] == '\0', 0))
return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
#endif
#ifdef USE_AS_STRCASESTR
# ifndef STRCASESTR_NONASCII
/* When the locale reports a non-zero _NL_CTYPE_NONASCII_CASE word, hand
the whole search to the non-ASCII variant. */
if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE)
!= 0, 0))
return __strcasestr_sse42_nonascii (s1, s2);
/* Constants for the lower-casing loader: 0x40 is 'A' - 1, 0x5b is 'Z' + 1
and 0x20 is the bit ORed into upper-case letters. */
const __m128i uclow = _mm_set1_epi8 (0x40);
const __m128i uchigh = _mm_set1_epi8 (0x5b);
const __m128i lcqword = _mm_set1_epi8 (0x20);
const __m128i zero = _mm_setzero_si128 ();
# define strloadu(p) __m128i_strloadu_tolower (p, zero, uclow, uchigh, lcqword)
# else
# define strloadu __m128i_strloadu_tolower
# define zero _mm_setzero_si128 ()
# endif
#else
# define strloadu(p) __m128i_strloadu (p, zero)
const __m128i zero = _mm_setzero_si128 ();
#endif
/* p1 > 1 byte long. Load up to 16 bytes of fragment. */
__m128i frag1 = strloadu (p1);
__m128i frag2;
if (p2[1] != '\0')
/* p2 is > 1 byte long. */
frag2 = strloadu (p2);
else
/* One-byte s2: build a fragment holding just that byte in lane 0. */
frag2 = _mm_insert_epi8 (zero, LOADBYTE (p2[0]), 0);
/* Unsigned bytes, equal order, does frag2 has null? */
int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
int cmp = _mm_cmpistri (frag2, frag1, 0x0c);
int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
/* s2 ends inside frag2 and a (partial) match was found at index cmp. */
if (cmp_s & cmp_c)
{
/* len = index of the first NUL in frag2 (lowest set bit of the mask). */
int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, zero));
int len;
__asm ("bsfl %[bmsk], %[len]"
: [len] "=r" (len) : [bmsk] "r" (bmsk));
p1 += cmp;
/* All of s2 fitted inside the 16 bytes just compared. */
if ((len + cmp) <= 16)
return (char *) p1;
/* Load up to 16 bytes of fragment. */
frag1 = strloadu (p1);
cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
cmp = _mm_cmpistri (frag2, frag1, 0x0c);
if ((len + cmp) <= 16)
return (char *) p1 + cmp;
}
/* s2 is shorter than 16 bytes (frag2 contains its terminator). */
if (cmp_s)
{
/* Adjust addr for 16B alignment in ensuing loop. */
while (!cmp_z)
{
p1 += cmp;
/* Load up to 16 bytes of fragment. */
frag1 = strloadu (p1);
cmp = _mm_cmpistri (frag2, frag1, 0x0c);
cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
/* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp
once already, this time cmp will be zero and we can exit. */
if ((!cmp) & cmp_c)
break;
}
if (!cmp_c)
return NULL;
/* Since s2 is less than 16 bytes, cmp_c is definitive
determination of full match. */
return (char *) p1 + cmp;
}
/* General case, s2 is at least 16 bytes or more.
First, the common case of false-match at first byte of p2. */
/* pt remembers where the current candidate match starts in s1; kmp_fwd
caches the KMP16Bovrlap result (0 means not computed yet). */
const unsigned char *pt = NULL;
int kmp_fwd = 0;
re_trace:
/* Scanning phase: advance 16 bytes at a time until some partial match. */
while (!cmp_c)
{
/* frag1 has null. */
if (cmp_z)
return NULL;
/* frag 1 has no null, advance 16 bytes. */
p1 += 16;
/* Load up to 16 bytes of fragment. */
frag1 = strloadu (p1);
/* Unsigned bytes, equal order, is there a partial match? */
cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
cmp = _mm_cmpistri (frag2, frag1, 0x0c);
cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
}
/* Next, handle initial positive match as first byte of p2. We have
a partial fragment match, make full determination until we reached
end of s2. */
if (!cmp)
{
/* The match begins at lane 0: the first 16 bytes of s2 are already
verified, so move both cursors on to the next fragment. */
if (cmp_z)
return (char *) p1;
pt = p1;
p1 += 16;
p2 += 16;
/* Load up to 16 bytes of fragment. */
frag2 = strloadu (p2);
}
else
{
/* Adjust 16B alignment. */
p1 += cmp;
pt = p1;
}
/* Load up to 16 bytes of fragment. */
frag1 = strloadu (p1);
/* Unsigned bytes, equal order, does frag2 has null? */
cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
cmp = _mm_cmpistri (frag2, frag1, 0x0c);
cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
/* Comparison phase: keep stepping both strings by 16 bytes while the
fragments match at index 0 and neither string has ended. */
while (!(cmp | cmp_z | cmp_s))
{
p1 += 16;
p2 += 16;
/* Load up to 16 bytes of fragment. */
frag2 = strloadu (p2);
/* Load up to 16 bytes of fragment. */
frag1 = strloadu (p1);
/* Unsigned bytes, equal order, does frag2 has null? */
cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
cmp = _mm_cmpistri (frag2, frag1, 0x0c);
cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
}
/* Full determination yielded a false result, retrace s1 to next
starting position.
Zflg 1 0 1 0/1
Sflg 0 1 1 0/1
cmp na 0 0 >0
action done done continue continue if s2 < s1
false match retrace s1 else false
*/
/* s2 ended and the last fragment matched at index 0: full match at pt. */
if (cmp_s & !cmp)
return (char *) pt;
if (cmp_z)
{
/* s1 ended but s2 did not: no match is possible. */
if (!cmp_s)
return NULL;
/* Handle both zero and sign flag set and s1 is shorter in
length. */
/* len / len1 = index of the first NUL in frag2 / frag1. */
int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
int len;
int len1;
__asm ("bsfl %[bmsk], %[len]"
: [len] "=r" (len) : [bmsk] "r" (bmsk));
__asm ("bsfl %[bmsk1], %[len1]"
: [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
if (len >= len1)
return NULL;
}
else if (!cmp)
return (char *) pt;
/* Otherwise, we have to retrace and continue. Default of multiple
paths that need to retrace from next byte in s1. */
p2 = s2;
frag2 = strloadu (p2);
/* Compute the restart distance once and reuse it on later retraces. */
if (!kmp_fwd)
kmp_fwd = KMP16Bovrlap (frag2);
/* KMP algorithm predicted overlap needs to be corrected for
partial fragment compare. */
p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);
/* Since s2 is at least 16 bytes long, we're certain there is no
match. */
if (p1[0] == '\0')
return NULL;
/* Load up to 16 bytes of fragment. */
frag1 = strloadu (p1);
/* Unsigned bytes, equal order, is there a partial match? */
cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
cmp = _mm_cmpistri (frag2, frag1, 0x0c);
cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
goto re_trace;
}
#undef strstr
strong_alias (__libc_strstr, strstr)