From 8e96b93aa7855683d0be3c65ce81e66d0786ba84 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 24 Jul 2010 08:34:44 -0700 Subject: [PATCH] Speed up x86-64 strcasestr a bit more. Using the new SSE4.2 instructions is cool but not really the fastest. Some older SSE instructions can do the trick faster. --- ChangeLog | 5 +++++ sysdeps/x86_64/multiarch/strstr.c | 16 +++++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/ChangeLog b/ChangeLog index 46985e63e5..4a22a07972 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2010-07-24 Ulrich Drepper + + * sysdeps/x86_64/multiarch/strstr.c (__m128i_strloadu_tolower): Use + lower-latency instructions. + 2010-07-23 Ulrich Drepper * string/test-strcasestr.c: New file. diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c index 32ab939330..e2b19a34bc 100644 --- a/sysdeps/x86_64/multiarch/strstr.c +++ b/sysdeps/x86_64/multiarch/strstr.c @@ -173,11 +173,17 @@ __m128i_strloadu_tolower (const unsigned char *p, __m128i rangeuc, { __m128i frag = __m128i_strloadu (p); - /* Convert frag to lower case for POSIX/C locale. */ - __m128i mask1 = _mm_cmpistrm (rangeuc, frag, 0x44); - __m128i mask2 = _mm_blendv_epi8 (u2ldelta, frag, mask1); - mask2 = _mm_sub_epi8 (mask2, u2ldelta); - return _mm_blendv_epi8 (frag, mask2, mask1); +#define UCLOW 0x4040404040404040ULL +#define UCHIGH 0x5a5a5a5a5a5a5a5aULL +#define LCQWORD 0x2020202020202020ULL + /* Compare if 'Z' > bytes. Inverted way to get a mask for byte <= 'Z'. */ + __m128i r2 = _mm_cmpgt_epi8 (_mm_set1_epi64x (UCHIGH), frag); + /* Compare if bytes are > 'A' - 1. */ + __m128i r1 = _mm_cmpgt_epi8 (frag, _mm_set1_epi64x (UCLOW)); + /* Mask byte == ff if byte(r2) <= 'Z' and byte(r1) > 'A' - 1. */ + __m128i mask = _mm_and_si128 (r2, r1); + /* Apply lowercase bit 6 mask for above mask bytes == ff. */ + return _mm_or_si128 (frag, _mm_and_si128 (mask, _mm_set1_epi64x (LCQWORD))); } #endif