Improve ucstrncmp with SSE2

The benchmarks showed that the basic SSE2-based building block
improves performance by about 50% with data extracted from a Qt
Creator run. None of the other alternatives provide clear better
results -- the best was 3.8% and with only one compiler.

Change-Id: I77314785afecfacaf21c41fd79c97cadf357f895
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
This commit is contained in:
Thiago Macieira 2013-12-12 22:41:04 -08:00 committed by The Qt Project
parent 34821e226a
commit f32a5b158f
2 changed files with 65 additions and 0 deletions

View File

@ -256,6 +256,12 @@ static __forceinline unsigned long _bit_scan_reverse(uint val)
_BitScanReverse(&result, val);
return result;
}
static __forceinline unsigned long _bit_scan_forward(uint val)
{
unsigned long result;
_BitScanForward(&result, val);
return result;
}
# elif (defined(Q_CC_CLANG) || (defined(Q_CC_GNU) && __GNUC__ * 100 + __GNUC_MINOR__ < 405)) \
&& !defined(Q_CC_INTEL)
// Clang is missing the intrinsic for _bit_scan_reverse
@ -267,6 +273,13 @@ unsigned _bit_scan_reverse(unsigned val)
asm("bsr %1, %0" : "=r" (result) : "r" (val));
return result;
}
static inline __attribute__((always_inline))
unsigned _bit_scan_forward(unsigned val)
{
unsigned result;
asm("bsf %1, %0" : "=r" (result) : "r" (val));
return result;
}
# endif
#endif // Q_PROCESSOR_X86

View File

@ -102,6 +102,36 @@
QT_BEGIN_NAMESPACE
/*
* Note on the use of SIMD in qstring.cpp:
*
* Several operations with strings are improved with the use of SIMD code,
* since they are repetitive. For MIPS, we have hand-written assembly code
* outside of qstring.cpp targeting MIPS DSP and MIPS DSPr2. For ARM and for
* x86, we can only use intrinsics and therefore everything is contained in
* qstring.cpp. We need to use intrinsics only for those platforms due to the
* different compilers and toolchains used, which have different syntax for
* assembly sources.
*
* ** SSE notes: **
*
* Whenever multiple alternatives are equivalent or near so, we prefer the one
* using instructions from SSE2, since SSE2 is guaranteed to be enabled for all
* 64-bit builds and we enable it for 32-bit builds by default. Use of higher
* SSE versions should be done when there's a clear performance benefit and
* requires fallback code to SSE2, if it exists.
*
* Performance measurement in the past shows that most strings are short in
* size and, therefore, do not benefit from alignment prologues. That is,
* trying to find a 16-byte-aligned boundary to operate on is often more
* expensive than executing the unaligned operation directly. In addition, note
* that the QString private data is designed so that the data is stored on
* 16-byte boundaries if the system malloc() returns 16-byte aligned pointers
* on its own (64-bit glibc on Linux does; 32-bit glibc on Linux returns them
* 50% of the time), so skipping the alignment prologue is actually optimizing
* for the common case.
*/
// internal
int qFindString(const QChar *haystack, int haystackLen, int from,
const QChar *needle, int needleLen, Qt::CaseSensitivity cs);
@ -206,6 +236,28 @@ static int ucstrncmp(const QChar *a, const QChar *b, int l)
l);
}
#endif // __mips_dsp
#ifdef __SSE2__
const char *ptr = reinterpret_cast<const char*>(a);
qptrdiff distance = reinterpret_cast<const char*>(b) - ptr;
a += l & ~7;
b += l & ~7;
l &= 7;
// we're going to read ptr[0..15] (16 bytes)
for ( ; ptr + 15 < reinterpret_cast<const char *>(a); ptr += 16) {
__m128i a_data = _mm_loadu_si128((__m128i*)ptr);
__m128i b_data = _mm_loadu_si128((__m128i*)(ptr + distance));
__m128i result = _mm_cmpeq_epi16(a_data, b_data);
uint mask = ~_mm_movemask_epi8(result);
if (ushort(mask)) {
// found a different byte
uint idx = uint(_bit_scan_forward(mask));
return reinterpret_cast<const QChar *>(ptr + idx)->unicode()
- reinterpret_cast<const QChar *>(ptr + distance + idx)->unicode();
}
}
#endif
while (l-- && *a == *b)
a++,b++;
if (l==-1)