Improve ucstrncmp with SSE2
The benchmarks showed that the basic SSE2-based building block improves performance by about 50% with data extracted from a Qt Creator run. None of the other alternatives provide clearly better results -- the best was 3.8% and with only one compiler. Change-Id: I77314785afecfacaf21c41fd79c97cadf357f895 Reviewed-by: Lars Knoll <lars.knoll@digia.com>
This commit is contained in:
parent
34821e226a
commit
f32a5b158f
@ -256,6 +256,12 @@ static __forceinline unsigned long _bit_scan_reverse(uint val)
|
||||
_BitScanReverse(&result, val);
|
||||
return result;
|
||||
}
|
||||
// Wrapper giving the _BitScanForward intrinsic the same call shape as
// _bit_scan_forward: takes the value to scan and returns the bit index
// that the intrinsic stored in 'result'.
//
// The intrinsic's own return status is discarded and 'result' is not
// initialized, so the returned index is only meaningful when val != 0
// (per the intrinsic's documentation the index is undefined for a zero
// input). Callers are expected to test for non-zero first.
static __forceinline unsigned long _bit_scan_forward(uint val)
|
||||
{
|
||||
unsigned long result;
|
||||
// scans from the least significant bit upwards; status return ignored
_BitScanForward(&result, val);
|
||||
return result;
|
||||
}
|
||||
# elif (defined(Q_CC_CLANG) || (defined(Q_CC_GNU) && __GNUC__ * 100 + __GNUC_MINOR__ < 405)) \
|
||||
&& !defined(Q_CC_INTEL)
|
||||
// Clang is missing the intrinsic for _bit_scan_reverse
|
||||
@ -267,6 +273,13 @@ unsigned _bit_scan_reverse(unsigned val)
|
||||
asm("bsr %1, %0" : "=r" (result) : "r" (val));
|
||||
return result;
|
||||
}
|
||||
// Inline-assembly implementation of _bit_scan_forward for the compilers
// selected by the enclosing #elif (Clang, or GCC older than 4.5, but not
// the Intel compiler). It is the counterpart of the bsr-based
// _bit_scan_reverse defined just before it.
//
// Executes the x86 BSF instruction on val and returns the register it
// wrote. BSF leaves its destination undefined when the source is zero
// (see the Intel instruction reference), so pass only non-zero values.
static inline __attribute__((always_inline))
|
||||
unsigned _bit_scan_forward(unsigned val)
|
||||
{
|
||||
unsigned result;
|
||||
// %0 = output register (result), %1 = input register (val)
asm("bsf %1, %0" : "=r" (result) : "r" (val));
|
||||
return result;
|
||||
}
|
||||
# endif
|
||||
#endif // Q_PROCESSOR_X86
|
||||
|
||||
|
@ -102,6 +102,36 @@
|
||||
|
||||
QT_BEGIN_NAMESPACE
|
||||
|
||||
/*
|
||||
* Note on the use of SIMD in qstring.cpp:
|
||||
*
|
||||
* Several operations with strings are improved with the use of SIMD code,
|
||||
* since they are repetitive. For MIPS, we have hand-written assembly code
|
||||
* outside of qstring.cpp targeting MIPS DSP and MIPS DSPr2. For ARM and for
|
||||
* x86, we can only use intrinsics and therefore everything is contained in
|
||||
* qstring.cpp. We need to use intrinsics only for those platforms due to the
|
||||
* different compilers and toolchains used, which have different syntax for
|
||||
* assembly sources.
|
||||
*
|
||||
* ** SSE notes: **
|
||||
*
|
||||
* Whenever multiple alternatives are equivalent or near so, we prefer the one
|
||||
* using instructions from SSE2, since SSE2 is guaranteed to be enabled for all
|
||||
* 64-bit builds and we enable it for 32-bit builds by default. Use of higher
|
||||
* SSE versions should be done only when there's a clear performance benefit and
|
||||
* requires fallback code to SSE2, if it exists.
|
||||
*
|
||||
* Past performance measurements have shown that most strings are short in
|
||||
* size and, therefore, do not benefit from alignment prologues. That is,
|
||||
* trying to find a 16-byte-aligned boundary to operate on is often more
|
||||
* expensive than executing the unaligned operation directly. In addition, note
|
||||
* that the QString private data is designed so that the data is stored on
|
||||
* 16-byte boundaries if the system malloc() returns 16-byte aligned pointers
|
||||
* on its own (64-bit glibc on Linux does; 32-bit glibc on Linux returns them
|
||||
* 50% of the time), so skipping the alignment prologue is actually optimizing
|
||||
* for the common case.
|
||||
*/
|
||||
|
||||
// internal
|
||||
int qFindString(const QChar *haystack, int haystackLen, int from,
|
||||
const QChar *needle, int needleLen, Qt::CaseSensitivity cs);
|
||||
@ -206,6 +236,28 @@ static int ucstrncmp(const QChar *a, const QChar *b, int l)
|
||||
l);
|
||||
}
|
||||
#endif // __mips_dsp
|
||||
#ifdef __SSE2__
|
||||
const char *ptr = reinterpret_cast<const char*>(a);
|
||||
qptrdiff distance = reinterpret_cast<const char*>(b) - ptr;
|
||||
a += l & ~7;
|
||||
b += l & ~7;
|
||||
l &= 7;
|
||||
|
||||
// we're going to read ptr[0..15] (16 bytes)
|
||||
for ( ; ptr + 15 < reinterpret_cast<const char *>(a); ptr += 16) {
|
||||
__m128i a_data = _mm_loadu_si128((__m128i*)ptr);
|
||||
__m128i b_data = _mm_loadu_si128((__m128i*)(ptr + distance));
|
||||
__m128i result = _mm_cmpeq_epi16(a_data, b_data);
|
||||
uint mask = ~_mm_movemask_epi8(result);
|
||||
if (ushort(mask)) {
|
||||
// found a different byte
|
||||
uint idx = uint(_bit_scan_forward(mask));
|
||||
return reinterpret_cast<const QChar *>(ptr + idx)->unicode()
|
||||
- reinterpret_cast<const QChar *>(ptr + distance + idx)->unicode();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
while (l-- && *a == *b)
|
||||
a++,b++;
|
||||
if (l==-1)
|
||||
|
Loading…
Reference in New Issue
Block a user