Improve qstring.cpp:findChar with SSE2

This updates only the case-sensitive searching. The case-insensitive
part requires quite a few Unicode transformations.

The benchmarks tried are the plain word-by-word comparison used in Qt
5.2 and in builds without SSE2; the SSE2 benchmark; and a benchmark using
the SSE4.2 "strchr" instruction. I've run the benchmark both for CPU
cycles used and for bytes/sec when scanning strings.

Improvement over the Qt 5.2 code:

            GCC 4.7         GCC 4.9        Clang 3.4        ICC 14
          cycles  MB/s    cycles  MB/s    cycles  MB/s    cycles  MB/s
SSE2       2.1x   2.9x     2.2x   2.9x     2.1x   3.1x     2.2x   3.1x
SSE4.2     1.5x   1.7x     1.5x   1.7x     1.5x   1.7x     1.6x   1.8x

Once again, the SSE4.2 instruction wasn't as effective as I'd hoped
(not to mention that Clang seems to have some bugs emitting it).

Change-Id: I57c6e65e91791bb5265965cbd1af7fbd8fe7f588
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
This commit is contained in:
Thiago Macieira 2013-12-12 23:59:46 -08:00 committed by The Qt Project
parent 2cfdb35269
commit 8b77fe9f64

View File

@@ -347,14 +347,32 @@ static int findChar(const QChar *str, int len, QChar ch, int from,
if (from < 0)
from = qMax(from + len, 0);
if (from < len) {
const ushort *n = s + from - 1;
const ushort *n = s + from;
const ushort *e = s + len;
if (cs == Qt::CaseSensitive) {
#ifdef __SSE2__
__m128i mch = _mm_set1_epi32(c | (c << 16));
// we're going to read n[0..7] (16 bytes)
for (const ushort *next = n + 8; next <= e; n = next, next += 8) {
__m128i data = _mm_loadu_si128((__m128i*)n);
__m128i result = _mm_cmpeq_epi16(data, mch);
uint mask = _mm_movemask_epi8(result);
if (ushort(mask)) {
// found a match
// same as: return n - s + _bit_scan_forward(mask) / 2
return (reinterpret_cast<const char *>(n) - reinterpret_cast<const char *>(s)
+ _bit_scan_forward(mask)) >> 1;
}
}
#endif
--n;
while (++n != e)
if (*n == c)
return n - s;
} else {
c = foldCase(c);
--n;
while (++n != e)
if (foldCase(*n) == c)
return n - s;