Add an AVX2 code path to qustrchr

The new loop processes 32 bytes (16 code units) per iteration.

Change-Id: I8f261579aad648fdb4f0fffd155412a4d77428e9
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
This commit is contained in:
Thiago Macieira 2018-09-13 14:24:24 -07:00
parent 1fbd8caca6
commit 482da2e4d2

View File

@@ -265,9 +265,27 @@ const ushort *QtPrivate::qustrchr(QStringView str, ushort c) noexcept
const ushort *e = reinterpret_cast<const ushort *>(str.end());
#ifdef __SSE2__
bool loops = true;
// Using the PMOVMSKB instruction, we get two bits for each character
// we compare.
# if defined(__AVX2__) && !defined(__OPTIMIZE_SIZE__)
// we're going to read n[0..15] (32 bytes)
__m256i mch256 = _mm256_set1_epi32(c | (c << 16));
for (const ushort *next = n + 16; next <= e; n = next, next += 16) {
__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
__m256i result = _mm256_cmpeq_epi16(data, mch256);
uint mask = uint(_mm256_movemask_epi8(result));
if (mask) {
uint idx = qCountTrailingZeroBits(mask);
return n + idx / 2;
}
}
loops = false;
__m128i mch = _mm256_castsi256_si128(mch256);
# else
__m128i mch = _mm_set1_epi32(c | (c << 16));
# endif
auto hasMatch = [mch, &n](__m128i data, ushort validityMask) {
__m128i result = _mm_cmpeq_epi16(data, mch);
uint mask = uint(_mm_movemask_epi8(result));
@@ -283,6 +301,11 @@ const ushort *QtPrivate::qustrchr(QStringView str, ushort c) noexcept
__m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(n));
if (hasMatch(data, 0xffff))
return n;
if (!loops) {
n += 8;
break;
}
}
# if !defined(__OPTIMIZE_SIZE__)