QString: improve the SSE2 qustrlen to calculate the length as we go

Instead of updating the pointer that we're reading, update the offset
(which is the length). The number of variables we're operating on is the
same (2), but this simplifies the calculation at the end.

BEFORE                             | AFTER
        tzcntl  %edx, %edx         |         tzcntl  %edx, %eax
        subq    %rdi, %rax         |
        sarq    %rax               |
        shrl    %edx               |         shrq    %rax
        addq    %rdx, %rax         |         leaq    (%rax,%rcx), %rax
        ret                        |         ret

We remove one subtraction and one shift. I don't know why the compiler
decided to use LEA instead of ADD... The shift changed from 32- to 64-bit
because we replaced the constant 2 (an int) in the file with
sizeof(char16_t) (a size_t), but that has no effect on performance.

Change-Id: I0e5f6bec596a4a78bd3bfffd16c9650a60289f4c
Reviewed-by: Lars Knoll <lars@knoll.priv.no>
This commit is contained in:
Thiago Macieira 2022-01-11 18:30:34 -08:00
parent 1acd028b0c
commit 23f1d68b09

View File

@ -428,7 +428,7 @@ static qsizetype qustrlen_sse2(const char16_t *str) noexcept
const __m128i zeroes = _mm_setzero_si128();
__m128i data = _mm_load_si128(reinterpret_cast<const __m128i *>(ptr));
__m128i comparison = _mm_cmpeq_epi16(data, zeroes);
quint32 mask = _mm_movemask_epi8(comparison);
uint mask = _mm_movemask_epi8(comparison);
// ignore the result prior to the beginning of str
mask >>= misalignment;
@ -436,19 +436,22 @@ static qsizetype qustrlen_sse2(const char16_t *str) noexcept
// Have we found something in the first block? Need to handle it now
// because of the right shift above.
if (mask)
return qCountTrailingZeroBits(quint32(mask)) / 2;
return qCountTrailingZeroBits(mask) / sizeof(char16_t);
constexpr qsizetype Step = sizeof(__m128i) / sizeof(char16_t);
qsizetype size = Step - misalignment / sizeof(char16_t);
size -= Step;
do {
ptr += 8;
data = _mm_load_si128(reinterpret_cast<const __m128i *>(ptr));
size += Step;
data = _mm_load_si128(reinterpret_cast<const __m128i *>(str + size));
comparison = _mm_cmpeq_epi16(data, zeroes);
mask = _mm_movemask_epi8(comparison);
} while (mask == 0);
// found a null
uint idx = qCountTrailingZeroBits(quint32(mask));
return ptr - str + idx / 2;
return size + qCountTrailingZeroBits(mask) / sizeof(char16_t);
}
// Scans from \a ptr to \a end until \a maskval is non-zero. Returns true if