QString::fromLatin1: improve the sub-16-character case

For both the [4, 7] and [8, 15] length cases, we can perform the same
technique: perform two overlapped loads, zero-extend, then perform two
overlapped stores. The 8-character case could be done in a single
load/store pair, but is not worth the extra conditionals. And it should
have the exact same performance numbers whether we use non-overlapping
4-character operations or completely-overlapping 8-character ones (I
*think* the full overlap is actually better).

The 4-character operation is new in this commit. That reduces the
non-vectorized, unrolled tail loop to at most 3 characters.

Change-Id: Ib42b3adc93bf4d43bd55fffd16c257ada774236a
Reviewed-by: Lars Knoll <lars@knoll.priv.no>
This commit is contained in:
Thiago Macieira 2021-12-20 00:00:39 -03:00
parent 2b9d4afc95
commit 3ef43ca837
2 changed files with 59 additions and 24 deletions

View File

@ -804,6 +804,7 @@ Q_CORE_EXPORT void qt_from_latin1(char16_t *dst, const char *str, size_t size) n
*/
#if defined(__SSE2__)
// we're going to read str[offset..offset+15] (16 bytes)
const __m128i nullMask = _mm_setzero_si128();
auto processOneChunk = [=](qptrdiff offset) {
const __m128i chunk = _mm_loadu_si128((const __m128i*)(str + offset)); // load
if constexpr (UseAvx2) {
@ -813,8 +814,6 @@ Q_CORE_EXPORT void qt_from_latin1(char16_t *dst, const char *str, size_t size) n
// store
_mm256_storeu_si256((__m256i*)(dst + offset), extended);
} else {
const __m128i nullMask = _mm_set1_epi32(0);
// unpack the first 8 bytes, padding with zeros
const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
_mm_storeu_si128((__m128i*)(dst + offset), firstHalf); // store
@ -826,8 +825,8 @@ Q_CORE_EXPORT void qt_from_latin1(char16_t *dst, const char *str, size_t size) n
};
const char *e = str + size;
qptrdiff offset = 0;
if (size >= sizeof(__m128i)) {
qptrdiff offset = 0;
for ( ; str + offset + sizeof(__m128i) <= e; offset += sizeof(__m128i))
processOneChunk(offset);
if (str + offset < e)
@ -836,17 +835,26 @@ Q_CORE_EXPORT void qt_from_latin1(char16_t *dst, const char *str, size_t size) n
}
# if !defined(__OPTIMIZE_SIZE__)
// we're going to read str[offset..offset+7] (8 bytes)
if (str + offset + 8 <= e) {
const __m128i unpacked = mm_load8_zero_extend(str + offset);
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + offset), unpacked);
offset += 8;
if (size >= 4) {
// two overlapped loads & stores, of either 64-bit or of 32-bit
if (size >= 8) {
const __m128i unpacked1 = mm_load8_zero_extend(str);
const __m128i unpacked2 = mm_load8_zero_extend(str + size - 8);
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), unpacked1);
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 8), unpacked2);
} else {
const __m128i chunk1 = _mm_cvtsi32_si128(qFromUnaligned<quint32>(str));
const __m128i chunk2 = _mm_cvtsi32_si128(qFromUnaligned<quint32>(str + size - 4));
const __m128i unpacked1 = _mm_unpacklo_epi8(chunk1, nullMask);
const __m128i unpacked2 = _mm_unpacklo_epi8(chunk2, nullMask);
_mm_storel_epi64(reinterpret_cast<__m128i *>(dst), unpacked1);
_mm_storel_epi64(reinterpret_cast<__m128i *>(dst + size - 4), unpacked2);
}
return;
} else {
size = size % 4;
return UnrollTailLoop<3>::exec(qsizetype(size), [=](int i) { dst[i] = (uchar)str[i]; });
}
size = size % 8;
dst += offset;
str += offset;
return UnrollTailLoop<7>::exec(qsizetype(size), [=](qsizetype i) { dst[i] = (uchar)str[i]; });
# endif
#endif
#if defined(__mips_dsp)

View File

@ -5295,19 +5295,46 @@ void tst_QString::fromLatin1Roundtrip()
QCOMPARE(latin1.isEmpty(), unicode.isEmpty());
QCOMPARE(latin1.size(), unicode.size());
if (!latin1.isEmpty())
while (latin1.size() < 128) {
auto roundtripTest = [&]() {
// fromLatin1
QString fromLatin1 = QString::fromLatin1(latin1, latin1.length());
QCOMPARE(fromLatin1.length(), unicode.length());
QCOMPARE(fromLatin1, unicode);
// and back:
QByteArray toLatin1 = unicode.toLatin1();
QCOMPARE(toLatin1.length(), latin1.length());
QCOMPARE(toLatin1, latin1);
};
roundtripTest();
if (latin1.isEmpty())
return;
if (QTest::currentTestFailed()) QFAIL("failed");
while (latin1.length() < 16) {
latin1 += latin1;
unicode += unicode;
}
roundtripTest();
// fromLatin1
QCOMPARE(QString::fromLatin1(latin1, latin1.size()).size(), unicode.size());
QCOMPARE(QString::fromLatin1(latin1, latin1.size()), unicode);
// double again (length will be > 32)
if (QTest::currentTestFailed()) QFAIL("failed");
latin1 += latin1;
unicode += unicode;
roundtripTest();
// and back:
QCOMPARE(unicode.toLatin1().size(), latin1.size());
QCOMPARE(unicode.toLatin1(), latin1);
// double again (length will be > 64)
if (QTest::currentTestFailed()) QFAIL("failed");
latin1 += latin1;
unicode += unicode;
roundtripTest();
if (QTest::currentTestFailed()) QFAIL("failed");
latin1 += latin1;
unicode += unicode;
roundtripTest();
}
void tst_QString::toLatin1Roundtrip_data()