QString::fromLatin1: improve the sub-16-character case
For both the [4, 7] and [8, 15] length cases, we can apply the same technique: perform two overlapped loads, zero-extend, then perform two overlapped stores. The exact 8-character case could be done with a single load/store pair, but that is not worth the extra conditionals. It should also have the exact same performance numbers whether we use non-overlapping 4-character operations or completely-overlapping 8-character ones (I *think* the full overlap is actually better). The 4-character operation is new in this commit. That reduces the non-vectorized, unrolled tail loop to at most 3 characters. Change-Id: Ib42b3adc93bf4d43bd55fffd16c257ada774236a Reviewed-by: Lars Knoll <lars@knoll.priv.no>
This commit is contained in:
parent
2b9d4afc95
commit
3ef43ca837
@ -804,6 +804,7 @@ Q_CORE_EXPORT void qt_from_latin1(char16_t *dst, const char *str, size_t size) n
|
||||
*/
|
||||
#if defined(__SSE2__)
|
||||
// we're going to read str[offset..offset+15] (16 bytes)
|
||||
const __m128i nullMask = _mm_setzero_si128();
|
||||
auto processOneChunk = [=](qptrdiff offset) {
|
||||
const __m128i chunk = _mm_loadu_si128((const __m128i*)(str + offset)); // load
|
||||
if constexpr (UseAvx2) {
|
||||
@ -813,8 +814,6 @@ Q_CORE_EXPORT void qt_from_latin1(char16_t *dst, const char *str, size_t size) n
|
||||
// store
|
||||
_mm256_storeu_si256((__m256i*)(dst + offset), extended);
|
||||
} else {
|
||||
const __m128i nullMask = _mm_set1_epi32(0);
|
||||
|
||||
// unpack the first 8 bytes, padding with zeros
|
||||
const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
|
||||
_mm_storeu_si128((__m128i*)(dst + offset), firstHalf); // store
|
||||
@ -826,8 +825,8 @@ Q_CORE_EXPORT void qt_from_latin1(char16_t *dst, const char *str, size_t size) n
|
||||
};
|
||||
|
||||
const char *e = str + size;
|
||||
qptrdiff offset = 0;
|
||||
if (size >= sizeof(__m128i)) {
|
||||
qptrdiff offset = 0;
|
||||
for ( ; str + offset + sizeof(__m128i) <= e; offset += sizeof(__m128i))
|
||||
processOneChunk(offset);
|
||||
if (str + offset < e)
|
||||
@ -836,17 +835,26 @@ Q_CORE_EXPORT void qt_from_latin1(char16_t *dst, const char *str, size_t size) n
|
||||
}
|
||||
|
||||
# if !defined(__OPTIMIZE_SIZE__)
|
||||
// we're going to read str[offset..offset+7] (8 bytes)
|
||||
if (str + offset + 8 <= e) {
|
||||
const __m128i unpacked = mm_load8_zero_extend(str + offset);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + offset), unpacked);
|
||||
offset += 8;
|
||||
if (size >= 4) {
|
||||
// two overlapped loads & stores, of either 64-bit or of 32-bit
|
||||
if (size >= 8) {
|
||||
const __m128i unpacked1 = mm_load8_zero_extend(str);
|
||||
const __m128i unpacked2 = mm_load8_zero_extend(str + size - 8);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), unpacked1);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 8), unpacked2);
|
||||
} else {
|
||||
const __m128i chunk1 = _mm_cvtsi32_si128(qFromUnaligned<quint32>(str));
|
||||
const __m128i chunk2 = _mm_cvtsi32_si128(qFromUnaligned<quint32>(str + size - 4));
|
||||
const __m128i unpacked1 = _mm_unpacklo_epi8(chunk1, nullMask);
|
||||
const __m128i unpacked2 = _mm_unpacklo_epi8(chunk2, nullMask);
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i *>(dst), unpacked1);
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i *>(dst + size - 4), unpacked2);
|
||||
}
|
||||
return;
|
||||
} else {
|
||||
size = size % 4;
|
||||
return UnrollTailLoop<3>::exec(qsizetype(size), [=](int i) { dst[i] = (uchar)str[i]; });
|
||||
}
|
||||
|
||||
size = size % 8;
|
||||
dst += offset;
|
||||
str += offset;
|
||||
return UnrollTailLoop<7>::exec(qsizetype(size), [=](qsizetype i) { dst[i] = (uchar)str[i]; });
|
||||
# endif
|
||||
#endif
|
||||
#if defined(__mips_dsp)
|
||||
|
@ -5295,19 +5295,46 @@ void tst_QString::fromLatin1Roundtrip()
|
||||
QCOMPARE(latin1.isEmpty(), unicode.isEmpty());
|
||||
QCOMPARE(latin1.size(), unicode.size());
|
||||
|
||||
if (!latin1.isEmpty())
|
||||
while (latin1.size() < 128) {
|
||||
auto roundtripTest = [&]() {
|
||||
// fromLatin1
|
||||
QString fromLatin1 = QString::fromLatin1(latin1, latin1.length());
|
||||
QCOMPARE(fromLatin1.length(), unicode.length());
|
||||
QCOMPARE(fromLatin1, unicode);
|
||||
|
||||
// and back:
|
||||
QByteArray toLatin1 = unicode.toLatin1();
|
||||
QCOMPARE(toLatin1.length(), latin1.length());
|
||||
QCOMPARE(toLatin1, latin1);
|
||||
};
|
||||
|
||||
roundtripTest();
|
||||
|
||||
if (latin1.isEmpty())
|
||||
return;
|
||||
|
||||
if (QTest::currentTestFailed()) QFAIL("failed");
|
||||
while (latin1.length() < 16) {
|
||||
latin1 += latin1;
|
||||
unicode += unicode;
|
||||
}
|
||||
roundtripTest();
|
||||
|
||||
// fromLatin1
|
||||
QCOMPARE(QString::fromLatin1(latin1, latin1.size()).size(), unicode.size());
|
||||
QCOMPARE(QString::fromLatin1(latin1, latin1.size()), unicode);
|
||||
// double again (length will be >= 32)
|
||||
if (QTest::currentTestFailed()) QFAIL("failed");
|
||||
latin1 += latin1;
|
||||
unicode += unicode;
|
||||
roundtripTest();
|
||||
|
||||
// and back:
|
||||
QCOMPARE(unicode.toLatin1().size(), latin1.size());
|
||||
QCOMPARE(unicode.toLatin1(), latin1);
|
||||
// double again (length will be >= 64)
|
||||
if (QTest::currentTestFailed()) QFAIL("failed");
|
||||
latin1 += latin1;
|
||||
unicode += unicode;
|
||||
roundtripTest();
|
||||
|
||||
if (QTest::currentTestFailed()) QFAIL("failed");
|
||||
latin1 += latin1;
|
||||
unicode += unicode;
|
||||
roundtripTest();
|
||||
}
|
||||
|
||||
void tst_QString::toLatin1Roundtrip_data()
|
||||
|
Loading…
Reference in New Issue
Block a user