QString::toLatin1: do the same as fromLatin1 for sub-16 characters
Perform a pair of overlapped loads & stores, so we can perform the contracting and inserting of question marks in vector code.

Change-Id: Ib42b3adc93bf4d43bd55fffd16c2f5e479b0f6fa
Reviewed-by: Lars Knoll <lars@knoll.priv.no>
This commit is contained in:
parent
3ef43ca837
commit
339aa99fec
@ -955,10 +955,9 @@ static void qt_to_latin1_internal(uchar *dst, const char16_t *src, qsizetype len
|
||||
return _mm_packus_epi16(chunk1, chunk2);
|
||||
};
|
||||
|
||||
uchar *e = dst + length;
|
||||
qptrdiff offset = 0;
|
||||
if (size_t(length) >= sizeof(__m128i)) {
|
||||
// because of possible overlapping, we won't process the last chunk in the loop
|
||||
qptrdiff offset = 0;
|
||||
for ( ; offset + 2 * sizeof(__m128i) < size_t(length); offset += sizeof(__m128i))
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + offset), loadChunkAt(offset));
|
||||
|
||||
@ -971,45 +970,45 @@ static void qt_to_latin1_internal(uchar *dst, const char16_t *src, qsizetype len
|
||||
}
|
||||
|
||||
# if !defined(__OPTIMIZE_SIZE__)
|
||||
// we're going to write to dst[offset..offset+7] (8 bytes)
|
||||
if (dst + offset + 7 < e) {
|
||||
__m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + offset));
|
||||
chunk = mergeQuestionMarks(chunk);
|
||||
if (length >= 4) {
|
||||
// this code is fine even for in-place conversion because we load both
|
||||
// before any store
|
||||
if (length >= 8) {
|
||||
__m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
|
||||
__m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + length - 8));
|
||||
chunk1 = mergeQuestionMarks(chunk1);
|
||||
chunk2 = mergeQuestionMarks(chunk2);
|
||||
|
||||
// pack, where the upper half is ignored
|
||||
const __m128i result = _mm_packus_epi16(chunk, chunk);
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i *>(dst + offset), result);
|
||||
offset += 8;
|
||||
}
|
||||
// pack, where the upper half is ignored
|
||||
const __m128i result1 = _mm_packus_epi16(chunk1, chunk1);
|
||||
const __m128i result2 = _mm_packus_epi16(chunk2, chunk2);
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i *>(dst), result1);
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i *>(dst + length - 8), result2);
|
||||
} else {
|
||||
__m128i chunk1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src));
|
||||
__m128i chunk2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src + length - 4));
|
||||
chunk1 = mergeQuestionMarks(chunk1);
|
||||
chunk2 = mergeQuestionMarks(chunk2);
|
||||
|
||||
// we're going to write to dst[offset..offset+3] (4 bytes)
|
||||
if (dst + offset + 3 < e) {
|
||||
__m128i chunk = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src + offset));
|
||||
chunk = mergeQuestionMarks(chunk);
|
||||
|
||||
// pack, we'll the upper three quarters
|
||||
const __m128i result = _mm_packus_epi16(chunk, chunk);
|
||||
qToUnaligned(_mm_cvtsi128_si32(result), dst + offset);
|
||||
offset += 4;
|
||||
// pack, we'll zero the upper three quarters
|
||||
const __m128i result1 = _mm_packus_epi16(chunk1, chunk1);
|
||||
const __m128i result2 = _mm_packus_epi16(chunk2, chunk2);
|
||||
qToUnaligned(_mm_cvtsi128_si32(result1), dst);
|
||||
qToUnaligned(_mm_cvtsi128_si32(result2), dst + length - 4);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
length = length % 4;
|
||||
# else
|
||||
length = length % 16;
|
||||
# endif // optimize size
|
||||
|
||||
// advance dst, src for tail processing
|
||||
dst += offset;
|
||||
src += offset;
|
||||
|
||||
# if !defined(__OPTIMIZE_SIZE__)
|
||||
return UnrollTailLoop<3>::exec(length, [=](qsizetype i) {
|
||||
if (Checked)
|
||||
dst[i] = (src[i]>0xff) ? '?' : (uchar) src[i];
|
||||
else
|
||||
dst[i] = src[i];
|
||||
});
|
||||
# endif
|
||||
# else
|
||||
length = length % 16;
|
||||
# endif // optimize size
|
||||
#elif defined(__ARM_NEON__)
|
||||
// Refer to the documentation of the SSE2 implementation.
|
||||
// This uses exactly the same method as for SSE except:
|
||||
|
Loading…
Reference in New Issue
Block a user