QUtf8Codec: Use one 32-byte load instead of two 16-byte ones on AVX2
The number of instructions is the same, but if the CPU can issue 32-byte-wide loads, this will be faster. For CPUs that split a 32-byte load into two 16-byte loads, this is no worse than the current code. Change-Id: I8f261579aad648fdb4f0fffd1553d060b4fc852f Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
This commit is contained in:
parent
a0907e6ac1
commit
570ef11c28
@ -70,9 +70,14 @@ static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const
|
||||
{
|
||||
// do sixteen characters at a time
|
||||
for ( ; end - src >= 16; src += 16, dst += 16) {
|
||||
# ifdef __AVX2__
|
||||
__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
|
||||
__m128i data1 = _mm256_castsi256_si128(data);
|
||||
__m128i data2 = _mm256_extracti128_si256(data, 1);
|
||||
# else
|
||||
__m128i data1 = _mm_loadu_si128((const __m128i*)src);
|
||||
__m128i data2 = _mm_loadu_si128(1+(const __m128i*)src);
|
||||
|
||||
# endif
|
||||
|
||||
// check if everything is ASCII
|
||||
// the highest ASCII value is U+007F
|
||||
|
Loading…
Reference in New Issue
Block a user