QString::toLatin1: stop using PCMPESTRM

This instruction is somewhat slow and requires a lot of inputs to be
correctly set. Instead, use the PMIN trick, which does have unsigned
comparison support.

This commit moves the helper function to a lambda inside qt_to_latin1,
to make it easier to reuse the constants in the next commit and to avoid
warnings of unused static functions.

Change-Id: Ib48364abee9f464c96c6fffd152e9e84f4ad3ae8
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
This commit is contained in:
Thiago Macieira 2018-05-14 13:57:30 -07:00
parent bd0905db95
commit a9074779cf

View File

@ -536,61 +536,6 @@ void qt_from_latin1(ushort *dst, const char *str, size_t size) Q_DECL_NOTHROW
#endif
}
#if defined(__SSE2__)
static inline __m128i mergeQuestionMarks(__m128i chunk)
{
const __m128i questionMark = _mm_set1_epi16('?');
# ifdef __SSE4_2__
// compare the unsigned shorts for the range 0x0100-0xFFFF
// note on the use of _mm_cmpestrm:
// The MSDN documentation online (http://technet.microsoft.com/en-us/library/bb514080.aspx)
// says for range search the following:
// For each character c in a, determine whether b0 <= c <= b1 or b2 <= c <= b3
//
// However, all examples on the Internet, including from Intel
// (see http://software.intel.com/en-us/articles/xml-parsing-accelerator-with-intel-streaming-simd-extensions-4-intel-sse4/)
// put the range to be searched first
//
// Disassembly and instruction-level debugging with GCC and ICC show
// that they are doing the right thing. Inverting the arguments in the
// instruction does cause a bunch of test failures.
const __m128i rangeMatch = _mm_cvtsi32_si128(0xffff0100);
const __m128i offLimitMask = _mm_cmpestrm(rangeMatch, 2, chunk, 8,
_SIDD_UWORD_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK);
// replace the non-Latin 1 characters in the chunk with question marks
chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask);
# else
// SSE has no compare instruction for unsigned comparison.
// The variables must be shiffted + 0x8000 to be compared
const __m128i signedBitOffset = _mm_set1_epi16(short(0x8000));
const __m128i thresholdMask = _mm_set1_epi16(short(0xff + 0x8000));
const __m128i signedChunk = _mm_add_epi16(chunk, signedBitOffset);
const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
# ifdef __SSE4_1__
// replace the non-Latin 1 characters in the chunk with question marks
chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask);
# else
// offLimitQuestionMark contains '?' for each 16 bits that was off-limit
// the 16 bits that were correct contains zeros
const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
// correctBytes contains the bytes that were in limit
// the 16 bits that were off limits contains zeros
const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk);
// merge offLimitQuestionMark and correctBytes to have the result
chunk = _mm_or_si128(correctBytes, offLimitQuestionMark);
# endif
# endif
return chunk;
}
#endif
template <bool Checked>
static void qt_to_latin1_internal(uchar *dst, const ushort *src, qsizetype length)
{
@ -598,6 +543,40 @@ static void qt_to_latin1_internal(uchar *dst, const ushort *src, qsizetype lengt
uchar *e = dst + length;
qptrdiff offset = 0;
const __m128i questionMark = _mm_set1_epi16('?');
const __m128i outOfRange = _mm_set1_epi16(0x100);
auto mergeQuestionMarks = [=](__m128i chunk) {
// SSE has no compare instruction for unsigned comparison.
# ifdef __SSE4_1__
// We use an unsigned uc = qMin(uc, 0x100) and then compare for equality.
chunk = _mm_min_epu16(chunk, outOfRange);
const __m128i offLimitMask = _mm_cmpeq_epi16(chunk, outOfRange);
chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask);
# else
// The variables must be shiffted + 0x8000 to be compared
const __m128i signedBitOffset = _mm_set1_epi16(short(0x8000));
const __m128i thresholdMask = _mm_set1_epi16(short(0xff + 0x8000));
const __m128i signedChunk = _mm_add_epi16(chunk, signedBitOffset);
const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
// offLimitQuestionMark contains '?' for each 16 bits that was off-limit
// the 16 bits that were correct contains zeros
const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
// correctBytes contains the bytes that were in limit
// the 16 bits that were off limits contains zeros
const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk);
// merge offLimitQuestionMark and correctBytes to have the result
chunk = _mm_or_si128(correctBytes, offLimitQuestionMark);
Q_UNUSED(outOfRange);
# endif
return chunk;
};
// we're going to write to dst[offset..offset+15] (16 bytes)
for ( ; dst + offset + 15 < e; offset += 16) {
__m128i chunk1 = _mm_loadu_si128((const __m128i*)(src + offset)); // load