QString::toLatin1: stop using PCMPESTRM
This instruction is somewhat slow and requires a lot of inputs to be correctly set. Instead, use the PMIN trick, which does have unsigned comparison support. This commit moves the helper function to a lambda inside qt_to_latin1, to make it easier to reuse the constants in the next commit and to avoid warnings of unused static functions. Change-Id: Ib48364abee9f464c96c6fffd152e9e84f4ad3ae8 Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
This commit is contained in:
parent
bd0905db95
commit
a9074779cf
@ -536,61 +536,6 @@ void qt_from_latin1(ushort *dst, const char *str, size_t size) Q_DECL_NOTHROW
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(__SSE2__)
// Replaces every 16-bit unit of `chunk` that is not a Latin-1 character
// (i.e. whose value is above 0x00FF) with '?', leaving the other units
// untouched.
//
// `chunk` is processed as eight independent unsigned 16-bit lanes and the
// result has the same layout.
static inline __m128i mergeQuestionMarks(__m128i chunk)
{
    const __m128i questionMark = _mm_set1_epi16('?');

# ifdef __SSE4_1__
    // SSE has no unsigned 16-bit compare, but SSE4.1 has an unsigned minimum.
    // Clamp every lane to 0x100: a lane that ends up equal to 0x100 was out
    // of range. This avoids PCMPESTRM, which is slow and needs a lot of
    // inputs to be set up correctly.
    const __m128i outOfRange = _mm_set1_epi16(0x100);
    chunk = _mm_min_epu16(chunk, outOfRange);
    const __m128i offLimitMask = _mm_cmpeq_epi16(chunk, outOfRange);

    // replace the non-Latin 1 characters in the chunk with question marks
    chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask);
# else
    // Plain SSE2: a lane is in range exactly when its high byte is zero, so
    // shift the high byte down and compare it against zero. No unsigned
    // comparison (and no signed-bias trick) is required.
    const __m128i inRangeMask = _mm_cmpeq_epi16(_mm_srli_epi16(chunk, 8), _mm_setzero_si128());

    // keptUnits holds the in-range lanes; the off-limit lanes are zero
    const __m128i keptUnits = _mm_and_si128(inRangeMask, chunk);

    // replacements holds '?' in the off-limit lanes; the in-range lanes are zero
    const __m128i replacements = _mm_andnot_si128(inRangeMask, questionMark);

    // merge both halves to get the result
    chunk = _mm_or_si128(keptUnits, replacements);
# endif
    return chunk;
}
#endif
|
||||
|
||||
template <bool Checked>
|
||||
static void qt_to_latin1_internal(uchar *dst, const ushort *src, qsizetype length)
|
||||
{
|
||||
@ -598,6 +543,40 @@ static void qt_to_latin1_internal(uchar *dst, const ushort *src, qsizetype lengt
|
||||
uchar *e = dst + length;
|
||||
qptrdiff offset = 0;
|
||||
|
||||
const __m128i questionMark = _mm_set1_epi16('?');
|
||||
const __m128i outOfRange = _mm_set1_epi16(0x100);
|
||||
|
||||
auto mergeQuestionMarks = [=](__m128i chunk) {
|
||||
// SSE has no compare instruction for unsigned comparison.
|
||||
# ifdef __SSE4_1__
|
||||
// We use an unsigned uc = qMin(uc, 0x100) and then compare for equality.
|
||||
chunk = _mm_min_epu16(chunk, outOfRange);
|
||||
const __m128i offLimitMask = _mm_cmpeq_epi16(chunk, outOfRange);
|
||||
chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask);
|
||||
# else
|
||||
// The variables must be shiffted + 0x8000 to be compared
|
||||
const __m128i signedBitOffset = _mm_set1_epi16(short(0x8000));
|
||||
const __m128i thresholdMask = _mm_set1_epi16(short(0xff + 0x8000));
|
||||
|
||||
const __m128i signedChunk = _mm_add_epi16(chunk, signedBitOffset);
|
||||
const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
|
||||
|
||||
// offLimitQuestionMark contains '?' for each 16 bits that was off-limit
|
||||
// the 16 bits that were correct contains zeros
|
||||
const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
|
||||
|
||||
// correctBytes contains the bytes that were in limit
|
||||
// the 16 bits that were off limits contains zeros
|
||||
const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk);
|
||||
|
||||
// merge offLimitQuestionMark and correctBytes to have the result
|
||||
chunk = _mm_or_si128(correctBytes, offLimitQuestionMark);
|
||||
|
||||
Q_UNUSED(outOfRange);
|
||||
# endif
|
||||
return chunk;
|
||||
};
|
||||
|
||||
// we're going to write to dst[offset..offset+15] (16 bytes)
|
||||
for ( ; dst + offset + 15 < e; offset += 16) {
|
||||
__m128i chunk1 = _mm_loadu_si128((const __m128i*)(src + offset)); // load
|
||||
|
Loading…
Reference in New Issue
Block a user