From a9074779cf1425b76c010b891403a0521e2cb4e4 Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Mon, 14 May 2018 13:57:30 -0700 Subject: [PATCH] QString::toLatin1: stop using PCMPESTRM This instruction is somewhat slow and requires a lot of inputs to be correctly set. Instead, use the PMIN trick, which does have unsigned comparison support. This commit moves the helper function to a lambda inside qt_to_latin1, to make it easier to reuse the constants in the next commit and to avoid warnings of unused static functions. Change-Id: Ib48364abee9f464c96c6fffd152e9e84f4ad3ae8 Reviewed-by: Allan Sandfeld Jensen --- src/corelib/tools/qstring.cpp | 89 +++++++++++++---------------------- 1 file changed, 34 insertions(+), 55 deletions(-) diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index d045913b87..639f6f8968 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -536,61 +536,6 @@ void qt_from_latin1(ushort *dst, const char *str, size_t size) Q_DECL_NOTHROW #endif } -#if defined(__SSE2__) -static inline __m128i mergeQuestionMarks(__m128i chunk) -{ - const __m128i questionMark = _mm_set1_epi16('?'); - -# ifdef __SSE4_2__ - // compare the unsigned shorts for the range 0x0100-0xFFFF - // note on the use of _mm_cmpestrm: - // The MSDN documentation online (http://technet.microsoft.com/en-us/library/bb514080.aspx) - // says for range search the following: - // For each character c in a, determine whether b0 <= c <= b1 or b2 <= c <= b3 - // - // However, all examples on the Internet, including from Intel - // (see http://software.intel.com/en-us/articles/xml-parsing-accelerator-with-intel-streaming-simd-extensions-4-intel-sse4/) - // put the range to be searched first - // - // Disassembly and instruction-level debugging with GCC and ICC show - // that they are doing the right thing. Inverting the arguments in the - // instruction does cause a bunch of test failures. 
- - const __m128i rangeMatch = _mm_cvtsi32_si128(0xffff0100); - const __m128i offLimitMask = _mm_cmpestrm(rangeMatch, 2, chunk, 8, - _SIDD_UWORD_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK); - - // replace the non-Latin 1 characters in the chunk with question marks - chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask); -# else - // SSE has no compare instruction for unsigned comparison. - // The variables must be shiffted + 0x8000 to be compared - const __m128i signedBitOffset = _mm_set1_epi16(short(0x8000)); - const __m128i thresholdMask = _mm_set1_epi16(short(0xff + 0x8000)); - - const __m128i signedChunk = _mm_add_epi16(chunk, signedBitOffset); - const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask); - -# ifdef __SSE4_1__ - // replace the non-Latin 1 characters in the chunk with question marks - chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask); -# else - // offLimitQuestionMark contains '?' for each 16 bits that was off-limit - // the 16 bits that were correct contains zeros - const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark); - - // correctBytes contains the bytes that were in limit - // the 16 bits that were off limits contains zeros - const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk); - - // merge offLimitQuestionMark and correctBytes to have the result - chunk = _mm_or_si128(correctBytes, offLimitQuestionMark); -# endif -# endif - return chunk; -} -#endif - template <bool Checked> static void qt_to_latin1_internal(uchar *dst, const ushort *src, qsizetype length) { @@ -598,6 +543,40 @@ static void qt_to_latin1_internal(uchar *dst, const ushort *src, qsizetype lengt uchar *e = dst + length; qptrdiff offset = 0; + const __m128i questionMark = _mm_set1_epi16('?'); + const __m128i outOfRange = _mm_set1_epi16(0x100); + + auto mergeQuestionMarks = [=](__m128i chunk) { + // SSE has no compare instruction for unsigned comparison. 
+# ifdef __SSE4_1__ + // We use an unsigned uc = qMin(uc, 0x100) and then compare for equality. + chunk = _mm_min_epu16(chunk, outOfRange); + const __m128i offLimitMask = _mm_cmpeq_epi16(chunk, outOfRange); + chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask); +# else + // The variables must be shifted + 0x8000 to be compared + const __m128i signedBitOffset = _mm_set1_epi16(short(0x8000)); + const __m128i thresholdMask = _mm_set1_epi16(short(0xff + 0x8000)); + + const __m128i signedChunk = _mm_add_epi16(chunk, signedBitOffset); + const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask); + + // offLimitQuestionMark contains '?' for each 16 bits that was off-limit + // the 16 bits that were correct contains zeros + const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark); + + // correctBytes contains the bytes that were in limit + // the 16 bits that were off limits contains zeros + const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk); + + // merge offLimitQuestionMark and correctBytes to have the result + chunk = _mm_or_si128(correctBytes, offLimitQuestionMark); + + Q_UNUSED(outOfRange); +# endif + return chunk; + }; + // we're going to write to dst[offset..offset+15] (16 bytes) for ( ; dst + offset + 15 < e; offset += 16) { __m128i chunk1 = _mm_loadu_si128((const __m128i*)(src + offset)); // load