QString::toLatin1: stop using PCMPESTRM

This instruction is somewhat slow and requires a lot of inputs to be correctly set. Instead, use the PMIN trick, which does have unsigned comparison support. This commit moves the helper function to a lambda inside qt_to_latin1, to make it easier to reuse the constants in the next commit and to avoid warnings of unused static functions. Change-Id: Ib48364abee9f464c96c6fffd152e9e84f4ad3ae8 Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
2018-05-14 13:57:30 -07:00 · 2018-05-14 13:57:30 -07:00 · a9074779cf
commit a9074779cf
parent bd0905db95
1 changed files with 34 additions and 55 deletions
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@ -536,61 +536,6 @@ void qt_from_latin1(ushort *dst, const char *str, size_t size) Q_DECL_NOTHROW
 #endif
 }

-#if defined(__SSE2__)
-static inline __m128i mergeQuestionMarks(__m128i chunk)
-{
-    const __m128i questionMark = _mm_set1_epi16('?');
-
-# ifdef __SSE4_2__
-    // compare the unsigned shorts for the range 0x0100-0xFFFF
-    // note on the use of _mm_cmpestrm:
-    //  The MSDN documentation online (http://technet.microsoft.com/en-us/library/bb514080.aspx)
-    //  says for range search the following:
-    //    For each character c in a, determine whether b0 <= c <= b1 or b2 <= c <= b3
-    //
-    //  However, all examples on the Internet, including from Intel
-    //  (see http://software.intel.com/en-us/articles/xml-parsing-accelerator-with-intel-streaming-simd-extensions-4-intel-sse4/)
-    //  put the range to be searched first
-    //
-    //  Disassembly and instruction-level debugging with GCC and ICC show
-    //  that they are doing the right thing. Inverting the arguments in the
-    //  instruction does cause a bunch of test failures.
-
-    const __m128i rangeMatch = _mm_cvtsi32_si128(0xffff0100);
-    const __m128i offLimitMask = _mm_cmpestrm(rangeMatch, 2, chunk, 8,
-            _SIDD_UWORD_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK);
-
-    // replace the non-Latin 1 characters in the chunk with question marks
-    chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask);
-# else
-    // SSE has no compare instruction for unsigned comparison.
-    // The variables must be shiffted + 0x8000 to be compared
-    const __m128i signedBitOffset = _mm_set1_epi16(short(0x8000));
-    const __m128i thresholdMask = _mm_set1_epi16(short(0xff + 0x8000));
-
-    const __m128i signedChunk = _mm_add_epi16(chunk, signedBitOffset);
-    const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
-
-#  ifdef __SSE4_1__
-    // replace the non-Latin 1 characters in the chunk with question marks
-    chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask);
-#  else
-    // offLimitQuestionMark contains '?' for each 16 bits that was off-limit
-    // the 16 bits that were correct contains zeros
-    const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
-
-    // correctBytes contains the bytes that were in limit
-    // the 16 bits that were off limits contains zeros
-    const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk);
-
-    // merge offLimitQuestionMark and correctBytes to have the result
-    chunk = _mm_or_si128(correctBytes, offLimitQuestionMark);
-#  endif
-# endif
-    return chunk;
-}
-#endif
-
 template <bool Checked>
 static void qt_to_latin1_internal(uchar *dst, const ushort *src, qsizetype length)
 {
@ -598,6 +543,40 @@ static void qt_to_latin1_internal(uchar *dst, const ushort *src, qsizetype lengt
    uchar *e = dst + length;
    qptrdiff offset = 0;

+    const __m128i questionMark = _mm_set1_epi16('?');
+    const __m128i outOfRange = _mm_set1_epi16(0x100);
+
+    auto mergeQuestionMarks = [=](__m128i chunk) {
+        // SSE has no compare instruction for unsigned comparison.
+# ifdef __SSE4_1__
+        // We use an unsigned uc = qMin(uc, 0x100) and then compare for equality.
+        chunk = _mm_min_epu16(chunk, outOfRange);
+        const __m128i offLimitMask = _mm_cmpeq_epi16(chunk, outOfRange);
+        chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask);
+# else
+        // The values must be shifted by +0x8000 so that the signed compare orders them as unsigned
+        const __m128i signedBitOffset = _mm_set1_epi16(short(0x8000));
+        const __m128i thresholdMask = _mm_set1_epi16(short(0xff + 0x8000));
+
+        const __m128i signedChunk = _mm_add_epi16(chunk, signedBitOffset);
+        const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
+
+        // offLimitQuestionMark contains '?' for each 16 bits that was off-limit
+        // the 16 bits that were correct contains zeros
+        const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
+
+        // correctBytes contains the bytes that were in limit
+        // the 16 bits that were off limits contains zeros
+        const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk);
+
+        // merge offLimitQuestionMark and correctBytes to have the result
+        chunk = _mm_or_si128(correctBytes, offLimitQuestionMark);
+
+        Q_UNUSED(outOfRange);
+# endif
+        return chunk;
+    };
+
    // we're going to write to dst[offset..offset+15] (16 bytes)
    for ( ; dst + offset + 15 < e; offset += 16) {
        __m128i chunk1 = _mm_loadu_si128((const __m128i*)(src + offset)); // load