Improve the code generation for the Latin1 codec

This change does not modify the algorithm that is implemented. It only restructures the source code so that the compiler generates better code:

- change only one variable per loop (the "offset" variable)
- unroll the tail expansion of the last 15 characters

The Neon code for the toLatin1 codec would most likely benefit from the unrolling of the tail too, but I can't verify that I haven't broken anything, so that code path is left functionally unchanged.

Change-Id: I8a92fd3c1aa700e6f8b0c8ebdb1978ade394757f
Reviewed-by: Olivier Goffart <ogoffart@woboq.com>
2014-01-16 15:25:50 -08:00 · 2014-01-16 15:25:50 -08:00 · f7308e007e
commit f7308e007e
parent ab3637dd67
1 changed file with 73 additions and 58 deletions
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@ -190,6 +190,16 @@ template <uint MaxCount> struct UnrollTailLoop

        return UnrollTailLoop<MaxCount - 1>::exec(count - 1, returnIfExited, loopCheck, returnIfFailed, i + 1);
    }
+
+    template <typename Functor>
+    static inline void exec(int count, Functor code)
+    {
+        /* equivalent to:
+         *   for (int i = 0; i < count; ++i)
+         *       code(i);
+         */
+        exec(count, 0, [=](int i) -> bool { code(i); return false; }, [](int) { return 0; });
+    }
 };
 template <> template <typename RetType, typename Functor1, typename Functor2>
 inline RetType UnrollTailLoop<0>::exec(int, RetType returnIfExited, Functor1, Functor2, int)
@ -207,25 +217,29 @@ static void qt_from_latin1(ushort *dst, const char *str, size_t size)
     * The same method gives no improvement with NEON.
     */
 #if defined(__SSE2__)
-    if (size >= 16) {
-        int chunkCount = size >> 4; // divided by 16
+    const char *e = str + size;
+    qptrdiff offset = 0;
+
+    // we're going to read str[offset..offset+15] (16 bytes)
+    for ( ; str + offset + 15 < e; offset += 16) {
        const __m128i nullMask = _mm_set1_epi32(0);
-        for (int i = 0; i < chunkCount; ++i) {
-            const __m128i chunk = _mm_loadu_si128((__m128i*)str); // load
-            str += 16;
+        const __m128i chunk = _mm_loadu_si128((__m128i*)(str + offset)); // load

-            // unpack the first 8 bytes, padding with zeros
-            const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
-            _mm_storeu_si128((__m128i*)dst, firstHalf); // store
-            dst += 8;
+        // unpack the first 8 bytes, padding with zeros
+        const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
+        _mm_storeu_si128((__m128i*)(dst + offset), firstHalf); // store

-            // unpack the last 8 bytes, padding with zeros
-            const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
-            _mm_storeu_si128((__m128i*)dst, secondHalf); // store
-            dst += 8;
-        }
-        size = size % 16;
+        // unpack the last 8 bytes, padding with zeros
+        const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
+        _mm_storeu_si128((__m128i*)(dst + offset + 8), secondHalf); // store
    }
+
+    size = size % 16;
+    dst += offset;
+    str += offset;
+#  ifdef Q_COMPILER_LAMBDA
+    return UnrollTailLoop<15>::exec(size, [=](int i) { dst[i] = (uchar)str[i]; });
+#  endif
 #endif
 #if defined(__mips_dsp)
    if (size > 20)
@ -295,61 +309,62 @@ static inline __m128i mergeQuestionMarks(__m128i chunk)

 static void qt_to_latin1(uchar *dst, const ushort *src, int length)
 {
-    if (length) {
 #if defined(__SSE2__)
-        if (length >= 16) {
-            const int chunkCount = length >> 4; // divided by 16
+    uchar *e = dst + length;
+    qptrdiff offset = 0;

-            for (int i = 0; i < chunkCount; ++i) {
-                __m128i chunk1 = _mm_loadu_si128((__m128i*)src); // load
-                chunk1 = mergeQuestionMarks(chunk1);
-                src += 8;
+    // we're going to write to dst[offset..offset+15] (16 bytes)
+    for ( ; dst + offset + 15 < e; offset += 16) {
+        __m128i chunk1 = _mm_loadu_si128((__m128i*)(src + offset)); // load
+        chunk1 = mergeQuestionMarks(chunk1);

-                __m128i chunk2 = _mm_loadu_si128((__m128i*)src); // load
-                chunk2 = mergeQuestionMarks(chunk2);
-                src += 8;
+        __m128i chunk2 = _mm_loadu_si128((__m128i*)(src + offset + 8)); // load
+        chunk2 = mergeQuestionMarks(chunk2);

-                // pack the two vector to 16 x 8bits elements
-                const __m128i result = _mm_packus_epi16(chunk1, chunk2);
+        // pack the two vector to 16 x 8bits elements
+        const __m128i result = _mm_packus_epi16(chunk1, chunk2);
+        _mm_storeu_si128((__m128i*)(dst + offset), result); // store
+    }

-                _mm_storeu_si128((__m128i*)dst, result); // store
-                dst += 16;
-            }
-            length = length % 16;
-        }
+    length = length % 16;
+    dst += offset;
+    src += offset;
+
+#  ifdef Q_COMPILER_LAMBDA
+    return UnrollTailLoop<15>::exec(length, [=](int i) { dst[i] = (src[i]>0xff) ? '?' : (uchar) src[i]; });
+#  endif
 #elif defined(__ARM_NEON__)
-        // Refer to the documentation of the SSE2 implementation
-        // this use eactly the same method as for SSE except:
-        // 1) neon has unsigned comparison
-        // 2) packing is done to 64 bits (8 x 8bits component).
-        if (length >= 16) {
-            const int chunkCount = length >> 3; // divided by 8
-            const uint16x8_t questionMark = vdupq_n_u16('?'); // set
-            const uint16x8_t thresholdMask = vdupq_n_u16(0xff); // set
-            for (int i = 0; i < chunkCount; ++i) {
-                uint16x8_t chunk = vld1q_u16((uint16_t *)src); // load
-                src += 8;
+    // Refer to the documentation of the SSE2 implementation
+    // this use eactly the same method as for SSE except:
+    // 1) neon has unsigned comparison
+    // 2) packing is done to 64 bits (8 x 8bits component).
+    if (length >= 16) {
+        const int chunkCount = length >> 3; // divided by 8
+        const uint16x8_t questionMark = vdupq_n_u16('?'); // set
+        const uint16x8_t thresholdMask = vdupq_n_u16(0xff); // set
+        for (int i = 0; i < chunkCount; ++i) {
+            uint16x8_t chunk = vld1q_u16((uint16_t *)src); // load
+            src += 8;

-                const uint16x8_t offLimitMask = vcgtq_u16(chunk, thresholdMask); // chunk > thresholdMask
-                const uint16x8_t offLimitQuestionMark = vandq_u16(offLimitMask, questionMark); // offLimitMask & questionMark
-                const uint16x8_t correctBytes = vbicq_u16(chunk, offLimitMask); // !offLimitMask & chunk
-                chunk = vorrq_u16(correctBytes, offLimitQuestionMark); // correctBytes | offLimitQuestionMark
-                const uint8x8_t result = vmovn_u16(chunk); // narrowing move->packing
-                vst1_u8(dst, result); // store
-                dst += 8;
-            }
-            length = length % 8;
+            const uint16x8_t offLimitMask = vcgtq_u16(chunk, thresholdMask); // chunk > thresholdMask
+            const uint16x8_t offLimitQuestionMark = vandq_u16(offLimitMask, questionMark); // offLimitMask & questionMark
+            const uint16x8_t correctBytes = vbicq_u16(chunk, offLimitMask); // !offLimitMask & chunk
+            chunk = vorrq_u16(correctBytes, offLimitQuestionMark); // correctBytes | offLimitQuestionMark
+            const uint8x8_t result = vmovn_u16(chunk); // narrowing move->packing
+            vst1_u8(dst, result); // store
+            dst += 8;
        }
+        length = length % 8;
+    }
 #endif
 #if defined(__mips_dsp)
-        qt_toLatin1_mips_dsp_asm(dst, src, length);
+    qt_toLatin1_mips_dsp_asm(dst, src, length);
 #else
-        while (length--) {
-            *dst++ = (*src>0xff) ? '?' : (uchar) *src;
-            ++src;
-        }
-#endif
+    while (length--) {
+        *dst++ = (*src>0xff) ? '?' : (uchar) *src;
+        ++src;
    }
+#endif
 }

 // Unicode case-insensitive comparison