Add support for UTF-8 encoding/decoding with SIMD

Decoding from UTF-8 is easy: if the high bit is set, we fall back to the byte-by-byte decoding. Encoding to UTF-8 requires a little bit more work: to detect anything between 0x0080 and 0xffff, we have several options but none as easy as above. Multiple alternatives are in the benchmark code. In both loops, we do two things once we run into a non-ASCII character: first, we continue the loop for the remainder of ASCII characters in the buffer (which we can tell by checking the bits set in the mask), then we find the last non-ASCII character in that 16-character group, so we don't reenter the SSE code too soon. For the UTF-8 encoding, I have chosen the alternative that results in the best performance. It's closely tied to the alternative running the PMIN instruction, but that requires SSE 4.1. It's not worth the complexity. And quite counter-intuitively, the dedicated string instruction from SSE 4.2 performs most poorly of all solutions. This begs re-visiting the performance of the toLatin1 encoder. The best of 10 benchmark runs of this code were measured on my SandyBridge CPU @ 2.66 GHz (turbo @ 3.3 GHz), both as CPU cycles and as CPU ticks: Compared to: ICU Qt 4.7 non-SSE Qt 5.3 Data set fromUtf8 toUtf8 fromUtf8 toUtf8 fromUtf8 toUtf8 ASCII only 7.50x 6.22x 6.94x 7.60x 4.45x 4.90x 2-char UTF-8 1.17x 1.33x 1.64x 1.56x 1.01x 1.02x 3-char UTF-8 1.08x 1.18x 1.48x 1.33x 0.97x 0.92x 4-char UTF-8 1.05x 1.19x 1.20x 1.21x 0.97x 0.97x Creator data 3.62x 2.16x 2.60x 1.25x 1.78x 1.23x As shown by the numbers, the SSE-based code is slightly worse than the non-SSE code for dense non-ASCII strings. However, as evident in the Qt Creator data, most strings manipulated by applications are either pure ASCII or mostly so, so there's a net gain. Done-with: H. Peter Anvin <hpa@linux.intel.com> Change-Id: Ia74fbdfdcd7b088f6cba5048c03a153c01f5dbc1 Reviewed-by: Lars Knoll <lars.knoll@digia.com>
2013-10-26 15:53:40 -04:00 · 2013-10-26 15:53:40 -04:00 · 34821e226a
commit 34821e226a
parent bc91dc4895
2 changed files with 163 additions and 20 deletions
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@ -45,10 +45,97 @@
 #include "qendian.h"
 #include "qchar.h"

+#include "private/qsimd_p.h"
+
 QT_BEGIN_NAMESPACE

 enum { Endian = 0, Data = 1 };

+#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)
+static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
+{
+    // do sixteen characters at a time
+    for ( ; end - src >= 16; src += 16, dst += 16) {
+        __m128i data1 = _mm_loadu_si128((__m128i*)src);
+        __m128i data2 = _mm_loadu_si128(1+(__m128i*)src);
+
+
+        // check if everything is ASCII
+        // the highest ASCII value is U+007F
+        // Do the packing directly:
+        // The PACKUSWB instruction has packs a signed 16-bit integer to an unsigned 8-bit
+        // with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff,
+        // while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII,
+        // we simply do a signed greater-than comparison to 0x00. That means we detect NULs as
+        // "non-ASCII", but it's an acceptable compromise.
+        __m128i packed = _mm_packus_epi16(data1, data2);
+        __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
+
+        // n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
+        ushort n = ~_mm_movemask_epi8(nonAscii);
+        if (n) {
+            // copy the front part that is still ASCII
+            while (!(n & 1)) {
+                *dst++ = *src++;
+                n >>= 1;
+            }
+
+            // find the next probable ASCII character
+            // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
+            // characters still coming
+            n = _bit_scan_reverse(n);
+            nextAscii = src + n;
+            return false;
+        }
+
+        // pack
+        _mm_storeu_si128((__m128i*)dst, packed);
+    }
+    return src == end;
+}
+
+static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
+{
+    // do sixteen characters at a time
+    for ( ; end - src >= 16; src += 16, dst += 16) {
+        __m128i data = _mm_loadu_si128((__m128i*)src);
+
+        // check if everything is ASCII
+        // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
+        uint n = _mm_movemask_epi8(data);
+        if (n) {
+            // copy the front part that is still ASCII
+            while (!(n & 1)) {
+                *dst++ = *src++;
+                n >>= 1;
+            }
+
+            // find the next probable ASCII character
+            // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
+            // characters still coming
+            n = _bit_scan_reverse(n);
+            nextAscii = src + n;
+            return false;
+        }
+
+        // unpack
+        _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
+        _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
+    }
+    return src == end;
+}
+#else
+static inline bool simdEncodeAscii(uchar *, const ushort *, const ushort *, const ushort *)
+{
+    return false;
+}
+
+static inline bool simdDecodeAscii(ushort *, const uchar *, const uchar *, const uchar *)
+{
+    return false;
+}
+#endif
+
 QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len)
 {
    // create a QByteArray with the worst case scenario size
@ -58,12 +145,18 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len)
    const ushort *const end = src + len;

    while (src != end) {
-        ushort uc = *src++;
-        int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, dst, src, end);
-        if (res < 0) {
-            // encoding error - append '?'
-            *dst++ = '?';
-        }
+        const ushort *nextAscii = end;
+        if (simdEncodeAscii(dst, nextAscii, src, end))
+            break;
+
+        do {
+            ushort uc = *src++;
+            int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, dst, src, end);
+            if (res < 0) {
+                // encoding error - append '?'
+                *dst++ = '?';
+            }
+        } while (src < nextAscii);
    }

    result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData())));
@ -98,10 +191,21 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
        *cursor++ = 0xbf;
    }

+    const ushort *nextAscii = src;
    while (src != end) {
-        ushort uc = surrogate_high == -1 ? *src++ : surrogate_high;
-        surrogate_high = -1;
-        int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
+        int res;
+        ushort uc;
+        if (surrogate_high != -1) {
+            uc = surrogate_high;
+            surrogate_high = -1;
+            res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
+        } else {
+            if (src >= nextAscii && simdEncodeAscii(cursor, nextAscii, src, end))
+                break;
+
+            uc = *src++;
+            res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
+        }
        if (Q_LIKELY(res >= 0))
            continue;

@ -136,12 +240,18 @@ QString QUtf8::convertToUnicode(const char *chars, int len)
    const uchar *end = src + len;

    while (src < end) {
-        uchar b = *src++;
-        int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
-        if (res < 0) {
-            // decoding error
-            *dst++ = QChar::ReplacementCharacter;
-        }
+        const uchar *nextAscii = end;
+        if (simdDecodeAscii(dst, nextAscii, src, end))
+            break;
+
+        do {
+            uchar b = *src++;
+            int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
+            if (res < 0) {
+                // decoding error
+                *dst++ = QChar::ReplacementCharacter;
+            }
+        } while (src < nextAscii);
    }

    result.truncate(dst - reinterpret_cast<const ushort *>(result.constData()));
@ -204,7 +314,11 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte

    // main body, stateless decoding
    res = 0;
+    const uchar *nextAscii = src;
    while (res >= 0 && src < end) {
+        if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
+            break;
+
        ch = *src++;
        res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
        if (!headerdone && res >= 0) {
--- a/src/corelib/tools/qsimd_p.h
+++ b/src/corelib/tools/qsimd_p.h
@ -72,7 +72,7 @@
 * I = intrinsics; C = code generation
 */

-#ifdef __MINGW64_VERSION_MAJOR
+#if defined(__MINGW64_VERSION_MAJOR) || (defined(Q_CC_MSVC) && !defined(Q_OS_WINCE))
 #include <intrin.h>
 #endif

@ -139,10 +139,15 @@
 #endif

 // other x86 intrinsics
-#if defined(QT_COMPILER_SUPPORTS_AVX) && defined(Q_CC_GNU) && \
-    (!defined(Q_CC_INTEL)|| __INTEL_COMPILER >= 1310 || (__GNUC__ * 100 + __GNUC_MINOR__ < 407))
-#define QT_COMPILER_SUPPORTS_X86INTRIN
-#include <x86intrin.h>
+#if defined(Q_PROCESSOR_X86) && ((defined(Q_CC_GNU) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 404)) \
+    || (defined(Q_CC_CLANG) && (__clang_major__ * 100 + __clang_minor__ >= 208)) \
+    || defined(Q_CC_INTEL))
+#  define QT_COMPILER_SUPPORTS_X86INTRIN
+#  ifndef Q_CC_INTEL
+// The Intel compiler has no <x86intrin.h> -- all intrinsics are in <immintrin.h>;
+// GCC 4.4 and Clang 2.8 added a few more intrinsics there
+#    include <x86intrin.h>
+#  endif
 #endif

 // NEON intrinsics
@ -241,6 +246,30 @@ static inline uint qCpuFeatures()

 #define qCpuHasFeature(feature)  ((qCompilerCpuFeatures & (feature)) || (qCpuFeatures() & (feature)))

+#ifdef Q_PROCESSOR_X86
+// Bit scan functions for x86
+#  ifdef Q_CC_MSVC
+// MSVC calls it _BitScanReverse and returns the carry flag, which we don't need
+static __forceinline unsigned long _bit_scan_reverse(uint val)
+{
+    unsigned long result;
+    _BitScanReverse(&result, val);
+    return result;
+}
+#  elif (defined(Q_CC_CLANG) || (defined(Q_CC_GNU) && __GNUC__ * 100 + __GNUC_MINOR__ < 405)) \
+    && !defined(Q_CC_INTEL)
+// Clang is missing the intrinsic for _bit_scan_reverse
+// GCC only added it in version 4.5
+static inline __attribute__((always_inline))
+unsigned _bit_scan_reverse(unsigned val)
+{
+    unsigned result;
+    asm("bsr %1, %0" : "=r" (result) : "r" (val));
+    return result;
+}
+#  endif
+#endif // Q_PROCESSOR_X86
+
 #define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \
    for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i)