QString: replace #if with if constexpr for the x86 ISA extensions

Because it looks like it works in all our compilers and gets us at
least basic syntax checking. This is important for me because I
usually compile with a very high -march= flag (currently, tigerlake),
so I would otherwise not see syntax errors I introduce in the code
paths for older CPU levels.
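
A minimal illustration of the problem (function names are
hypothetical): with the preprocessor, the compiler never even parses
the inactive branch, so a typo there goes unnoticed until someone
builds for a different CPU level.

    void fast_path();
    void slow_path();

    void dispatch()
    {
    #ifdef __AVX2__
        fast_path();
    #else
        slow_pth();     // typo: when __AVX2__ is defined, the
                        // preprocessor drops this line before the
                        // compiler ever sees it
    #endif
    }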

Whenever possible, this uses a return inside the if constexpr block,
so the fallback code after it is eliminated as dead code but is still
fully compiled and syntax-checked.
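
A sketch of that pattern (the names are hypothetical stand-ins for
the real helpers): the taken branch returns, so the fallback below it
is eliminated, yet a typo in it would still be diagnosed.

    static constexpr bool UseFastPath = true;   // stands in for UseAvx2

    static int sumFast(const int *p, int n)
    {
        // pretend vectorized path: pairwise, two accumulators
        int s0 = 0, s1 = 0, i = 0;
        for ( ; i + 2 <= n; i += 2) {
            s0 += p[i];
            s1 += p[i + 1];
        }
        return i < n ? s0 + s1 + p[i] : s0 + s1;
    }

    static int sum(const int *p, int n)
    {
        if constexpr (UseFastPath)
            return sumFast(p, n);
        // fallback: unreachable and dead-code-eliminated when
        // UseFastPath is true, but still parsed and type-checked
        int s = 0;
        for (int i = 0; i != n; ++i)
            s += p[i];
        return s;
    }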

Almost all changes are basic reindentation, except for a few variables
that must be declared above the block in question, plus some shenanigans
with the variables in qt_to_latin1_internal(): we want to avoid emitting
multiple loads for the same constants, so we force the compiler to reuse
the shrunk version of the 256-bit registers.
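
That trick, sketched here under the assumption of an AVX2-enabled
build: materialize the 256-bit constant once, then derive the 128-bit
copy by shrinking it, so the compiler reuses the single broadcast
instead of loading the same constant a second time.

    #include <immintrin.h>

    static __m128i questionMark128()
    {
        const __m256i questionMark256 = _mm256_set1_epi16('?');
        // the cast is a no-op that aliases the low XMM half of the
        // YMM register, so no second load is emitted
        return _mm256_castsi256_si128(questionMark256);
    }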

The non-AVX2 code will receive a few minor face-lifts in the next few
commits. Those aren't intended to improve performance by much; the
real work will happen in the AVX2 code later. I dropped the -Os code
because I don't have the spare time to test and maintain it; if
there's interest, we can simply disable the entire vector optimization
set.

Change-Id: Ib42b3adc93bf4d43bd55fffd16c1128c1a4d4875
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Reviewed-by: Lars Knoll <lars@knoll.priv.no>
Author: Thiago Macieira
Date:   2021-12-15 20:48:52 -03:00
Parent: 9d27c07e28
Commit: 0cd105079f

@@ -349,7 +349,12 @@ extern "C" void qt_fromlatin1_mips_asm_unroll8 (char16_t*, const char*, uint);
extern "C" void qt_toLatin1_mips_dsp_asm(uchar *dst, const char16_t *src, int length);
#endif
#if defined(__SSE2__) && defined(Q_CC_GNU)
#ifdef __SSE2__
static constexpr bool UseAvx2 =
(qCompilerCpuFeatures & CpuFeatureArchHaswell) == CpuFeatureArchHaswell;
#endif
#ifdef Q_CC_GNU
# if defined(__SANITIZE_ADDRESS__) && Q_CC_GNU < 800 && !defined(Q_CC_CLANG)
# warning "The __attribute__ on below will likely cause a build failure with your GCC version. Your choices are:"
# warning "1) disable ASan;"
@@ -467,23 +472,24 @@ const char16_t *QtPrivate::qustrchr(QStringView str, char16_t c) noexcept
bool loops = true;
// Using the PMOVMSKB instruction, we get two bits for each character
// we compare.
# if defined(__AVX2__) && !defined(__OPTIMIZE_SIZE__)
// we're going to read n[0..15] (32 bytes)
__m256i mch256 = _mm256_set1_epi32(c | (c << 16));
for (const char16_t *next = n + 16; next <= e; n = next, next += 16) {
__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
__m256i result = _mm256_cmpeq_epi16(data, mch256);
uint mask = uint(_mm256_movemask_epi8(result));
if (mask) {
uint idx = qCountTrailingZeroBits(mask);
return n + idx / 2;
__m128i mch;
if constexpr (UseAvx2) {
// we're going to read n[0..15] (32 bytes)
__m256i mch256 = _mm256_set1_epi32(c | (c << 16));
for (const char16_t *next = n + 16; next <= e; n = next, next += 16) {
__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
__m256i result = _mm256_cmpeq_epi16(data, mch256);
uint mask = uint(_mm256_movemask_epi8(result));
if (mask) {
uint idx = qCountTrailingZeroBits(mask);
return n + idx / 2;
}
}
loops = false;
mch = _mm256_castsi256_si128(mch256);
} else {
mch = _mm_set1_epi32(c | (c << 16));
}
loops = false;
__m128i mch = _mm256_castsi256_si128(mch256);
# else
__m128i mch = _mm_set1_epi32(c | (c << 16));
# endif
auto hasMatch = [mch, &n](__m128i data, ushort validityMask) {
__m128i result = _mm_cmpeq_epi16(data, mch);
@@ -551,64 +557,66 @@ static bool simdTestMask(const char *&ptr, const char *end, quint32 maskval)
return false;
};
# if defined(__SSE4_1__)
__m128i mask;
auto updatePtrSimd = [&](__m128i data) {
__m128i masked = _mm_and_si128(mask, data);
__m128i comparison = _mm_cmpeq_epi16(masked, _mm_setzero_si128());
uint result = _mm_movemask_epi8(comparison);
return updatePtr(result);
};
if constexpr (qCompilerCpuFeatures & CpuFeatureSSE4_1) {
__m128i mask;
auto updatePtrSimd = [&](__m128i data) {
__m128i masked = _mm_and_si128(mask, data);
__m128i comparison = _mm_cmpeq_epi16(masked, _mm_setzero_si128());
uint result = _mm_movemask_epi8(comparison);
return updatePtr(result);
};
# if defined(__AVX2__)
// AVX2 implementation: test 32 bytes at a time
const __m256i mask256 = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(maskval));
while (ptr + 32 <= end) {
__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
if (!_mm256_testz_si256(mask256, data)) {
// found a character matching the mask
__m256i masked256 = _mm256_and_si256(mask256, data);
__m256i comparison256 = _mm256_cmpeq_epi16(masked256, _mm256_setzero_si256());
return updatePtr(_mm256_movemask_epi8(comparison256));
if constexpr (UseAvx2) {
// AVX2 implementation: test 32 bytes at a time
const __m256i mask256 = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(maskval));
while (ptr + 32 <= end) {
__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
if (!_mm256_testz_si256(mask256, data)) {
// found a character matching the mask
__m256i masked256 = _mm256_and_si256(mask256, data);
__m256i comparison256 = _mm256_cmpeq_epi16(masked256, _mm256_setzero_si256());
return updatePtr(_mm256_movemask_epi8(comparison256));
}
ptr += 32;
}
mask = _mm256_castsi256_si128(mask256);
} else {
// SSE 4.1 implementation: test 32 bytes at a time (two 16-byte
// comparisons, unrolled)
mask = _mm_set1_epi32(maskval);
while (ptr + 32 <= end) {
__m128i data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
__m128i data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16));
if (!_mm_testz_si128(mask, data1))
return updatePtrSimd(data1);
ptr += 16;
if (!_mm_testz_si128(mask, data2))
return updatePtrSimd(data2);
ptr += 16;
}
}
ptr += 32;
// AVX2 and SSE4.1: final 16-byte comparison
if (ptr + 16 <= end) {
__m128i data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
if (!_mm_testz_si128(mask, data1))
return updatePtrSimd(data1);
ptr += 16;
}
// and final 8-byte comparison
if (ptr + 8 <= end) {
__m128i data1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(ptr));
if (!_mm_testz_si128(mask, data1))
return updatePtrSimd(data1);
ptr += 8;
}
return true;
}
mask = _mm256_castsi256_si128(mask256);
# else
// SSE 4.1 implementation: test 32 bytes at a time (two 16-byte
// comparisons, unrolled)
mask = _mm_set1_epi32(maskval);
while (ptr + 32 <= end) {
__m128i data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
__m128i data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16));
if (!_mm_testz_si128(mask, data1))
return updatePtrSimd(data1);
ptr += 16;
if (!_mm_testz_si128(mask, data2))
return updatePtrSimd(data2);
ptr += 16;
}
# endif
// AVX2 and SSE4.1: final 16-byte comparison
if (ptr + 16 <= end) {
__m128i data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
if (!_mm_testz_si128(mask, data1))
return updatePtrSimd(data1);
ptr += 16;
}
// and final 8-byte comparison
if (ptr + 8 <= end) {
__m128i data1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(ptr));
if (!_mm_testz_si128(mask, data1))
return updatePtrSimd(data1);
ptr += 8;
}
# else
// SSE2 implementation: test 16 bytes at a time.
const __m128i mask = _mm_set1_epi32(maskval);
while (ptr + 16 <= end) {
@@ -631,7 +639,6 @@ static bool simdTestMask(const char *&ptr, const char *end, quint32 maskval)
return updatePtr(result);
ptr += 8;
}
# endif
return true;
}
@@ -639,16 +646,16 @@ static bool simdTestMask(const char *&ptr, const char *end, quint32 maskval)
static Q_ALWAYS_INLINE __m128i mm_load8_zero_extend(const void *ptr)
{
const __m128i *dataptr = static_cast<const __m128i *>(ptr);
#if defined(__SSE4_1__)
// use a MOVQ followed by PMOVZXBW
// if AVX2 is present, these should combine into a single VPMOVZXBW instruction
__m128i data = _mm_loadl_epi64(dataptr);
return _mm_cvtepu8_epi16(data);
# else
if constexpr (qCompilerCpuFeatures & CpuFeatureSSE4_1) {
// use a MOVQ followed by PMOVZXBW
// if AVX2 is present, these should combine into a single VPMOVZXBW instruction
__m128i data = _mm_loadl_epi64(dataptr);
return _mm_cvtepu8_epi16(data);
}
// use MOVQ followed by PUNPCKLBW
__m128i data = _mm_loadl_epi64(dataptr);
return _mm_unpacklo_epi8(data, _mm_setzero_si128());
# endif
}
#endif
@@ -659,19 +666,19 @@ bool qt_is_ascii(const char *&ptr, const char *end) noexcept
#if defined(__SSE2__)
// Testing for the high bit can be done efficiently with just PMOVMSKB
bool loops = true;
# if defined(__AVX2__)
while (ptr + 32 <= end) {
__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
quint32 mask = _mm256_movemask_epi8(data);
if (mask) {
uint idx = qCountTrailingZeroBits(mask);
ptr += idx;
return false;
if constexpr (UseAvx2) {
while (ptr + 32 <= end) {
__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
quint32 mask = _mm256_movemask_epi8(data);
if (mask) {
uint idx = qCountTrailingZeroBits(mask);
ptr += idx;
return false;
}
ptr += 32;
}
ptr += 32;
loops = false;
}
loops = false;
# endif
while (ptr + 16 <= end) {
__m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
@@ -802,23 +809,23 @@ Q_CORE_EXPORT void qt_from_latin1(char16_t *dst, const char *str, size_t size) n
// we're going to read str[offset..offset+15] (16 bytes)
for ( ; str + offset + 15 < e; offset += 16) {
const __m128i chunk = _mm_loadu_si128((const __m128i*)(str + offset)); // load
#ifdef __AVX2__
// zero extend to an YMM register
const __m256i extended = _mm256_cvtepu8_epi16(chunk);
if constexpr (UseAvx2) {
// zero extend to an YMM register
const __m256i extended = _mm256_cvtepu8_epi16(chunk);
// store
_mm256_storeu_si256((__m256i*)(dst + offset), extended);
#else
const __m128i nullMask = _mm_set1_epi32(0);
// store
_mm256_storeu_si256((__m256i*)(dst + offset), extended);
} else {
const __m128i nullMask = _mm_set1_epi32(0);
// unpack the first 8 bytes, padding with zeros
const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
_mm_storeu_si128((__m128i*)(dst + offset), firstHalf); // store
// unpack the first 8 bytes, padding with zeros
const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
_mm_storeu_si128((__m128i*)(dst + offset), firstHalf); // store
// unpack the last 8 bytes, padding with zeros
const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
_mm_storeu_si128((__m128i*)(dst + offset + 8), secondHalf); // store
#endif
// unpack the last 8 bytes, padding with zeros
const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
_mm_storeu_si128((__m128i*)(dst + offset + 8), secondHalf); // store
}
}
// we're going to read str[offset..offset+7] (8 bytes)
@@ -855,24 +862,36 @@ static void qt_to_latin1_internal(uchar *dst, const char16_t *src, qsizetype len
uchar *e = dst + length;
qptrdiff offset = 0;
# ifdef __AVX2__
const __m256i questionMark256 = _mm256_broadcastw_epi16(_mm_cvtsi32_si128('?'));
const __m256i outOfRange256 = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(0x100));
const __m128i questionMark = _mm256_castsi256_si128(questionMark256);
const __m128i outOfRange = _mm256_castsi256_si128(outOfRange256);
# else
const __m128i questionMark = _mm_set1_epi16('?');
const __m128i outOfRange = _mm_set1_epi16(0x100);
# endif
auto questionMark256 = []() {
if constexpr (UseAvx2)
return _mm256_broadcastw_epi16(_mm_cvtsi32_si128('?'));
else
return 0;
}();
auto outOfRange256 = []() {
if constexpr (UseAvx2)
return _mm256_broadcastw_epi16(_mm_cvtsi32_si128(0x100));
else
return 0;
}();
__m128i questionMark, outOfRange;
if constexpr (UseAvx2) {
questionMark = _mm256_castsi256_si128(questionMark256);
outOfRange = _mm256_castsi256_si128(outOfRange256);
} else {
questionMark = _mm_set1_epi16('?');
outOfRange = _mm_set1_epi16(0x100);
}
auto mergeQuestionMarks = [=](__m128i chunk) {
// SSE has no compare instruction for unsigned comparison.
# ifdef __SSE4_1__
// We use an unsigned uc = qMin(uc, 0x100) and then compare for equality.
chunk = _mm_min_epu16(chunk, outOfRange);
const __m128i offLimitMask = _mm_cmpeq_epi16(chunk, outOfRange);
chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask);
# else
if constexpr (qCompilerCpuFeatures & CpuFeatureSSE4_1) {
// We use an unsigned uc = qMin(uc, 0x100) and then compare for equality.
chunk = _mm_min_epu16(chunk, outOfRange);
const __m128i offLimitMask = _mm_cmpeq_epi16(chunk, outOfRange);
chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask);
return chunk;
}
// The variables must be shifted by +0x8000 to be compared
const __m128i signedBitOffset = _mm_set1_epi16(short(0x8000));
const __m128i thresholdMask = _mm_set1_epi16(short(0xff + 0x8000));
@@ -892,33 +911,33 @@ static void qt_to_latin1_internal(uchar *dst, const char16_t *src, qsizetype len
chunk = _mm_or_si128(correctBytes, offLimitQuestionMark);
Q_UNUSED(outOfRange);
# endif
return chunk;
};
// we're going to write to dst[offset..offset+15] (16 bytes)
for ( ; dst + offset + 15 < e; offset += 16) {
# if defined(__AVX2__)
__m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + offset));
if (Checked) {
// See mergeQuestionMarks lambda above for details
chunk = _mm256_min_epu16(chunk, outOfRange256);
const __m256i offLimitMask = _mm256_cmpeq_epi16(chunk, outOfRange256);
chunk = _mm256_blendv_epi8(chunk, questionMark256, offLimitMask);
__m128i chunk1, chunk2;
if constexpr (UseAvx2) {
__m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + offset));
if (Checked) {
// See mergeQuestionMarks lambda above for details
chunk = _mm256_min_epu16(chunk, outOfRange256);
const __m256i offLimitMask = _mm256_cmpeq_epi16(chunk, outOfRange256);
chunk = _mm256_blendv_epi8(chunk, questionMark256, offLimitMask);
}
chunk2 = _mm256_extracti128_si256(chunk, 1);
chunk1 = _mm256_castsi256_si128(chunk);
} else {
chunk1 = _mm_loadu_si128((const __m128i*)(src + offset)); // load
if (Checked)
chunk1 = mergeQuestionMarks(chunk1);
chunk2 = _mm_loadu_si128((const __m128i*)(src + offset + 8)); // load
if (Checked)
chunk2 = mergeQuestionMarks(chunk2);
}
const __m128i chunk2 = _mm256_extracti128_si256(chunk, 1);
const __m128i chunk1 = _mm256_castsi256_si128(chunk);
# else
__m128i chunk1 = _mm_loadu_si128((const __m128i*)(src + offset)); // load
if (Checked)
chunk1 = mergeQuestionMarks(chunk1);
__m128i chunk2 = _mm_loadu_si128((const __m128i*)(src + offset + 8)); // load
if (Checked)
chunk2 = mergeQuestionMarks(chunk2);
# endif
// pack the two vectors into 16 x 8-bit elements
const __m128i result = _mm_packus_epi16(chunk1, chunk2);
_mm_storeu_si128((__m128i*)(dst + offset), result); // store
@@ -1131,20 +1150,21 @@ static int ucstrncmp(const char16_t *a, const char16_t *b, size_t l)
// we're going to read a[0..15] and b[0..15] (32 bytes)
for ( ; end - a >= offset + 16; offset += 16) {
#ifdef __AVX2__
__m256i a_data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(a + offset));
__m256i b_data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(b + offset));
__m256i result = _mm256_cmpeq_epi16(a_data, b_data);
uint mask = _mm256_movemask_epi8(result);
#else
__m128i a_data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset));
__m128i a_data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset + 8));
__m128i b_data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset));
__m128i b_data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset + 8));
__m128i result1 = _mm_cmpeq_epi16(a_data1, b_data1);
__m128i result2 = _mm_cmpeq_epi16(a_data2, b_data2);
uint mask = _mm_movemask_epi8(result1) | (_mm_movemask_epi8(result2) << 16);
#endif
uint mask;
if constexpr (UseAvx2) {
__m256i a_data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(a + offset));
__m256i b_data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(b + offset));
__m256i result = _mm256_cmpeq_epi16(a_data, b_data);
mask = _mm256_movemask_epi8(result);
} else {
__m128i a_data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset));
__m128i a_data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset + 8));
__m128i b_data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset));
__m128i b_data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset + 8));
__m128i result1 = _mm_cmpeq_epi16(a_data1, b_data1);
__m128i result2 = _mm_cmpeq_epi16(a_data2, b_data2);
mask = _mm_movemask_epi8(result1) | (_mm_movemask_epi8(result2) << 16);
}
mask = ~mask;
if (mask) {
// found a different character
@@ -1252,30 +1272,31 @@ static int ucstrncmp(const char16_t *a, const char *b, size_t l)
for ( ; uc + offset + 15 < e; offset += 16) {
// similar to fromLatin1_helper:
// load 16 bytes of Latin 1 data
uint mask;
__m128i chunk = _mm_loadu_si128((const __m128i*)(c + offset));
# ifdef __AVX2__
// expand Latin 1 data via zero extension
__m256i ldata = _mm256_cvtepu8_epi16(chunk);
if constexpr (UseAvx2) {
// expand Latin 1 data via zero extension
__m256i ldata = _mm256_cvtepu8_epi16(chunk);
// load UTF-16 data and compare
__m256i ucdata = _mm256_loadu_si256((const __m256i*)(uc + offset));
__m256i result = _mm256_cmpeq_epi16(ldata, ucdata);
// load UTF-16 data and compare
__m256i ucdata = _mm256_loadu_si256((const __m256i*)(uc + offset));
__m256i result = _mm256_cmpeq_epi16(ldata, ucdata);
uint mask = ~_mm256_movemask_epi8(result);
# else
// expand via unpacking
__m128i firstHalf = _mm_unpacklo_epi8(chunk, nullmask);
__m128i secondHalf = _mm_unpackhi_epi8(chunk, nullmask);
mask = ~_mm256_movemask_epi8(result);
} else {
// expand via unpacking
__m128i firstHalf = _mm_unpacklo_epi8(chunk, nullmask);
__m128i secondHalf = _mm_unpackhi_epi8(chunk, nullmask);
// load UTF-16 data and compare
__m128i ucdata1 = _mm_loadu_si128((const __m128i*)(uc + offset));
__m128i ucdata2 = _mm_loadu_si128((const __m128i*)(uc + offset + 8));
__m128i result1 = _mm_cmpeq_epi16(firstHalf, ucdata1);
__m128i result2 = _mm_cmpeq_epi16(secondHalf, ucdata2);
// load UTF-16 data and compare
__m128i ucdata1 = _mm_loadu_si128((const __m128i*)(uc + offset));
__m128i ucdata2 = _mm_loadu_si128((const __m128i*)(uc + offset + 8));
__m128i result1 = _mm_cmpeq_epi16(firstHalf, ucdata1);
__m128i result2 = _mm_cmpeq_epi16(secondHalf, ucdata2);
uint mask = ~(_mm_movemask_epi8(result1) | _mm_movemask_epi8(result2) << 16);
# endif
mask = ~(_mm_movemask_epi8(result1) | _mm_movemask_epi8(result2) << 16);
}
if (mask) {
// found a different character
if (Mode == CompareStringsForEquality)