From 309d3557ca832c42b8fbd372b957af51510b159e Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Sun, 26 Jan 2014 23:45:27 -0800 Subject: [PATCH] Improve a few string operations with AVX2 AVX2 brings the new PMOVZXBW instruction that extends from one 128-bit SSE register to an 256-bit AVX register. With that, the main decoding code is just two instructions (the loop requires a couple more to maintain the offset counter and do the end-of-loop check). This buys us another 4% performance improvement in the fromLatin1 code, calculated on top of the VEX-encoded SSE2 code (which is already a little better than plain SSE2). Change-Id: I675fa24de4fa97683b662f19d146047251f77359 Reviewed-by: Allan Sandfeld Jensen --- src/corelib/codecs/qutfcodec.cpp | 47 ++++++++++++++++++++++---------- src/corelib/tools/qstring.cpp | 25 +++++++++++++++-- 2 files changed, 55 insertions(+), 17 deletions(-) diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp index 072cda63aa..c0f26ad803 100644 --- a/src/corelib/codecs/qutfcodec.cpp +++ b/src/corelib/codecs/qutfcodec.cpp @@ -103,27 +103,44 @@ static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const for ( ; end - src >= 16; src += 16, dst += 16) { __m128i data = _mm_loadu_si128((__m128i*)src); +#ifdef __AVX2__ + const int BitSpacing = 2; + // load and zero extend to an YMM register + const __m256i extended = _mm256_cvtepu8_epi16(data); + + uint n = _mm256_movemask_epi8(extended); + if (!n) { + // store + _mm256_storeu_si256((__m256i*)dst, extended); + continue; + } +#else + const int BitSpacing = 1; + // check if everything is ASCII // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII uint n = _mm_movemask_epi8(data); - if (n) { - // copy the front part that is still ASCII - while (!(n & 1)) { - *dst++ = *src++; - n >>= 1; - } + if (!n) { + // unpack + _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128())); + _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128())); + continue; + } +#endif - // find the next probable ASCII character - // we don't want to load 16 bytes again in this loop if we know there are non-ASCII - // characters still coming - n = _bit_scan_reverse(n); - nextAscii = src + n + 1; - return false; + // copy the front part that is still ASCII + while (!(n & 1)) { + *dst++ = *src++; + n >>= BitSpacing; } - // unpack - _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128())); - _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128())); + // find the next probable ASCII character + // we don't want to load 16 bytes again in this loop if we know there are non-ASCII + // characters still coming + n = _bit_scan_reverse(n); + nextAscii = src + (n / BitSpacing) + 1; + return false; + } return src == end; } diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index 541a853487..a7d516a726 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -223,8 +223,15 @@ void qt_from_latin1(ushort *dst, const char *str, size_t size) // we're going to read str[offset..offset+15] (16 bytes) for ( ; str + offset + 15 < e; offset += 16) { - const __m128i nullMask = _mm_set1_epi32(0); const __m128i chunk = _mm_loadu_si128((__m128i*)(str + offset)); // load +#ifdef __AVX2__ + // zero extend to an YMM register + const __m256i extended = _mm256_cvtepu8_epi16(chunk); + + // store + _mm256_storeu_si256((__m256i*)(dst + offset), extended); +#else + const __m128i nullMask = _mm_set1_epi32(0); // unpack the first 8 bytes, padding with zeros const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask); @@ -233,6 +240,7 @@ void qt_from_latin1(ushort *dst, const char *str, size_t size) // unpack the last 8 bytes, padding with zeros const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask); _mm_storeu_si128((__m128i*)(dst + offset + 8), secondHalf); // store +#endif } size = size % 16; @@ -540,8 +548,20 @@ static int ucstrncmp(const QChar *a, const uchar *c, int l) // and c[offset..offset+15] (16 bytes) for ( ; uc + offset + 15 < e; offset += 16) { // similar to fromLatin1_helper: - // load Latin 1 data and expand to UTF-16 + // load 16 bytes of Latin 1 data __m128i chunk = _mm_loadu_si128((__m128i*)(c + offset)); + +# ifdef __AVX2__ + // expand Latin 1 data via zero extension + __m256i ldata = _mm256_cvtepu8_epi16(chunk); + + // load UTF-16 data and compare + __m256i ucdata = _mm256_loadu_si256((__m256i*)(uc + offset)); + __m256i result = _mm256_cmpeq_epi16(ldata, ucdata); + + uint mask = ~_mm256_movemask_epi8(result); +# else + // expand via unpacking __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullmask); __m128i secondHalf = _mm_unpackhi_epi8(chunk, nullmask); @@ -552,6 +572,7 @@ static int ucstrncmp(const QChar *a, const uchar *c, int l) __m128i result2 = _mm_cmpeq_epi16(secondHalf, ucdata2); uint mask = ~(_mm_movemask_epi8(result1) | _mm_movemask_epi8(result2) << 16); +# endif if (mask) { // found a different character uint idx = uint(_bit_scan_forward(mask));