From 85f963b2f16147ffce243e9acdfbeb46f19b2fed Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Wed, 22 Dec 2010 14:42:33 +0100 Subject: [PATCH] Improve toLatin1 x86 SIMD by using a new SSE4.1 instruction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new instruction is PBLENDVB, which creates a result by selecting bytes from one of two registers, depending on whether the mask contains a 1 (0xff) or a zero. The SSE2 code requires three instructions (and, andnot, or). The equivalent Neon instruction is VBSL (bit select). Reviewed-by: Samuel Rødal (cherry picked from commit bdad106358ae177d1345f5ff85c0e38cfeb5ca90) Change-Id: I5b0d055a4be532f81c6f11181d710525cd6c3f25 Reviewed-on: http://codereview.qt-project.org/4466 Reviewed-by: Qt Sanity Bot Reviewed-by: Oswald Buddenhagen Reviewed-by: Samuel Rødal --- src/corelib/tools/qstring.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index c56c050f76..241762636b 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -3558,6 +3558,10 @@ static QByteArray toLatin1_helper(const QChar *data, int length) const __m128i signedChunk = _mm_add_epi16(chunk1, signedBitOffset); const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask); +#ifdef __SSE4_1__ + chunk1 = _mm_blendv_epi8(chunk1, questionMark, offLimitMask); +#else + // offLimitQuestionMark contains '?' for each 16 bits that was off-limit // the 16 bits that were correct contains zeros const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark); @@ -3568,6 +3572,7 @@ static QByteArray toLatin1_helper(const QChar *data, int length) // merge offLimitQuestionMark and correctBytes to have the result chunk1 = _mm_or_si128(correctBytes, offLimitQuestionMark); +#endif } __m128i chunk2 = _mm_loadu_si128((__m128i*)src); // load @@ -3576,9 +3581,13 @@ static QByteArray toLatin1_helper(const QChar *data, int length) // exactly the same operations as for the previous chunk of data const __m128i signedChunk = _mm_add_epi16(chunk2, signedBitOffset); const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask); +#ifdef __SSE4_1__ + chunk2 = _mm_blendv_epi8(chunk2, questionMark, offLimitMask); +#else const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark); const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk2); chunk2 = _mm_or_si128(correctBytes, offLimitQuestionMark); +#endif } // pack the two vector to 16 x 8bits elements