Improve toLatin1 x86 SIMD by using a new SSE4.1 instruction

The new instruction is PBLENDVB, which builds its result byte by byte,
selecting each byte from one of two registers depending on whether the
most significant bit of the corresponding mask byte is set (as in 0xff)
or clear (as in zero).

The SSE2 code requires three instructions (and, andnot, or) to perform
the same selection.

The equivalent Neon instruction is VBSL (bit select).

Reviewed-by: Samuel Rødal
(cherry picked from commit bdad106358ae177d1345f5ff85c0e38cfeb5ca90)

Change-Id: I5b0d055a4be532f81c6f11181d710525cd6c3f25
Reviewed-on: http://codereview.qt-project.org/4466
Reviewed-by: Qt Sanity Bot <qt_sanity_bot@ovi.com>
Reviewed-by: Oswald Buddenhagen <oswald.buddenhagen@nokia.com>
Reviewed-by: Samuel Rødal <samuel.rodal@nokia.com>
This commit is contained in:
Thiago Macieira 2010-12-22 14:42:33 +01:00 committed by Qt by Nokia
parent cd781b732e
commit 85f963b2f1

View File

@ -3558,6 +3558,10 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
const __m128i signedChunk = _mm_add_epi16(chunk1, signedBitOffset); const __m128i signedChunk = _mm_add_epi16(chunk1, signedBitOffset);
const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask); const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
#ifdef __SSE4_1__
chunk1 = _mm_blendv_epi8(chunk1, questionMark, offLimitMask);
#else
// offLimitQuestionMark contains '?' for each 16 bits that was off-limit // offLimitQuestionMark contains '?' for each 16 bits that was off-limit
// the 16 bits that were correct contains zeros // the 16 bits that were correct contains zeros
const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark); const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
@ -3568,6 +3572,7 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
// merge offLimitQuestionMark and correctBytes to have the result // merge offLimitQuestionMark and correctBytes to have the result
chunk1 = _mm_or_si128(correctBytes, offLimitQuestionMark); chunk1 = _mm_or_si128(correctBytes, offLimitQuestionMark);
#endif
} }
__m128i chunk2 = _mm_loadu_si128((__m128i*)src); // load __m128i chunk2 = _mm_loadu_si128((__m128i*)src); // load
@ -3576,9 +3581,13 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
// exactly the same operations as for the previous chunk of data // exactly the same operations as for the previous chunk of data
const __m128i signedChunk = _mm_add_epi16(chunk2, signedBitOffset); const __m128i signedChunk = _mm_add_epi16(chunk2, signedBitOffset);
const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask); const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
#ifdef __SSE4_1__
chunk2 = _mm_blendv_epi8(chunk2, questionMark, offLimitMask);
#else
const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark); const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk2); const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk2);
chunk2 = _mm_or_si128(correctBytes, offLimitQuestionMark); chunk2 = _mm_or_si128(correctBytes, offLimitQuestionMark);
#endif
} }
// pack the two vector to 16 x 8bits elements // pack the two vector to 16 x 8bits elements