Use VPMASKMOV in the epilogue ARGB->ARGB{32,64} AVX2 epilogues

Instead of stepping down to 4 pixels, then 2 px, then 1, with
essentially the same code, let's use maskload and maskstore to only load
and store the effective portions (instructions new in AVX2). The
secondary loop gets run at most twice, since there can be at most 7
pixels left.

This fixes an off-by-4 bug in the previous implementation (lines 1041
and 1186 should have had 7 instead of 3).

Change-Id: I4d4dadb709f1482fa8ccfffd157862e77ac508f6
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
This commit is contained in:
Thiago Macieira 2019-01-09 20:40:18 -08:00
parent 07580a8d7b
commit 612e4c5233

View File

@ -995,6 +995,18 @@ void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper_avx2(uint *
}
}
static inline __m128i maskFromCount(qsizetype count)
{
Q_ASSERT(count > 0);
static const qint64 data[] = { -1, -1, 0, 0 };
auto ptr = reinterpret_cast<const quint8 *>(data) + sizeof(__m128i);
if (count > int(sizeof(__m128i)))
return _mm_set1_epi8(-1);
return _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr - count));
}
template<bool RGBA>
static void convertARGBToARGB32PM_avx2(uint *buffer, const uint *src, qsizetype count)
{
@ -1008,9 +1020,11 @@ static void convertARGBToARGB32PM_avx2(uint *buffer, const uint *src, qsizetype
for (; i < count - 7; i += 8) {
__m256i srcVector = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i));
if (!_mm256_testz_si256(srcVector, alphaMask)) {
if (!_mm256_testc_si256(srcVector, alphaMask)) {
if (RGBA)
srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
// keep the two _mm_test[zc]_siXXX next to each other
bool cf = _mm256_testc_si256(srcVector, alphaMask);
if (RGBA)
srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
if (!cf) {
__m256i src1 = _mm256_unpacklo_epi8(srcVector, zero);
__m256i src2 = _mm256_unpackhi_epi8(srcVector, zero);
__m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask);
@ -1028,8 +1042,6 @@ static void convertARGBToARGB32PM_avx2(uint *buffer, const uint *src, qsizetype
srcVector = _mm256_packus_epi16(src1, src2);
_mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), srcVector);
} else {
if (RGBA)
srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
if (buffer != src || RGBA)
_mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), srcVector);
}
@ -1038,12 +1050,18 @@ static void convertARGBToARGB32PM_avx2(uint *buffer, const uint *src, qsizetype
}
}
if (i < count - 3) {
__m128i srcVector = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + i));
if (!_mm_testz_si128(srcVector, _mm256_castsi256_si128(alphaMask))) {
if (!_mm_testc_si128(srcVector, _mm256_castsi256_si128(alphaMask))) {
if (RGBA)
srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
for ( ; i < count; i += 4) {
__m128i maskedAlphaMask = _mm256_castsi256_si128(alphaMask);
__m128i mask = maskFromCount((count - i) * sizeof(*src));
maskedAlphaMask = _mm_and_si128(mask, maskedAlphaMask);
__m128i srcVector = _mm_maskload_epi32(reinterpret_cast<const int *>(src), mask);
if (!_mm_testz_si128(srcVector, maskedAlphaMask)) {
// keep the two _mm_test[zc]_siXXX next to each other
bool cf = _mm_testc_si128(srcVector, maskedAlphaMask);
if (RGBA)
srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
if (!cf) {
__m128i src1 = _mm_unpacklo_epi8(srcVector, _mm256_castsi256_si128(zero));
__m128i src2 = _mm_unpackhi_epi8(srcVector, _mm256_castsi256_si128(zero));
__m128i alpha1 = _mm_shuffle_epi8(src1, _mm256_castsi256_si128(shuffleMask));
@ -1058,54 +1076,15 @@ static void convertARGBToARGB32PM_avx2(uint *buffer, const uint *src, qsizetype
src2 = _mm_srli_epi16(src2, 8);
src1 = _mm_blend_epi16(src1, alpha1, 0x88);
src2 = _mm_blend_epi16(src2, alpha2, 0x88);
_mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + i), srcVector);
srcVector = _mm_packus_epi16(src1, src2);
_mm_maskstore_epi32(reinterpret_cast<int *>(buffer + i), mask, srcVector);
} else {
if (RGBA)
srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
if (buffer != src || RGBA)
_mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + i), srcVector);
_mm_maskstore_epi32(reinterpret_cast<int *>(buffer + i), mask, srcVector);
}
} else {
_mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + i), _mm256_castsi256_si128(zero));
_mm_maskstore_epi32(reinterpret_cast<int *>(buffer + i), mask, _mm256_castsi256_si128(zero));
}
i += 4;
}
auto convert_half = [=](__m128i &srcVector) {
if (!_mm_testz_si128(srcVector, _mm256_castsi256_si128(alphaMask))) {
if (!_mm_testc_si128(srcVector, _mm256_castsi256_si128(alphaMask))) {
if (RGBA)
srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
__m128i src1 = _mm_unpacklo_epi8(srcVector, _mm256_castsi256_si128(zero));
__m128i alpha1 = _mm_shuffle_epi8(src1, _mm256_castsi256_si128(shuffleMask));
src1 = _mm_mullo_epi16(src1, alpha1);
src1 = _mm_add_epi16(src1, _mm_srli_epi16(src1, 8));
src1 = _mm_add_epi16(src1, _mm256_castsi256_si128(half));
src1 = _mm_srli_epi16(src1, 8);
src1 = _mm_blend_epi16(src1, alpha1, 0x88);
srcVector = _mm_packus_epi16(src1, src1);
return true;
} else {
if (RGBA)
srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
return buffer != src || RGBA;
}
} else {
srcVector = _mm256_castsi256_si128(zero);
return true;
}
};
if (i < count - 1) {
__m128i srcVector = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src + i));
if (convert_half(srcVector))
_mm_storel_epi64(reinterpret_cast<__m128i *>(buffer + i), srcVector);
i += 2;
}
if (i != count) {
__m128i srcVector = _mm_cvtsi32_si128(src[i]);
if (convert_half(srcVector))
buffer[i] = _mm_cvtsi128_si32(srcVector);
}
}
@ -1183,13 +1162,19 @@ static void convertARGBToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, qsizety
_mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i) + 1, src2);
}
if (i < count - 3) {
__m128i srcVector = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + i));
for ( ; i < count; i += 4) {
__m128i maskedAlphaMask = _mm256_castsi256_si128(alphaMask);
__m128i mask = maskFromCount((count - i) * sizeof(*src));
maskedAlphaMask = _mm_and_si128(mask, maskedAlphaMask);
__m128i srcVector = _mm_maskload_epi32(reinterpret_cast<const int *>(src), mask);
__m256i src;
if (!_mm_testz_si128(srcVector, _mm256_castsi256_si128(alphaMask))) {
if (!_mm_testc_si128(srcVector, _mm256_castsi256_si128(alphaMask))) {
if (!RGBA)
srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
if (!_mm_testz_si128(srcVector, maskedAlphaMask)) {
// keep the two _mm_test[zc]_siXXX next to each other
bool cf = _mm_testc_si128(srcVector, maskedAlphaMask);
if (!RGBA)
srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
if (!cf) {
src = _mm256_cvtepu8_epi16(srcVector);
__m256i alpha = _mm256_shuffle_epi8(src, shuffleMask);
src = _mm256_mullo_epi16(src, alpha);
@ -1200,8 +1185,6 @@ static void convertARGBToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, qsizety
src = _mm256_add_epi16(src, _mm256_srli_epi16(src, 7));
src = _mm256_blend_epi16(src, alpha, 0x88);
} else {
if (!RGBA)
srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
const __m128i src1 = _mm_unpacklo_epi8(srcVector, srcVector);
const __m128i src2 = _mm_unpackhi_epi8(srcVector, srcVector);
src = _mm256_castsi128_si256(src1);
@ -1210,42 +1193,9 @@ static void convertARGBToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, qsizety
} else {
src = zero;
}
_mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), src);
i += 4;
}
auto convert_half = [=](__m128i &srcVector) {
if (!_mm_testz_si128(srcVector, _mm256_castsi256_si128(alphaMask))) {
if (!_mm_testc_si128(srcVector, _mm256_castsi256_si128(alphaMask))) {
if (!RGBA)
srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
__m128i src1 = _mm_unpacklo_epi8(srcVector, _mm256_castsi256_si128(zero));
__m128i alpha1 = _mm_shuffle_epi8(src1, _mm256_castsi256_si128(shuffleMask));
src1 = _mm_mullo_epi16(src1, alpha1);
alpha1 = _mm_unpacklo_epi8(srcVector, srcVector);
src1 = _mm_add_epi16(src1, _mm_srli_epi16(src1, 7));
src1 = _mm_blend_epi16(src1, alpha1, 0x88);
return src1;
} else {
if (!RGBA)
srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
const __m128i src1 = _mm_unpacklo_epi8(srcVector, srcVector);
return src1;
}
} else {
return _mm256_castsi256_si128(zero);
}
__m256i xmask = _mm256_cvtepi32_epi64(mask);
_mm256_maskstore_epi64(reinterpret_cast<qint64 *>(buffer + i), xmask, src);
};
if (i < count - 1) {
__m128i srcVector = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src + i));
_mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + i), convert_half(srcVector));
i += 2;
}
if (i != count) {
__m128i srcVector = _mm_cvtsi32_si128(src[i]);
_mm_storel_epi64(reinterpret_cast<__m128i *>(buffer + i), convert_half(srcVector));
}
}
const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, int count,