Add AVX2 versions of the fast blending functions
This patch adds AVX2 versions of the fast blending functions that we already have SSE2 versions of. Change-Id: Ifd1a22f7891b6208cb74929ad26095d12c5a1efb Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
parent
2d2d90781a
commit
8b2f91e328
@ -468,6 +468,9 @@ static inline quint64 qCpuFeatures()
|
||||
#define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \
|
||||
for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i)
|
||||
|
||||
#define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \
|
||||
for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); ++i)
|
||||
|
||||
QT_END_NAMESPACE
|
||||
|
||||
#endif // QSIMD_P_H
|
||||
|
@ -6548,6 +6548,15 @@ static void qInitDrawhelperFunctions()
|
||||
|
||||
qt_fetch_radial_gradient = qt_fetch_radial_gradient_sse2;
|
||||
|
||||
extern void QT_FASTCALL comp_func_SourceOver_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
|
||||
extern void QT_FASTCALL comp_func_solid_SourceOver_sse2(uint *destPixels, int length, uint color, uint const_alpha);
|
||||
extern void QT_FASTCALL comp_func_Source_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
|
||||
extern void QT_FASTCALL comp_func_Plus_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
|
||||
qt_functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_sse2;
|
||||
qt_functionForModeSolid_C[QPainter::CompositionMode_SourceOver] = comp_func_solid_SourceOver_sse2;
|
||||
qt_functionForMode_C[QPainter::CompositionMode_Source] = comp_func_Source_sse2;
|
||||
qt_functionForMode_C[QPainter::CompositionMode_Plus] = comp_func_Plus_sse2;
|
||||
|
||||
#ifdef QT_COMPILER_SUPPORTS_SSSE3
|
||||
if (qCpuHasFeature(SSSE3)) {
|
||||
extern void qt_blend_argb32_on_argb32_ssse3(uchar *destPixels, int dbpl,
|
||||
@ -6592,24 +6601,39 @@ static void qInitDrawhelperFunctions()
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(QT_COMPILER_SUPPORTS_AVX2) && !defined(__AVX2__)
|
||||
#if defined(QT_COMPILER_SUPPORTS_AVX2)
|
||||
if (qCpuHasFeature(AVX2)) {
|
||||
#if !defined(__AVX2__)
|
||||
extern const uint *QT_FASTCALL convertARGB32ToARGB32PM_avx2(uint *buffer, const uint *src, int count,
|
||||
const QVector<QRgb> *, QDitherInfo *);
|
||||
extern const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_avx2(uint *buffer, const uint *src, int count,
|
||||
const QVector<QRgb> *, QDitherInfo *);
|
||||
qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_avx2;
|
||||
qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_avx2;
|
||||
#endif
|
||||
extern void qt_blend_rgb32_on_rgb32_avx2(uchar *destPixels, int dbpl,
|
||||
const uchar *srcPixels, int sbpl,
|
||||
int w, int h, int const_alpha);
|
||||
extern void qt_blend_argb32_on_argb32_avx2(uchar *destPixels, int dbpl,
|
||||
const uchar *srcPixels, int sbpl,
|
||||
int w, int h, int const_alpha);
|
||||
qBlendFunctions[QImage::Format_RGB32][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_avx2;
|
||||
qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_avx2;
|
||||
qBlendFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_avx2;
|
||||
qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_avx2;
|
||||
qBlendFunctions[QImage::Format_RGBX8888][QImage::Format_RGBX8888] = qt_blend_rgb32_on_rgb32_avx2;
|
||||
qBlendFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBX8888] = qt_blend_rgb32_on_rgb32_avx2;
|
||||
qBlendFunctions[QImage::Format_RGBX8888][QImage::Format_RGBA8888_Premultiplied] = qt_blend_argb32_on_argb32_avx2;
|
||||
qBlendFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBA8888_Premultiplied] = qt_blend_argb32_on_argb32_avx2;
|
||||
|
||||
extern void QT_FASTCALL comp_func_SourceOver_avx2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
|
||||
extern void QT_FASTCALL comp_func_solid_SourceOver_avx2(uint *destPixels, int length, uint color, uint const_alpha);
|
||||
extern void QT_FASTCALL comp_func_Source_avx2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
|
||||
qt_functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_avx2;
|
||||
qt_functionForModeSolid_C[QPainter::CompositionMode_SourceOver] = comp_func_solid_SourceOver_avx2;
|
||||
qt_functionForMode_C[QPainter::CompositionMode_Source] = comp_func_Source_avx2;
|
||||
}
|
||||
#endif
|
||||
extern void QT_FASTCALL comp_func_SourceOver_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
|
||||
extern void QT_FASTCALL comp_func_solid_SourceOver_sse2(uint *destPixels, int length, uint color, uint const_alpha);
|
||||
extern void QT_FASTCALL comp_func_Source_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
|
||||
extern void QT_FASTCALL comp_func_Plus_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
|
||||
qt_functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_sse2;
|
||||
qt_functionForModeSolid_C[QPainter::CompositionMode_SourceOver] = comp_func_solid_SourceOver_sse2;
|
||||
qt_functionForMode_C[QPainter::CompositionMode_Source] = comp_func_Source_sse2;
|
||||
qt_functionForMode_C[QPainter::CompositionMode_Plus] = comp_func_Plus_sse2;
|
||||
|
||||
#endif // SSE2
|
||||
|
||||
|
@ -37,12 +37,14 @@
|
||||
**
|
||||
****************************************************************************/
|
||||
|
||||
#include <private/qdrawhelper_p.h>
|
||||
#include "qdrawhelper_p.h"
|
||||
#include "qdrawingprimitive_sse2_p.h"
|
||||
|
||||
#if defined(QT_COMPILER_SUPPORTS_AVX2)
|
||||
|
||||
QT_BEGIN_NAMESPACE
|
||||
|
||||
// Autovectorized premultiply functions:
|
||||
const uint *QT_FASTCALL convertARGB32ToARGB32PM_avx2(uint *buffer, const uint *src, int count,
|
||||
const QVector<QRgb> *, QDitherInfo *)
|
||||
{
|
||||
@ -55,6 +57,307 @@ const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_avx2(uint *buffer, const uint
|
||||
return qt_convertRGBA8888ToARGB32PM(buffer, src, count);
|
||||
}
|
||||
|
||||
// Vectorized blend functions:
|
||||
|
||||
// See BYTE_MUL_SSE2 for details.
|
||||
inline static void BYTE_MUL_AVX2(__m256i &pixelVector, const __m256i &alphaChannel, const __m256i &colorMask, const __m256i &half)
|
||||
{
|
||||
__m256i pixelVectorAG = _mm256_srli_epi16(pixelVector, 8);
|
||||
__m256i pixelVectorRB = _mm256_and_si256(pixelVector, colorMask);
|
||||
|
||||
pixelVectorAG = _mm256_mullo_epi16(pixelVectorAG, alphaChannel);
|
||||
pixelVectorRB = _mm256_mullo_epi16(pixelVectorRB, alphaChannel);
|
||||
|
||||
pixelVectorRB = _mm256_add_epi16(pixelVectorRB, _mm256_srli_epi16(pixelVectorRB, 8));
|
||||
pixelVectorAG = _mm256_add_epi16(pixelVectorAG, _mm256_srli_epi16(pixelVectorAG, 8));
|
||||
pixelVectorRB = _mm256_add_epi16(pixelVectorRB, half);
|
||||
pixelVectorAG = _mm256_add_epi16(pixelVectorAG, half);
|
||||
|
||||
pixelVectorRB = _mm256_srli_epi16(pixelVectorRB, 8);
|
||||
pixelVectorAG = _mm256_andnot_si256(colorMask, pixelVectorAG);
|
||||
|
||||
pixelVector = _mm256_or_si256(pixelVectorAG, pixelVectorRB);
|
||||
}
|
||||
|
||||
// See INTERPOLATE_PIXEL_255_SSE2 for details.
|
||||
inline static void INTERPOLATE_PIXEL_255_AVX2(const __m256i &srcVector, __m256i &dstVector, const __m256i &alphaChannel, const __m256i &oneMinusAlphaChannel, const __m256i &colorMask, const __m256i &half)
|
||||
{
|
||||
const __m256i srcVectorAG = _mm256_srli_epi16(srcVector, 8);
|
||||
const __m256i dstVectorAG = _mm256_srli_epi16(dstVector, 8);
|
||||
const __m256i srcVectorRB = _mm256_and_si256(srcVector, colorMask);
|
||||
const __m256i dstVectorRB = _mm256_and_si256(dstVector, colorMask);
|
||||
const __m256i srcVectorAGalpha = _mm256_mullo_epi16(srcVectorAG, alphaChannel);
|
||||
const __m256i srcVectorRBalpha = _mm256_mullo_epi16(srcVectorRB, alphaChannel);
|
||||
const __m256i dstVectorAGoneMinusAlpha = _mm256_mullo_epi16(dstVectorAG, oneMinusAlphaChannel);
|
||||
const __m256i dstVectorRBoneMinusAlpha = _mm256_mullo_epi16(dstVectorRB, oneMinusAlphaChannel);
|
||||
__m256i finalAG = _mm256_add_epi16(srcVectorAGalpha, dstVectorAGoneMinusAlpha);
|
||||
__m256i finalRB = _mm256_add_epi16(srcVectorRBalpha, dstVectorRBoneMinusAlpha);
|
||||
finalAG = _mm256_add_epi16(finalAG, _mm256_srli_epi16(finalAG, 8));
|
||||
finalRB = _mm256_add_epi16(finalRB, _mm256_srli_epi16(finalRB, 8));
|
||||
finalAG = _mm256_add_epi16(finalAG, half);
|
||||
finalRB = _mm256_add_epi16(finalRB, half);
|
||||
finalAG = _mm256_andnot_si256(colorMask, finalAG);
|
||||
finalRB = _mm256_srli_epi16(finalRB, 8);
|
||||
|
||||
dstVector = _mm256_or_si256(finalAG, finalRB);
|
||||
}
|
||||
|
||||
// See BLEND_SOURCE_OVER_ARGB32_SSE2 for details.
|
||||
inline static void BLEND_SOURCE_OVER_ARGB32_AVX2(quint32 *dst, const quint32 *src, const int length)
|
||||
{
|
||||
const __m256i half = _mm256_set1_epi16(0x80);
|
||||
const __m256i one = _mm256_set1_epi16(0xff);
|
||||
const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
|
||||
const __m256i alphaMask = _mm256_set1_epi32(0xff000000);
|
||||
const __m256i offsetMask = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
|
||||
const __m256i alphaShuffleMask = _mm256_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3,
|
||||
char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
|
||||
|
||||
const int minusOffsetToAlignDstOn32Bytes = (reinterpret_cast<quintptr>(dst) >> 2) & 0x7;
|
||||
|
||||
int x = 0;
|
||||
// Prologue to handle all pixels until dst is 32-byte aligned in one step.
|
||||
if (minusOffsetToAlignDstOn32Bytes != 0 && x < (length - 7)) {
|
||||
const __m256i prologueMask = _mm256_sub_epi32(_mm256_set1_epi32(minusOffsetToAlignDstOn32Bytes - 1), offsetMask);
|
||||
const __m256i srcVector = _mm256_maskload_epi32((const int *)&src[x - minusOffsetToAlignDstOn32Bytes], prologueMask);
|
||||
const __m256i prologueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, prologueMask);
|
||||
if (!_mm256_testz_si256(srcVector, prologueAlphaMask)) {
|
||||
if (_mm256_testc_si256(srcVector, prologueAlphaMask)) {
|
||||
_mm256_maskstore_epi32((int *)&dst[x - minusOffsetToAlignDstOn32Bytes], prologueMask, srcVector);
|
||||
} else {
|
||||
__m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
|
||||
alphaChannel = _mm256_sub_epi16(one, alphaChannel);
|
||||
__m256i dstVector = _mm256_maskload_epi32((int *)&dst[x - minusOffsetToAlignDstOn32Bytes], prologueMask);
|
||||
BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half);
|
||||
dstVector = _mm256_add_epi8(dstVector, srcVector);
|
||||
_mm256_maskstore_epi32((int *)&dst[x - minusOffsetToAlignDstOn32Bytes], prologueMask, dstVector);
|
||||
}
|
||||
}
|
||||
x += (8 - minusOffsetToAlignDstOn32Bytes);
|
||||
}
|
||||
|
||||
for (; x < (length - 7); x += 8) {
|
||||
const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
|
||||
if (!_mm256_testz_si256(srcVector, alphaMask)) {
|
||||
if (_mm256_testc_si256(srcVector, alphaMask)) {
|
||||
_mm256_store_si256((__m256i *)&dst[x], srcVector);
|
||||
} else {
|
||||
__m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
|
||||
alphaChannel = _mm256_sub_epi16(one, alphaChannel);
|
||||
__m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
|
||||
BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half);
|
||||
dstVector = _mm256_add_epi8(dstVector, srcVector);
|
||||
_mm256_store_si256((__m256i *)&dst[x], dstVector);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Epilogue to handle all remaining pixels in one step.
|
||||
if (x < length) {
|
||||
const __m256i epilogueMask = _mm256_add_epi32(offsetMask, _mm256_set1_epi32(x - length));
|
||||
const __m256i srcVector = _mm256_maskload_epi32((const int *)&src[x], epilogueMask);
|
||||
const __m256i epilogueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, epilogueMask);
|
||||
if (!_mm256_testz_si256(srcVector, epilogueAlphaMask)) {
|
||||
if (_mm256_testc_si256(srcVector, epilogueAlphaMask)) {
|
||||
_mm256_maskstore_epi32((int *)&dst[x], epilogueMask, srcVector);
|
||||
} else {
|
||||
__m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
|
||||
alphaChannel = _mm256_sub_epi16(one, alphaChannel);
|
||||
__m256i dstVector = _mm256_maskload_epi32((int *)&dst[x], epilogueMask);
|
||||
BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half);
|
||||
dstVector = _mm256_add_epi8(dstVector, srcVector);
|
||||
_mm256_maskstore_epi32((int *)&dst[x], epilogueMask, dstVector);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// See BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2 for details.
|
||||
inline static void BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(quint32 *dst, const quint32 *src, const int length, const int const_alpha)
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
|
||||
blend_pixel(dst[x], src[x], const_alpha);
|
||||
|
||||
const __m256i half = _mm256_set1_epi16(0x80);
|
||||
const __m256i one = _mm256_set1_epi16(0xff);
|
||||
const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
|
||||
const __m256i alphaMask = _mm256_set1_epi32(0xff000000);
|
||||
const __m256i alphaShuffleMask = _mm256_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3,
|
||||
char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
|
||||
const __m256i constAlphaVector = _mm256_set1_epi16(const_alpha);
|
||||
for (; x < (length - 7); x += 8) {
|
||||
__m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
|
||||
if (!_mm256_testz_si256(srcVector, alphaMask)) {
|
||||
BYTE_MUL_AVX2(srcVector, constAlphaVector, colorMask, half);
|
||||
|
||||
__m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
|
||||
alphaChannel = _mm256_sub_epi16(one, alphaChannel);
|
||||
__m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
|
||||
BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half);
|
||||
dstVector = _mm256_add_epi8(dstVector, srcVector);
|
||||
_mm256_store_si256((__m256i *)&dst[x], dstVector);
|
||||
}
|
||||
}
|
||||
for (; x < length; ++x)
|
||||
blend_pixel(dst[x], src[x], const_alpha);
|
||||
}
|
||||
|
||||
void qt_blend_argb32_on_argb32_avx2(uchar *destPixels, int dbpl,
|
||||
const uchar *srcPixels, int sbpl,
|
||||
int w, int h,
|
||||
int const_alpha)
|
||||
{
|
||||
if (const_alpha == 256) {
|
||||
for (int y = 0; y < h; ++y) {
|
||||
const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
|
||||
quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
|
||||
BLEND_SOURCE_OVER_ARGB32_AVX2(dst, src, w);
|
||||
destPixels += dbpl;
|
||||
srcPixels += sbpl;
|
||||
}
|
||||
} else if (const_alpha != 0) {
|
||||
const_alpha = (const_alpha * 255) >> 8;
|
||||
for (int y = 0; y < h; ++y) {
|
||||
const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
|
||||
quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
|
||||
BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(dst, src, w, const_alpha);
|
||||
destPixels += dbpl;
|
||||
srcPixels += sbpl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void qt_blend_rgb32_on_rgb32_avx2(uchar *destPixels, int dbpl,
|
||||
const uchar *srcPixels, int sbpl,
|
||||
int w, int h,
|
||||
int const_alpha)
|
||||
{
|
||||
if (const_alpha == 256) {
|
||||
for (int y = 0; y < h; ++y) {
|
||||
const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
|
||||
quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
|
||||
::memcpy(dst, src, w * sizeof(uint));
|
||||
srcPixels += sbpl;
|
||||
destPixels += dbpl;
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (const_alpha == 0)
|
||||
return;
|
||||
|
||||
const __m256i half = _mm256_set1_epi16(0x80);
|
||||
const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
|
||||
|
||||
const_alpha = (const_alpha * 255) >> 8;
|
||||
int one_minus_const_alpha = 255 - const_alpha;
|
||||
const __m256i constAlphaVector = _mm256_set1_epi16(const_alpha);
|
||||
const __m256i oneMinusConstAlpha = _mm256_set1_epi16(one_minus_const_alpha);
|
||||
for (int y = 0; y < h; ++y) {
|
||||
const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
|
||||
quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
|
||||
int x = 0;
|
||||
|
||||
// First, align dest to 32 bytes:
|
||||
ALIGNMENT_PROLOGUE_32BYTES(dst, x, w)
|
||||
dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
|
||||
|
||||
// 2) interpolate pixels with AVX2
|
||||
for (; x < (w - 7); x += 8) {
|
||||
const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
|
||||
if (!_mm256_testc_si256(srcVector, _mm256_setzero_si256())) {
|
||||
__m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
|
||||
INTERPOLATE_PIXEL_255_AVX2(srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
|
||||
_mm256_store_si256((__m256i *)&dst[x], dstVector);
|
||||
}
|
||||
}
|
||||
|
||||
// 3) Epilogue
|
||||
for (; x < w; ++x)
|
||||
dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
|
||||
|
||||
srcPixels += sbpl;
|
||||
destPixels += dbpl;
|
||||
}
|
||||
}
|
||||
|
||||
void QT_FASTCALL comp_func_SourceOver_avx2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha)
|
||||
{
|
||||
Q_ASSERT(const_alpha < 256);
|
||||
|
||||
const quint32 *src = (const quint32 *) srcPixels;
|
||||
quint32 *dst = (quint32 *) destPixels;
|
||||
|
||||
if (const_alpha == 255)
|
||||
BLEND_SOURCE_OVER_ARGB32_AVX2(dst, src, length);
|
||||
else
|
||||
BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(dst, src, length, const_alpha);
|
||||
}
|
||||
|
||||
void QT_FASTCALL comp_func_Source_avx2(uint *dst, const uint *src, int length, uint const_alpha)
|
||||
{
|
||||
if (const_alpha == 255) {
|
||||
::memcpy(dst, src, length * sizeof(uint));
|
||||
} else {
|
||||
const int ialpha = 255 - const_alpha;
|
||||
|
||||
int x = 0;
|
||||
|
||||
// 1) prologue, align on 32 bytes
|
||||
ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
|
||||
dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
|
||||
|
||||
// 2) interpolate pixels with AVX2
|
||||
const __m256i half = _mm256_set1_epi16(0x80);
|
||||
const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
|
||||
const __m256i constAlphaVector = _mm256_set1_epi16(const_alpha);
|
||||
const __m256i oneMinusConstAlpha = _mm256_set1_epi16(ialpha);
|
||||
for (; x < length - 7; x += 8) {
|
||||
const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
|
||||
__m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
|
||||
INTERPOLATE_PIXEL_255_AVX2(srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
|
||||
_mm256_store_si256((__m256i *)&dst[x], dstVector);
|
||||
}
|
||||
|
||||
// 3) Epilogue
|
||||
for (; x < length; ++x)
|
||||
dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
|
||||
}
|
||||
}
|
||||
|
||||
void QT_FASTCALL comp_func_solid_SourceOver_avx2(uint *destPixels, int length, uint color, uint const_alpha)
|
||||
{
|
||||
if ((const_alpha & qAlpha(color)) == 255) {
|
||||
qt_memfill32(destPixels, color, length);
|
||||
} else {
|
||||
if (const_alpha != 255)
|
||||
color = BYTE_MUL(color, const_alpha);
|
||||
|
||||
const quint32 minusAlphaOfColor = qAlpha(~color);
|
||||
int x = 0;
|
||||
|
||||
quint32 *dst = (quint32 *) destPixels;
|
||||
const __m256i colorVector = _mm256_set1_epi32(color);
|
||||
const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
|
||||
const __m256i half = _mm256_set1_epi16(0x80);
|
||||
const __m256i minusAlphaOfColorVector = _mm256_set1_epi16(minusAlphaOfColor);
|
||||
|
||||
ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
|
||||
destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
|
||||
|
||||
for (; x < length - 7; x += 8) {
|
||||
__m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
|
||||
BYTE_MUL_AVX2(dstVector, minusAlphaOfColorVector, colorMask, half);
|
||||
dstVector = _mm256_add_epi8(colorVector, dstVector);
|
||||
_mm256_store_si256((__m256i *)&dst[x], dstVector);
|
||||
}
|
||||
for (; x < length; ++x)
|
||||
destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
|
||||
}
|
||||
}
|
||||
|
||||
QT_END_NAMESPACE
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user