Add AVX2 optimized versions of the most basic RGB64 compositions
Speeds up RGB30 and ARGB32-unpremul painting. Change-Id: I419afdf5c26ceffc0f7557b8f196035056178c9a Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
parent
8ba8efb839
commit
f95fbca5f3
@ -6399,12 +6399,19 @@ static void qInitDrawhelperFunctions()
|
||||
qBlendFunctions[QImage::Format_RGBX8888][QImage::Format_RGBA8888_Premultiplied] = qt_blend_argb32_on_argb32_avx2;
|
||||
qBlendFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBA8888_Premultiplied] = qt_blend_argb32_on_argb32_avx2;
|
||||
|
||||
extern void QT_FASTCALL comp_func_SourceOver_avx2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
|
||||
extern void QT_FASTCALL comp_func_solid_SourceOver_avx2(uint *destPixels, int length, uint color, uint const_alpha);
|
||||
extern void QT_FASTCALL comp_func_Source_avx2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
|
||||
qt_functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_avx2;
|
||||
qt_functionForModeSolid_C[QPainter::CompositionMode_SourceOver] = comp_func_solid_SourceOver_avx2;
|
||||
extern void QT_FASTCALL comp_func_Source_rgb64_avx2(QRgba64 *destPixels, const QRgba64 *srcPixels, int length, uint const_alpha);
|
||||
extern void QT_FASTCALL comp_func_SourceOver_avx2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
|
||||
extern void QT_FASTCALL comp_func_SourceOver_rgb64_avx2(QRgba64 *destPixels, const QRgba64 *srcPixels, int length, uint const_alpha);
|
||||
extern void QT_FASTCALL comp_func_solid_SourceOver_avx2(uint *destPixels, int length, uint color, uint const_alpha);
|
||||
extern void QT_FASTCALL comp_func_solid_SourceOver_rgb64_avx2(QRgba64 *destPixels, int length, QRgba64 color, uint const_alpha);
|
||||
|
||||
qt_functionForMode_C[QPainter::CompositionMode_Source] = comp_func_Source_avx2;
|
||||
qt_functionForMode64_C[QPainter::CompositionMode_Source] = comp_func_Source_rgb64_avx2;
|
||||
qt_functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_avx2;
|
||||
qt_functionForMode64_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_rgb64_avx2;
|
||||
qt_functionForModeSolid_C[QPainter::CompositionMode_SourceOver] = comp_func_solid_SourceOver_avx2;
|
||||
qt_functionForModeSolid64_C[QPainter::CompositionMode_SourceOver] = comp_func_solid_SourceOver_rgb64_avx2;
|
||||
|
||||
extern void QT_FASTCALL fetchTransformedBilinearARGB32PM_simple_upscale_helper_avx2(uint *b, uint *end, const QTextureData &image,
|
||||
int &fx, int &fy, int fdx, int /*fdy*/);
|
||||
|
@ -39,6 +39,7 @@
|
||||
|
||||
#include "qdrawhelper_p.h"
|
||||
#include "qdrawingprimitive_sse2_p.h"
|
||||
#include "qrgba64_p.h"
|
||||
|
||||
#if defined(QT_COMPILER_SUPPORTS_AVX2)
|
||||
|
||||
@ -73,6 +74,25 @@ inline static void BYTE_MUL_AVX2(__m256i &pixelVector, const __m256i &alphaChann
|
||||
pixelVector = _mm256_or_si256(pixelVectorAG, pixelVectorRB);
|
||||
}
|
||||
|
||||
inline static void BYTE_MUL_RGB64_AVX2(__m256i &pixelVector, const __m256i &alphaChannel, const __m256i &colorMask, const __m256i &half)
|
||||
{
|
||||
__m256i pixelVectorAG = _mm256_srli_epi32(pixelVector, 16);
|
||||
__m256i pixelVectorRB = _mm256_and_si256(pixelVector, colorMask);
|
||||
|
||||
pixelVectorAG = _mm256_mullo_epi32(pixelVectorAG, alphaChannel);
|
||||
pixelVectorRB = _mm256_mullo_epi32(pixelVectorRB, alphaChannel);
|
||||
|
||||
pixelVectorRB = _mm256_add_epi32(pixelVectorRB, _mm256_srli_epi32(pixelVectorRB, 16));
|
||||
pixelVectorAG = _mm256_add_epi32(pixelVectorAG, _mm256_srli_epi32(pixelVectorAG, 16));
|
||||
pixelVectorRB = _mm256_add_epi32(pixelVectorRB, half);
|
||||
pixelVectorAG = _mm256_add_epi32(pixelVectorAG, half);
|
||||
|
||||
pixelVectorRB = _mm256_srli_epi32(pixelVectorRB, 16);
|
||||
pixelVectorAG = _mm256_andnot_si256(colorMask, pixelVectorAG);
|
||||
|
||||
pixelVector = _mm256_or_si256(pixelVectorAG, pixelVectorRB);
|
||||
}
|
||||
|
||||
// See INTERPOLATE_PIXEL_255_SSE2 for details.
|
||||
inline static void INTERPOLATE_PIXEL_255_AVX2(const __m256i &srcVector, __m256i &dstVector, const __m256i &alphaChannel, const __m256i &oneMinusAlphaChannel, const __m256i &colorMask, const __m256i &half)
|
||||
{
|
||||
@ -96,6 +116,29 @@ inline static void INTERPOLATE_PIXEL_255_AVX2(const __m256i &srcVector, __m256i
|
||||
dstVector = _mm256_or_si256(finalAG, finalRB);
|
||||
}
|
||||
|
||||
inline static void INTERPOLATE_PIXEL_RGB64_AVX2(const __m256i &srcVector, __m256i &dstVector, const __m256i &alphaChannel, const __m256i &oneMinusAlphaChannel, const __m256i &colorMask, const __m256i &half)
|
||||
{
|
||||
const __m256i srcVectorAG = _mm256_srli_epi32(srcVector, 16);
|
||||
const __m256i dstVectorAG = _mm256_srli_epi32(dstVector, 16);
|
||||
const __m256i srcVectorRB = _mm256_and_si256(srcVector, colorMask);
|
||||
const __m256i dstVectorRB = _mm256_and_si256(dstVector, colorMask);
|
||||
const __m256i srcVectorAGalpha = _mm256_mullo_epi32(srcVectorAG, alphaChannel);
|
||||
const __m256i srcVectorRBalpha = _mm256_mullo_epi32(srcVectorRB, alphaChannel);
|
||||
const __m256i dstVectorAGoneMinusAlpha = _mm256_mullo_epi32(dstVectorAG, oneMinusAlphaChannel);
|
||||
const __m256i dstVectorRBoneMinusAlpha = _mm256_mullo_epi32(dstVectorRB, oneMinusAlphaChannel);
|
||||
__m256i finalAG = _mm256_add_epi32(srcVectorAGalpha, dstVectorAGoneMinusAlpha);
|
||||
__m256i finalRB = _mm256_add_epi32(srcVectorRBalpha, dstVectorRBoneMinusAlpha);
|
||||
finalAG = _mm256_add_epi32(finalAG, _mm256_srli_epi32(finalAG, 16));
|
||||
finalRB = _mm256_add_epi32(finalRB, _mm256_srli_epi32(finalRB, 16));
|
||||
finalAG = _mm256_add_epi32(finalAG, half);
|
||||
finalRB = _mm256_add_epi32(finalRB, half);
|
||||
finalAG = _mm256_andnot_si256(colorMask, finalAG);
|
||||
finalRB = _mm256_srli_epi32(finalRB, 16);
|
||||
|
||||
dstVector = _mm256_or_si256(finalAG, finalRB);
|
||||
}
|
||||
|
||||
|
||||
// See BLEND_SOURCE_OVER_ARGB32_SSE2 for details.
|
||||
inline static void BLEND_SOURCE_OVER_ARGB32_AVX2(quint32 *dst, const quint32 *src, const int length)
|
||||
{
|
||||
@ -288,6 +331,64 @@ void QT_FASTCALL comp_func_SourceOver_avx2(uint *destPixels, const uint *srcPixe
|
||||
BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(dst, src, length, const_alpha);
|
||||
}
|
||||
|
||||
void QT_FASTCALL comp_func_SourceOver_rgb64_avx2(QRgba64 *dst, const QRgba64 *src, int length, uint const_alpha)
|
||||
{
|
||||
Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
|
||||
const __m256i half = _mm256_set1_epi32(0x8000);
|
||||
const __m256i one = _mm256_set1_epi32(0xffff);
|
||||
const __m256i colorMask = _mm256_set1_epi32(0x0000ffff);
|
||||
__m256i alphaMask = _mm256_set1_epi32(0xff000000);
|
||||
alphaMask = _mm256_unpacklo_epi8(alphaMask, alphaMask);
|
||||
const __m256i alphaShuffleMask = _mm256_set_epi8(char(0xff),char(0xff),15,14,char(0xff),char(0xff),15,14,char(0xff),char(0xff),7,6,char(0xff),char(0xff),7,6,
|
||||
char(0xff),char(0xff),15,14,char(0xff),char(0xff),15,14,char(0xff),char(0xff),7,6,char(0xff),char(0xff),7,6);
|
||||
|
||||
if (const_alpha == 255) {
|
||||
int x = 0;
|
||||
for (; x < length && (quintptr(dst + x) & 31); ++x)
|
||||
blend_pixel(dst[x], src[x]);
|
||||
for (; x < length - 3; x += 4) {
|
||||
const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
|
||||
if (!_mm256_testz_si256(srcVector, alphaMask)) {
|
||||
// Not all transparent
|
||||
if (_mm256_testc_si256(srcVector, alphaMask)) {
|
||||
// All opaque
|
||||
_mm256_store_si256((__m256i *)&dst[x], srcVector);
|
||||
} else {
|
||||
__m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
|
||||
alphaChannel = _mm256_sub_epi32(one, alphaChannel);
|
||||
__m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
|
||||
BYTE_MUL_RGB64_AVX2(dstVector, alphaChannel, colorMask, half);
|
||||
dstVector = _mm256_add_epi16(dstVector, srcVector);
|
||||
_mm256_store_si256((__m256i *)&dst[x], dstVector);
|
||||
}
|
||||
}
|
||||
}
|
||||
SIMD_EPILOGUE(x, length, 3)
|
||||
blend_pixel(dst[x], src[x]);
|
||||
} else {
|
||||
const __m256i constAlphaVector = _mm256_set1_epi32(const_alpha | (const_alpha << 8));
|
||||
int x = 0;
|
||||
for (; x < length && (quintptr(dst + x) & 31); ++x)
|
||||
blend_pixel(dst[x], src[x], const_alpha);
|
||||
for (; x < length - 3; x += 4) {
|
||||
__m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
|
||||
if (!_mm256_testz_si256(srcVector, alphaMask)) {
|
||||
// Not all transparent
|
||||
BYTE_MUL_RGB64_AVX2(srcVector, constAlphaVector, colorMask, half);
|
||||
|
||||
__m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
|
||||
alphaChannel = _mm256_sub_epi32(one, alphaChannel);
|
||||
__m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
|
||||
BYTE_MUL_RGB64_AVX2(dstVector, alphaChannel, colorMask, half);
|
||||
dstVector = _mm256_add_epi16(dstVector, srcVector);
|
||||
_mm256_store_si256((__m256i *)&dst[x], dstVector);
|
||||
}
|
||||
}
|
||||
SIMD_EPILOGUE(x, length, 3)
|
||||
blend_pixel(dst[x], src[x], const_alpha);
|
||||
}
|
||||
}
|
||||
|
||||
void QT_FASTCALL comp_func_Source_avx2(uint *dst, const uint *src, int length, uint const_alpha)
|
||||
{
|
||||
if (const_alpha == 255) {
|
||||
@ -319,6 +420,39 @@ void QT_FASTCALL comp_func_Source_avx2(uint *dst, const uint *src, int length, u
|
||||
}
|
||||
}
|
||||
|
||||
void QT_FASTCALL comp_func_Source_rgb64_avx2(QRgba64 *dst, const QRgba64 *src, int length, uint const_alpha)
|
||||
{
|
||||
Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
|
||||
if (const_alpha == 255) {
|
||||
::memcpy(dst, src, length * sizeof(QRgba64));
|
||||
} else {
|
||||
const uint ca = const_alpha | (const_alpha << 8); // adjust to [0-65535]
|
||||
const uint cia = 65535 - const_alpha;
|
||||
|
||||
int x = 0;
|
||||
|
||||
// 1) prologue, align on 32 bytes
|
||||
for (; x < length && (quintptr(dst + x) & 31); ++x)
|
||||
dst[x] = interpolate65535(src[x], ca, dst[x], cia);
|
||||
|
||||
// 2) interpolate pixels with AVX2
|
||||
const __m256i half = _mm256_set1_epi32(0x8000);
|
||||
const __m256i colorMask = _mm256_set1_epi32(0x0000ffff);
|
||||
const __m256i constAlphaVector = _mm256_set1_epi32(ca);
|
||||
const __m256i oneMinusConstAlpha = _mm256_set1_epi32(cia);
|
||||
for (; x < length - 3; x += 4) {
|
||||
const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
|
||||
__m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
|
||||
INTERPOLATE_PIXEL_RGB64_AVX2(srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
|
||||
_mm256_store_si256((__m256i *)&dst[x], dstVector);
|
||||
}
|
||||
|
||||
// 3) Epilogue
|
||||
SIMD_EPILOGUE(x, length, 3)
|
||||
dst[x] = interpolate65535(src[x], ca, dst[x], cia);
|
||||
}
|
||||
}
|
||||
|
||||
void QT_FASTCALL comp_func_solid_SourceOver_avx2(uint *destPixels, int length, uint color, uint const_alpha)
|
||||
{
|
||||
if ((const_alpha & qAlpha(color)) == 255) {
|
||||
@ -350,6 +484,37 @@ void QT_FASTCALL comp_func_solid_SourceOver_avx2(uint *destPixels, int length, u
|
||||
}
|
||||
}
|
||||
|
||||
void QT_FASTCALL comp_func_solid_SourceOver_rgb64_avx2(QRgba64 *destPixels, int length, QRgba64 color, uint const_alpha)
|
||||
{
|
||||
Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
|
||||
if (const_alpha == 255 && color.isOpaque()) {
|
||||
qt_memfill64((quint64*)destPixels, color, length);
|
||||
} else {
|
||||
if (const_alpha != 255)
|
||||
color = multiplyAlpha255(color, const_alpha);
|
||||
|
||||
const uint minusAlphaOfColor = ~ushort(color.alpha());
|
||||
int x = 0;
|
||||
quint64 *dst = (quint64 *) destPixels;
|
||||
const __m256i colorVector = _mm256_set1_epi64x(color);
|
||||
const __m256i colorMask = _mm256_set1_epi32(0x0000ffff);
|
||||
const __m256i half = _mm256_set1_epi32(0x8000);
|
||||
const __m256i minusAlphaOfColorVector = _mm256_set1_epi32(minusAlphaOfColor);
|
||||
|
||||
for (; x < length && (quintptr(dst + x) & 31); ++x)
|
||||
destPixels[x] = color + multiplyAlpha65535(destPixels[x], minusAlphaOfColor);
|
||||
|
||||
for (; x < length - 3; x += 4) {
|
||||
__m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
|
||||
BYTE_MUL_RGB64_AVX2(dstVector, minusAlphaOfColorVector, colorMask, half);
|
||||
dstVector = _mm256_add_epi16(colorVector, dstVector);
|
||||
_mm256_store_si256((__m256i *)&dst[x], dstVector);
|
||||
}
|
||||
SIMD_EPILOGUE(x, length, 3)
|
||||
destPixels[x] = color + multiplyAlpha65535(destPixels[x], minusAlphaOfColor);
|
||||
}
|
||||
}
|
||||
|
||||
#define interpolate_4_pixels_16_avx2(tlr1, tlr2, blr1, blr2, distx, disty, colorMask, v_256, b) \
|
||||
{ \
|
||||
/* Correct for later unpack */ \
|
||||
|
@ -299,6 +299,21 @@ inline QRgba64 rgbBlend(QRgba64 d, QRgba64 s, uint rgbAlpha)
|
||||
return blend;
|
||||
}
|
||||
|
||||
static Q_ALWAYS_INLINE void blend_pixel(QRgba64 &dst, QRgba64 src)
|
||||
{
|
||||
if (src.isOpaque())
|
||||
dst = src;
|
||||
else if (!src.isTransparent())
|
||||
dst = src + multiplyAlpha65535(dst, 65535 - src.alpha());
|
||||
}
|
||||
|
||||
static Q_ALWAYS_INLINE void blend_pixel(QRgba64 &dst, QRgba64 src, const int const_alpha)
|
||||
{
|
||||
if (!src.isTransparent()) {
|
||||
src = multiplyAlpha255(src, const_alpha);
|
||||
dst = src + multiplyAlpha65535(dst, 65535 - src.alpha());
|
||||
}
|
||||
}
|
||||
|
||||
QT_END_NAMESPACE
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user