Optimize unpremultiply using SSE rcp
Change-Id: I255031d354b0fde7abe8366ea2c86a35f9f24afd Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
parent
5e67c653db
commit
dfa434a979
@ -6293,6 +6293,8 @@ static void qInitDrawhelperFunctions()
|
||||
const QVector<QRgb> *, QDitherInfo *);
|
||||
extern void QT_FASTCALL storeRGBXFromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
|
||||
const QVector<QRgb> *, QDitherInfo *);
|
||||
extern void QT_FASTCALL destStore64ARGB32_sse4(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 *buffer, int length);
|
||||
extern void QT_FASTCALL destStore64RGBA8888_sse4(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 *buffer, int length);
|
||||
qPixelLayouts[QImage::Format_ARGB32].fetchToARGB32PM = fetchARGB32ToARGB32PM_sse4;
|
||||
qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_sse4;
|
||||
qPixelLayouts[QImage::Format_RGBA8888].fetchToARGB32PM = fetchRGBA8888ToARGB32PM_sse4;
|
||||
@ -6302,6 +6304,8 @@ static void qInitDrawhelperFunctions()
|
||||
qPixelLayouts[QImage::Format_RGBX8888].storeFromARGB32PM = storeRGBXFromARGB32PM_sse4;
|
||||
qPixelLayouts[QImage::Format_A2BGR30_Premultiplied].storeFromARGB32PM = storeA2RGB30PMFromARGB32PM_sse4<PixelOrderBGR>;
|
||||
qPixelLayouts[QImage::Format_A2RGB30_Premultiplied].storeFromARGB32PM = storeA2RGB30PMFromARGB32PM_sse4<PixelOrderRGB>;
|
||||
destStoreProc64[QImage::Format_ARGB32] = destStore64ARGB32_sse4;
|
||||
destStoreProc64[QImage::Format_RGBA8888] = destStore64RGBA8888_sse4;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -39,6 +39,7 @@
|
||||
|
||||
#include <private/qdrawhelper_p.h>
|
||||
#include <private/qdrawingprimitive_sse2_p.h>
|
||||
#include <private/qpaintengine_raster_p.h>
|
||||
|
||||
#if defined(QT_COMPILER_SUPPORTS_SSE4_1)
|
||||
|
||||
@ -93,6 +94,171 @@ static void convertARGBToARGB32PM_sse4(uint *buffer, const uint *src, int count)
|
||||
}
|
||||
}
|
||||
|
||||
// Computes an approximation of mul / a in each of the four float lanes.
// The coarse estimate produced by _mm_rcp_ps is sharpened with a single
// Newton-Raphson step, x' = 2x - x * (x * a), and then scaled by mul.
static inline __m128 reciprocal_mul_ps(__m128 a, float mul)
{
    const __m128 estimate = _mm_rcp_ps(a);                    // x ~= 1/a
    const __m128 doubled = _mm_add_ps(estimate, estimate);    // 2x
    const __m128 correction = _mm_mul_ps(estimate, _mm_mul_ps(estimate, a)); // x * (x * a)
    const __m128 refined = _mm_sub_ps(doubled, correction);
    return _mm_mul_ps(refined, _mm_set1_ps(mul));
}
|
||||
|
||||
template<bool RGBA, bool RGBx>
|
||||
static inline void convertARGBFromARGB32PM_sse4(uint *buffer, const uint *src, int count)
|
||||
{
|
||||
int i = 0;
|
||||
const __m128i alphaMask = _mm_set1_epi32(0xff000000);
|
||||
const __m128i rgbaMask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
|
||||
for (; i < count - 3; i += 4) {
|
||||
__m128i srcVector = _mm_loadu_si128((const __m128i *)&src[i]);
|
||||
if (!_mm_testz_si128(srcVector, alphaMask)) {
|
||||
if (!_mm_testc_si128(srcVector, alphaMask)) {
|
||||
__m128i srcVectorAlpha = _mm_srli_epi32(srcVector, 24);
|
||||
if (RGBA)
|
||||
srcVector = _mm_shuffle_epi8(srcVector, rgbaMask);
|
||||
const __m128 a = _mm_cvtepi32_ps(srcVectorAlpha);
|
||||
const __m128 ia = reciprocal_mul_ps(a, 255.0f);
|
||||
__m128i src1 = _mm_unpacklo_epi8(srcVector, zero);
|
||||
__m128i src3 = _mm_unpackhi_epi8(srcVector, zero);
|
||||
__m128i src2 = _mm_unpackhi_epi16(src1, zero);
|
||||
__m128i src4 = _mm_unpackhi_epi16(src3, zero);
|
||||
src1 = _mm_unpacklo_epi16(src1, zero);
|
||||
src3 = _mm_unpacklo_epi16(src3, zero);
|
||||
__m128 ia1 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
__m128 ia2 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
__m128 ia3 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
__m128 ia4 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src1), ia1));
|
||||
src2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src2), ia2));
|
||||
src3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src3), ia3));
|
||||
src4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src4), ia4));
|
||||
src1 = _mm_packus_epi32(src1, src2);
|
||||
src3 = _mm_packus_epi32(src3, src4);
|
||||
src1 = _mm_packus_epi16(src1, src3);
|
||||
// Handle potential alpha == 0 values:
|
||||
__m128i srcVectorAlphaMask = _mm_cmpeq_epi32(srcVectorAlpha, zero);
|
||||
src1 = _mm_andnot_si128(srcVectorAlphaMask, src1);
|
||||
// Fixup alpha values:
|
||||
if (RGBx)
|
||||
srcVector = _mm_or_si128(src1, alphaMask);
|
||||
else
|
||||
srcVector = _mm_blendv_epi8(src1, srcVector, alphaMask);
|
||||
_mm_storeu_si128((__m128i *)&buffer[i], srcVector);
|
||||
} else {
|
||||
if (RGBA)
|
||||
_mm_storeu_si128((__m128i *)&buffer[i], _mm_shuffle_epi8(srcVector, rgbaMask));
|
||||
else if (buffer != src)
|
||||
_mm_storeu_si128((__m128i *)&buffer[i], srcVector);
|
||||
}
|
||||
} else {
|
||||
if (RGBx)
|
||||
_mm_storeu_si128((__m128i *)&buffer[i], alphaMask);
|
||||
else
|
||||
_mm_storeu_si128((__m128i *)&buffer[i], zero);
|
||||
}
|
||||
}
|
||||
|
||||
SIMD_EPILOGUE(i, count, 3) {
|
||||
uint v = qUnpremultiply_sse4(src[i]);
|
||||
if (RGBx)
|
||||
v = 0xff000000 | v;
|
||||
if (RGBA)
|
||||
v = ARGB2RGBA(v);
|
||||
buffer[i] = v;
|
||||
}
|
||||
}
|
||||
|
||||
template<bool RGBA>
|
||||
static inline void convertARGBFromRGBA64PM_sse4(uint *buffer, const QRgba64 *src, int count)
|
||||
{
|
||||
int i = 0;
|
||||
const __m128i alphaMask = _mm_set1_epi64x(Q_UINT64_C(0xffff) << 48);
|
||||
const __m128i alphaMask32 = _mm_set1_epi32(0xff000000);
|
||||
const __m128i rgbaMask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
|
||||
for (; i < count - 3; i += 4) {
|
||||
__m128i srcVector1 = _mm_loadu_si128((const __m128i *)&src[i]);
|
||||
__m128i srcVector2 = _mm_loadu_si128((const __m128i *)&src[i + 2]);
|
||||
bool transparent1 = _mm_testz_si128(srcVector1, alphaMask);
|
||||
bool opaque1 = _mm_testc_si128(srcVector1, alphaMask);
|
||||
bool transparent2 = _mm_testz_si128(srcVector2, alphaMask);
|
||||
bool opaque2 = _mm_testc_si128(srcVector2, alphaMask);
|
||||
|
||||
if (!(transparent1 && transparent2)) {
|
||||
if (!(opaque1 && opaque2)) {
|
||||
__m128i srcVector1Alpha = _mm_srli_epi64(srcVector1, 48);
|
||||
__m128i srcVector2Alpha = _mm_srli_epi64(srcVector2, 48);
|
||||
__m128i srcVectorAlpha = _mm_packus_epi32(srcVector1Alpha, srcVector2Alpha);
|
||||
const __m128 a = _mm_cvtepi32_ps(srcVectorAlpha);
|
||||
// Convert srcVectorAlpha to final 8-bit alpha channel
|
||||
srcVectorAlpha = _mm_add_epi32(srcVectorAlpha, _mm_set1_epi32(128));
|
||||
srcVectorAlpha = _mm_sub_epi32(srcVectorAlpha, _mm_srli_epi32(srcVectorAlpha, 8));
|
||||
srcVectorAlpha = _mm_srli_epi32(srcVectorAlpha, 8);
|
||||
srcVectorAlpha = _mm_slli_epi32(srcVectorAlpha, 24);
|
||||
const __m128 ia = reciprocal_mul_ps(a, 255.0f);
|
||||
__m128i src1 = _mm_unpacklo_epi16(srcVector1, zero);
|
||||
__m128i src2 = _mm_unpackhi_epi16(srcVector1, zero);
|
||||
__m128i src3 = _mm_unpacklo_epi16(srcVector2, zero);
|
||||
__m128i src4 = _mm_unpackhi_epi16(srcVector2, zero);
|
||||
__m128 ia1 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
__m128 ia2 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
__m128 ia3 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
__m128 ia4 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src1), ia1));
|
||||
src2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src2), ia2));
|
||||
src3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src3), ia3));
|
||||
src4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src4), ia4));
|
||||
src1 = _mm_packus_epi32(src1, src2);
|
||||
src3 = _mm_packus_epi32(src3, src4);
|
||||
// Handle potential alpha == 0 values:
|
||||
__m128i srcVector1AlphaMask = _mm_cmpeq_epi64(srcVector1Alpha, zero);
|
||||
__m128i srcVector2AlphaMask = _mm_cmpeq_epi64(srcVector2Alpha, zero);
|
||||
src1 = _mm_andnot_si128(srcVector1AlphaMask, src1);
|
||||
src3 = _mm_andnot_si128(srcVector2AlphaMask, src3);
|
||||
src1 = _mm_packus_epi16(src1, src3);
|
||||
// Fixup alpha values:
|
||||
src1 = _mm_blendv_epi8(src1, srcVectorAlpha, alphaMask32);
|
||||
// Fix RGB order
|
||||
if (!RGBA)
|
||||
src1 = _mm_shuffle_epi8(src1, rgbaMask);
|
||||
_mm_storeu_si128((__m128i *)&buffer[i], src1);
|
||||
} else {
|
||||
__m128i src1 = _mm_unpacklo_epi16(srcVector1, zero);
|
||||
__m128i src2 = _mm_unpackhi_epi16(srcVector1, zero);
|
||||
__m128i src3 = _mm_unpacklo_epi16(srcVector2, zero);
|
||||
__m128i src4 = _mm_unpackhi_epi16(srcVector2, zero);
|
||||
src1 = _mm_add_epi32(src1, _mm_set1_epi32(128));
|
||||
src2 = _mm_add_epi32(src2, _mm_set1_epi32(128));
|
||||
src3 = _mm_add_epi32(src3, _mm_set1_epi32(128));
|
||||
src4 = _mm_add_epi32(src4, _mm_set1_epi32(128));
|
||||
src1 = _mm_sub_epi32(src1, _mm_srli_epi32(src1, 8));
|
||||
src2 = _mm_sub_epi32(src2, _mm_srli_epi32(src2, 8));
|
||||
src3 = _mm_sub_epi32(src3, _mm_srli_epi32(src3, 8));
|
||||
src4 = _mm_sub_epi32(src4, _mm_srli_epi32(src4, 8));
|
||||
src1 = _mm_srli_epi32(src1, 8);
|
||||
src2 = _mm_srli_epi32(src2, 8);
|
||||
src3 = _mm_srli_epi32(src3, 8);
|
||||
src4 = _mm_srli_epi32(src4, 8);
|
||||
src1 = _mm_packus_epi32(src1, src2);
|
||||
src3 = _mm_packus_epi32(src3, src4);
|
||||
src1 = _mm_packus_epi16(src1, src3);
|
||||
if (!RGBA)
|
||||
src1 = _mm_shuffle_epi8(src1, rgbaMask);
|
||||
_mm_storeu_si128((__m128i *)&buffer[i], src1);
|
||||
}
|
||||
} else {
|
||||
_mm_storeu_si128((__m128i *)&buffer[i], zero);
|
||||
}
|
||||
}
|
||||
|
||||
SIMD_EPILOGUE(i, count, 3) {
|
||||
buffer[i] = qConvertRgba64ToRgb32_sse4<RGBA ? PixelOrderRGB : PixelOrderBGR>(src[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void QT_FASTCALL convertARGB32ToARGB32PM_sse4(uint *buffer, int count, const QVector<QRgb> *)
|
||||
{
|
||||
convertARGBToARGB32PM_sse4<false>(buffer, buffer, count);
|
||||
@ -121,32 +287,28 @@ void QT_FASTCALL storeRGB32FromARGB32PM_sse4(uchar *dest, const uint *src, int i
|
||||
const QVector<QRgb> *, QDitherInfo *)
|
||||
{
|
||||
uint *d = reinterpret_cast<uint *>(dest) + index;
|
||||
for (int i = 0; i < count; ++i)
|
||||
d[i] = 0xff000000 | qUnpremultiply_sse4(src[i]);
|
||||
convertARGBFromARGB32PM_sse4<false,true>(d, src, count);
|
||||
}
|
||||
|
||||
// Writes `count` unpremultiplied pixels converted from src into dest, which
// is addressed as an array of 32-bit pixels starting `index` pixels in.
// The two trailing parameters are unused.
//
// The per-pixel scalar loop that used to precede the vectorised call has been
// dropped: the call rewrites every one of the `count` pixels, so the loop was
// duplicate work and would convert twice if dest ever aliased src.
void QT_FASTCALL storeARGB32FromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
                                              const QVector<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    convertARGBFromARGB32PM_sse4<false,false>(d, src, count);
}
|
||||
|
||||
// Writes `count` unpremultiplied pixels converted from src into dest, using
// the byte-swapped (RGBA = true) variant of the converter. dest is addressed
// as an array of 32-bit pixels starting `index` pixels in. The two trailing
// parameters are unused.
//
// The per-pixel scalar loop that used to precede the vectorised call has been
// dropped: the call rewrites every one of the `count` pixels, so the loop was
// duplicate work and would convert twice if dest ever aliased src.
void QT_FASTCALL storeRGBA8888FromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
                                                const QVector<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    convertARGBFromARGB32PM_sse4<true,false>(d, src, count);
}
|
||||
|
||||
// Writes `count` unpremultiplied pixels converted from src into dest, using
// the byte-swapped (RGBA = true), forced-opaque (RGBx = true) variant of the
// converter. dest is addressed as an array of 32-bit pixels starting `index`
// pixels in. The two trailing parameters are unused.
//
// The per-pixel scalar loop that used to precede the vectorised call has been
// dropped: the call rewrites every one of the `count` pixels, so the loop was
// duplicate work and would convert twice if dest ever aliased src.
void QT_FASTCALL storeRGBXFromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
                                            const QVector<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    convertARGBFromARGB32PM_sse4<true,true>(d, src, count);
}
|
||||
|
||||
template<QtPixelOrder PixelOrder>
|
||||
@ -158,6 +320,18 @@ void QT_FASTCALL storeA2RGB30PMFromARGB32PM_sse4(uchar *dest, const uint *src, i
|
||||
d[i] = qConvertArgb32ToA2rgb30_sse4<PixelOrder>(src[i]);
|
||||
}
|
||||
|
||||
// Converts `length` QRgba64 pixels from buffer and writes them into scan line
// y of rasterBuffer, beginning at 32-bit pixel x. Uses the RGBA = false
// variant of the converter.
void QT_FASTCALL destStore64ARGB32_sse4(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 *buffer, int length)
{
    uint *const lineStart = (uint*)rasterBuffer->scanLine(y);
    uint *const target = lineStart + x;
    convertARGBFromRGBA64PM_sse4<false>(target, buffer, length);
}
|
||||
|
||||
// Converts `length` QRgba64 pixels from buffer and writes them into scan line
// y of rasterBuffer, beginning at 32-bit pixel x. Uses the RGBA = true
// variant of the converter.
void QT_FASTCALL destStore64RGBA8888_sse4(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 *buffer, int length)
{
    uint *const lineStart = (uint*)rasterBuffer->scanLine(y);
    uint *const target = lineStart + x;
    convertARGBFromRGBA64PM_sse4<true>(target, buffer, length);
}
|
||||
|
||||
template
|
||||
void QT_FASTCALL storeA2RGB30PMFromARGB32PM_sse4<PixelOrderBGR>(uchar *dest, const uint *src, int index, int count,
|
||||
const QVector<QRgb> *, QDitherInfo *);
|
||||
|
@ -43,6 +43,7 @@
|
||||
#include <QtGui/private/qtguiglobal_p.h>
|
||||
#include <private/qsimd_p.h>
|
||||
#include "qdrawhelper_p.h"
|
||||
#include "qrgba64_p.h"
|
||||
|
||||
#ifdef __SSE2__
|
||||
|
||||
@ -230,21 +231,31 @@ QT_END_NAMESPACE
|
||||
|
||||
QT_BEGIN_NAMESPACE
|
||||
#if QT_COMPILER_SUPPORTS_HERE(SSE4_1)
|
||||
// Scalar counterpart of a packed reciprocal-multiply: takes lane 0 of a,
// computes an approximation of mul / a[0] (hardware estimate refined by one
// Newton-Raphson step, x' = 2x - x * (x * a)), and writes that value to all
// four lanes of ia.
QT_FUNCTION_TARGET(SSE2)
Q_ALWAYS_INLINE void reciprocal_mul_ss(__m128 &ia, const __m128 a, float mul)
{
    const __m128 estimate = _mm_rcp_ss(a);                     // x ~= 1/a
    const __m128 doubled = _mm_add_ss(estimate, estimate);     // 2x
    const __m128 correction = _mm_mul_ss(estimate, _mm_mul_ss(estimate, a)); // x * (x * a)
    const __m128 scaled = _mm_mul_ss(_mm_sub_ss(doubled, correction), _mm_set_ss(mul));
    // Only lane 0 is meaningful at this point; broadcast it.
    ia = _mm_shuffle_ps(scaled, scaled, _MM_SHUFFLE(0,0,0,0));
}
|
||||
|
||||
// Returns p with its colour channels divided by its alpha (scaled by 255).
//   alpha == 255: p is returned unchanged.
//   alpha == 0:   0 is returned.
// Otherwise each byte of p is widened to a 32-bit lane, multiplied by the
// approximate 255/alpha from reciprocal_mul_ss, converted back to integer
// with _mm_cvtps_epi32 and narrowed with unsigned saturation.
//
// This body is the reciprocal-based implementation only. The lines of the
// earlier lookup-table implementation (qt_inv_premul_factor, integer `via`,
// _mm_mullo_epi32 / _mm_srai_epi32) that were interleaved with it have been
// removed: together the two versions declared `via` twice with different
// types and did not form a valid function.
QT_FUNCTION_TARGET(SSE4_1)
inline QRgb qUnpremultiply_sse4(QRgb p)
{
    const uint alpha = qAlpha(p);
    if (alpha == 255)
        return p;
    if (alpha == 0)
        return 0;
    const __m128 va = _mm_set1_ps(alpha);
    __m128 via;
    reciprocal_mul_ss(via, va, 255.0f); // Approximate 255/alpha in every lane
    __m128i vl = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(p));
    vl = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(vl), via));
    vl = _mm_packus_epi32(vl, vl);
    // Restore the original alpha in 16-bit lane 3 before the final narrowing.
    vl = _mm_insert_epi16(vl, alpha, 3);
    vl = _mm_packus_epi16(vl, vl);
    return _mm_cvtsi128_si32(vl);
}
|
||||
@ -258,21 +269,14 @@ inline uint qConvertArgb32ToA2rgb30_sse4(QRgb p)
|
||||
return qConvertRgb32ToRgb30<PixelOrder>(p);
|
||||
if (alpha == 0)
|
||||
return 0;
|
||||
Q_CONSTEXPR uint mult = 255 / (255 >> 6);
|
||||
const uint invAlpha = qt_inv_premul_factor[alpha];
|
||||
Q_CONSTEXPR float mult = 1023.0f / (255 >> 6);
|
||||
const uint newalpha = (alpha >> 6);
|
||||
const __m128i via = _mm_set1_epi32(invAlpha);
|
||||
const __m128i vna = _mm_set1_epi32(mult * newalpha);
|
||||
const __m128i vr1 = _mm_set1_epi32(0x1000);
|
||||
const __m128i vr2 = _mm_set1_epi32(0x80);
|
||||
__m128i vl = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(p));
|
||||
vl = _mm_mullo_epi32(vl, via);
|
||||
vl = _mm_add_epi32(vl, vr1);
|
||||
vl = _mm_srli_epi32(vl, 14);
|
||||
vl = _mm_mullo_epi32(vl, vna);
|
||||
vl = _mm_add_epi32(vl, _mm_srli_epi32(vl, 8));
|
||||
vl = _mm_add_epi32(vl, vr2);
|
||||
vl = _mm_srli_epi32(vl, 8);
|
||||
const __m128 va = _mm_set1_ps(alpha);
|
||||
__m128 via;
|
||||
reciprocal_mul_ss(via, va, mult * newalpha);
|
||||
__m128i vl = _mm_cvtsi32_si128(p);
|
||||
vl = _mm_cvtepu8_epi32(vl);
|
||||
vl = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(vl), via));
|
||||
vl = _mm_packus_epi32(vl, vl);
|
||||
uint rgb30 = (newalpha << 30);
|
||||
rgb30 |= ((uint)_mm_extract_epi16(vl, 1)) << 10;
|
||||
@ -285,6 +289,27 @@ inline uint qConvertArgb32ToA2rgb30_sse4(QRgb p)
|
||||
}
|
||||
return rgb30;
|
||||
}
|
||||
|
||||
template<enum QtPixelOrder PixelOrder>
|
||||
QT_FUNCTION_TARGET(SSE4_1)
|
||||
inline uint qConvertRgba64ToRgb32_sse4(QRgba64 p)
|
||||
{
|
||||
if (p.isTransparent())
|
||||
return 0;
|
||||
__m128i vl = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&p));
|
||||
if (!p.isOpaque()) {
|
||||
const __m128 va = _mm_set1_ps(p.alpha());
|
||||
__m128 via;
|
||||
reciprocal_mul_ss(via, va, 65535.0f);
|
||||
vl = _mm_unpacklo_epi16(vl, _mm_setzero_si128());
|
||||
vl = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(vl) , via));
|
||||
vl = _mm_packus_epi32(vl, vl);
|
||||
vl = _mm_insert_epi16(vl, p.alpha(), 3);
|
||||
}
|
||||
if (PixelOrder == PixelOrderBGR)
|
||||
vl = _mm_shufflelo_epi16(vl, _MM_SHUFFLE(3, 0, 1, 2));
|
||||
return toArgb32(vl);
|
||||
}
|
||||
#endif
|
||||
QT_END_NAMESPACE
|
||||
|
||||
|
@ -127,6 +127,10 @@ public:
|
||||
|
||||
Q_DECL_RELAXED_CONSTEXPR QRgba64 premultiplied() const
|
||||
{
|
||||
if (isOpaque())
|
||||
return *this;
|
||||
if (isTransparent())
|
||||
return QRgba64::fromRgba64(0);
|
||||
const quint32 a = alpha();
|
||||
const quint16 r = div_65535(red() * a);
|
||||
const quint16 g = div_65535(green() * a);
|
||||
|
@ -186,7 +186,8 @@ inline QRgba64 addWithSaturation(QRgba64 a, QRgba64 b)
|
||||
qMin(a.alpha() + b.alpha(), 65535));
|
||||
}
|
||||
|
||||
#if defined __SSE2__
|
||||
#if QT_COMPILER_SUPPORTS_HERE(SSE2)
|
||||
QT_FUNCTION_TARGET(SSE2)
|
||||
Q_ALWAYS_INLINE uint toArgb32(__m128i v)
|
||||
{
|
||||
v = _mm_unpacklo_epi16(v, _mm_setzero_si128());
|
||||
|
@ -1492,10 +1492,28 @@ void tst_QColor::unpremultiply_sse4()
|
||||
// Tests that qUnpremultiply_sse4 returns the same as qUnpremultiply.
|
||||
#if QT_COMPILER_SUPPORTS_HERE(SSE4_1)
|
||||
if (qCpuHasFeature(SSE4_1)) {
|
||||
int minorDifferences = 0;
|
||||
for (uint a = 0; a < 256; a++) {
|
||||
for (uint c = 0; c <= a; c++) {
|
||||
const QRgb p = qRgba(c, a-c, c/2, a);
|
||||
const uint u = qUnpremultiply(p);
|
||||
const uint usse4 = qUnpremultiply_sse4(p);
|
||||
if (u != usse4) {
|
||||
QCOMPARE(qAlpha(u), qAlpha(usse4));
|
||||
QVERIFY(qAbs(qRed(u) - qRed(usse4)) <= 1);
|
||||
QVERIFY(qAbs(qGreen(u) - qGreen(usse4)) <= 1);
|
||||
QVERIFY(qAbs(qBlue(u) - qBlue(usse4)) <= 1);
|
||||
++minorDifferences;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Allow a few rounding differences as long as it still obeys
|
||||
// the qPremultiply(qUnpremultiply(x)) == x invariant
|
||||
QVERIFY(minorDifferences <= 16 * 255);
|
||||
for (uint a = 0; a < 256; a++) {
|
||||
for (uint c = 0; c <= a; c++) {
|
||||
QRgb p = qRgba(c, a-c, c, a);
|
||||
QCOMPARE(qUnpremultiply(p), qUnpremultiply_sse4(p));
|
||||
QCOMPARE(p, qPremultiply(qUnpremultiply_sse4(p)));
|
||||
}
|
||||
}
|
||||
return;
|
||||
|
Loading…
Reference in New Issue
Block a user