Revert "Remove separate SSE4 unpremultiply function"
Could cause SSE4 instructions to be used on non-SSE4 machines
in cases when qUnpremultiply was not inlined.
This reverts commit 964ccc5853.
Change-Id: Ic676ade8f75129e8d37c4d96cbfb2bdb5b794919
Task-number: QTBUG-45741
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
parent
6535912add
commit
63d5a42b59
@ -117,7 +117,7 @@ static const uint *QT_FASTCALL convertRGB32ToARGB32PM(uint *buffer, const uint *
|
||||
return buffer;
|
||||
}
|
||||
|
||||
#if defined(QT_COMPILER_SUPPORTS_SSE4_1) && !defined(__SSE4_1__)
|
||||
#ifdef QT_COMPILER_SUPPORTS_SSE4_1
|
||||
extern const uint *QT_FASTCALL convertRGB32FromARGB32PM_sse4(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *);
|
||||
#endif
|
||||
|
||||
@ -144,7 +144,7 @@ void convert_generic(QImageData *dest, const QImageData *src, Qt::ImageConversio
|
||||
if (src->format == QImage::Format_RGB32)
|
||||
convertToARGB32PM = convertRGB32ToARGB32PM;
|
||||
if (dest->format == QImage::Format_RGB32) {
|
||||
#if defined(QT_COMPILER_SUPPORTS_SSE4_1) && !defined(__SSE4_1__)
|
||||
#ifdef QT_COMPILER_SUPPORTS_SSE4_1
|
||||
if (qCpuHasFeature(SSE4_1))
|
||||
convertFromARGB32PM = convertRGB32FromARGB32PM_sse4;
|
||||
else
|
||||
@ -193,7 +193,7 @@ bool convert_generic_inplace(QImageData *data, QImage::Format dst_format, Qt::Im
|
||||
if (data->format == QImage::Format_RGB32)
|
||||
convertToARGB32PM = convertRGB32ToARGB32PM;
|
||||
if (dst_format == QImage::Format_RGB32) {
|
||||
#if defined(QT_COMPILER_SUPPORTS_SSE4_1) && !defined(__SSE4_1__)
|
||||
#ifdef QT_COMPILER_SUPPORTS_SSE4_1
|
||||
if (qCpuHasFeature(SSE4_1))
|
||||
convertFromARGB32PM = convertRGB32FromARGB32PM_sse4;
|
||||
else
|
||||
|
@ -33,6 +33,7 @@
|
||||
|
||||
#include <qimage.h>
|
||||
#include <private/qdrawhelper_p.h>
|
||||
#include <private/qdrawingprimitive_sse2_p.h>
|
||||
#include <private/qimage_p.h>
|
||||
#include <private/qsimd_p.h>
|
||||
|
||||
@ -44,7 +45,7 @@ const uint *QT_FASTCALL convertRGB32FromARGB32PM_sse4(uint *buffer, const uint *
|
||||
const QPixelLayout *, const QRgb *)
|
||||
{
|
||||
for (int i = 0; i < count; ++i)
|
||||
buffer[i] = 0xff000000 | qUnpremultiply(src[i]);
|
||||
buffer[i] = 0xff000000 | qUnpremultiply_sse4(src[i]);
|
||||
return buffer;
|
||||
}
|
||||
|
||||
|
@ -6704,22 +6704,24 @@ void qInitDrawhelperAsm()
|
||||
}
|
||||
#endif // SSSE3
|
||||
|
||||
#if defined(QT_COMPILER_SUPPORTS_SSE4_1) && !defined(__SSE4_1__)
|
||||
#if QT_COMPILER_SUPPORTS_SSE4_1
|
||||
if (qCpuHasFeature(SSE4_1)) {
|
||||
#if !defined(__SSE4_1__)
|
||||
extern const uint *QT_FASTCALL convertARGB32ToARGB32PM_sse4(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *);
|
||||
extern const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_sse4(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *);
|
||||
qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_sse4;
|
||||
qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_sse4;
|
||||
#endif
|
||||
extern const uint *QT_FASTCALL convertARGB32FromARGB32PM_sse4(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *);
|
||||
extern const uint *QT_FASTCALL convertRGBA8888FromARGB32PM_sse4(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *);
|
||||
extern const uint *QT_FASTCALL convertRGBXFromARGB32PM_sse4(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *);
|
||||
qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_sse4;
|
||||
qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_sse4;
|
||||
qPixelLayouts[QImage::Format_ARGB32].convertFromARGB32PM = convertARGB32FromARGB32PM_sse4;
|
||||
qPixelLayouts[QImage::Format_RGBA8888].convertFromARGB32PM = convertRGBA8888FromARGB32PM_sse4;
|
||||
qPixelLayouts[QImage::Format_RGBX8888].convertFromARGB32PM = convertRGBXFromARGB32PM_sse4;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(QT_COMPILER_SUPPORTS_AVX2) && !defined(__AVX2__)
|
||||
#if QT_COMPILER_SUPPORTS_AVX2 && !defined(__AVX2__)
|
||||
if (qCpuHasFeature(AVX2)) {
|
||||
extern const uint *QT_FASTCALL convertARGB32ToARGB32PM_avx2(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *);
|
||||
extern const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_avx2(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *);
|
||||
|
@ -32,6 +32,7 @@
|
||||
****************************************************************************/
|
||||
|
||||
#include <private/qdrawhelper_p.h>
|
||||
#include <private/qdrawingprimitive_sse2_p.h>
|
||||
|
||||
#if defined(QT_COMPILER_SUPPORTS_SSE4_1)
|
||||
|
||||
@ -53,7 +54,7 @@ const uint *QT_FASTCALL convertARGB32FromARGB32PM_sse4(uint *buffer, const uint
|
||||
const QPixelLayout *, const QRgb *)
|
||||
{
|
||||
for (int i = 0; i < count; ++i)
|
||||
buffer[i] = qUnpremultiply(src[i]);
|
||||
buffer[i] = qUnpremultiply_sse4(src[i]);
|
||||
return buffer;
|
||||
}
|
||||
|
||||
@ -61,7 +62,7 @@ const uint *QT_FASTCALL convertRGBA8888FromARGB32PM_sse4(uint *buffer, const uin
|
||||
const QPixelLayout *, const QRgb *)
|
||||
{
|
||||
for (int i = 0; i < count; ++i)
|
||||
buffer[i] = ARGB2RGBA(qUnpremultiply(src[i]));
|
||||
buffer[i] = ARGB2RGBA(qUnpremultiply_sse4(src[i]));
|
||||
return buffer;
|
||||
}
|
||||
|
||||
@ -69,7 +70,7 @@ const uint *QT_FASTCALL convertRGBXFromARGB32PM_sse4(uint *buffer, const uint *s
|
||||
const QPixelLayout *, const QRgb *)
|
||||
{
|
||||
for (int i = 0; i < count; ++i)
|
||||
buffer[i] = ARGB2RGBA(0xff000000 | qUnpremultiply(src[i]));
|
||||
buffer[i] = ARGB2RGBA(0xff000000 | qUnpremultiply_sse4(src[i]));
|
||||
return buffer;
|
||||
}
|
||||
|
||||
|
@ -236,4 +236,25 @@ QT_END_NAMESPACE
|
||||
|
||||
#endif // __SSE2__
|
||||
|
||||
QT_BEGIN_NAMESPACE
|
||||
#if QT_COMPILER_SUPPORTS_HERE(SSE4_1)
|
||||
// Converts one premultiplied ARGB pixel to its non-premultiplied form using
// SSE4.1 intrinsics. The three colour channels are scaled by the fixed-point
// factor looked up in qt_inv_premul_factor[alpha]; the alpha channel is
// carried over unchanged.
//
// There is no early-out for any alpha value here: every pixel goes through the
// table lookup and the vector multiply.
// NOTE(review): QT_FUNCTION_TARGET(SSE4_1) presumably enables SSE4.1 code
// generation for this function only; callers are expected to have checked the
// CPU feature at runtime -- confirm against qsimd_p.h.
QT_FUNCTION_TARGET(SSE4_1)
|
||||
inline QRgb qUnpremultiply_sse4(QRgb p)
|
||||
{
|
||||
const uint alpha = qAlpha(p);
|
||||
const uint invAlpha = qt_inv_premul_factor[alpha];
|
||||
// Broadcast the scale factor to all four 32-bit lanes.
const __m128i via = _mm_set1_epi32(invAlpha);
|
||||
// 0x8000 is half of 1 << 16: added before the shift below so the result is
// rounded rather than truncated.
const __m128i vr = _mm_set1_epi32(0x8000);
|
||||
// Zero-extend the four bytes of the pixel into four 32-bit lanes (SSE4.1).
__m128i vl = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(p));
|
||||
// Per-lane 32-bit multiply, keeping the low 32 bits of each product (SSE4.1).
vl = _mm_mullo_epi32(vl, via);
|
||||
vl = _mm_add_epi32(vl, vr);
|
||||
// Drop the 16 fractional bits of the fixed-point product.
vl = _mm_srai_epi32(vl, 16);
|
||||
// Lane 3 held the scaled alpha byte; overwrite it with the original alpha.
vl = _mm_insert_epi32(vl, alpha, 3);
|
||||
// Narrow 32 -> 16 -> 8 bits with unsigned saturation, so each lane ends up as
// one byte of the low 32-bit word.
vl = _mm_packus_epi32(vl, _mm_setzero_si128());
|
||||
vl = _mm_packus_epi16(vl, _mm_setzero_si128());
|
||||
// Return the low 32 bits, i.e. the repacked pixel.
return _mm_cvtsi128_si32(vl);
|
||||
}
|
||||
#endif
|
||||
QT_END_NAMESPACE
|
||||
|
||||
#endif // QDRAWINGPRIMITIVE_SSE2_P_H
|
||||
|
@ -36,11 +36,6 @@
|
||||
|
||||
#include <QtCore/qglobal.h>
|
||||
#include <QtCore/qprocessordetection.h>
|
||||
#if defined(__SSE4_1__)
|
||||
#include <smmintrin.h>
|
||||
#elif defined(__SSE2__)
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
QT_BEGIN_NAMESPACE
|
||||
|
||||
@ -92,45 +87,19 @@ inline Q_DECL_RELAXED_CONSTEXPR QRgb qPremultiply(QRgb x)
|
||||
|
||||
Q_GUI_EXPORT extern const uint qt_inv_premul_factor[];
|
||||
|
||||
#if defined(__SSE2__)
|
||||
inline QRgb qUnpremultiply(QRgb p)
|
||||
{
|
||||
const uint alpha = qAlpha(p);
|
||||
if (alpha == 255 || alpha == 0)
|
||||
return p;
|
||||
const uint invAlpha = qt_inv_premul_factor[alpha];
|
||||
const __m128i via = _mm_set1_epi32(invAlpha);
|
||||
const __m128i vr = _mm_set1_epi32(0x8000);
|
||||
#ifdef __SSE4_1__
|
||||
__m128i vl = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(p));
|
||||
vl = _mm_mullo_epi32(vl, via);
|
||||
#else
|
||||
__m128i vl = _mm_unpacklo_epi8(_mm_cvtsi32_si128(p), _mm_setzero_si128());
|
||||
vl = _mm_unpacklo_epi16(vl, vl);
|
||||
__m128i vll = _mm_mullo_epi16(vl, via);
|
||||
__m128i vlh = _mm_mulhi_epu16(vl, via);
|
||||
vl = _mm_add_epi32(vll, _mm_slli_epi32(vlh, 16));
|
||||
#endif
|
||||
vl = _mm_add_epi32(vl, vr);
|
||||
vl = _mm_srli_epi32(vl, 16);
|
||||
vl = _mm_packs_epi32(vl, _mm_setzero_si128());
|
||||
vl = _mm_insert_epi16(vl, alpha, 3);
|
||||
vl = _mm_packus_epi16(vl, _mm_setzero_si128());
|
||||
return _mm_cvtsi128_si32(vl);
|
||||
}
|
||||
#else
|
||||
inline QRgb qUnpremultiply(QRgb p)
|
||||
{
|
||||
const uint alpha = qAlpha(p);
|
||||
// Alpha 255 and 0 are the two most common values, which makes them beneficial to short-cut.
|
||||
if (alpha == 255 || alpha == 0)
|
||||
if (alpha == 255)
|
||||
return p;
|
||||
if (alpha == 0)
|
||||
return 0;
|
||||
// (p*(0x00ff00ff/alpha)) >> 16 == (p*255)/alpha for all p and alpha <= 256.
|
||||
const uint invAlpha = qt_inv_premul_factor[alpha];
|
||||
// We add 0x8000 to get even rounding. The rounding also ensures that qPremultiply(qUnpremultiply(p)) == p for all p.
|
||||
return qRgba((qRed(p)*invAlpha + 0x8000)>>16, (qGreen(p)*invAlpha + 0x8000)>>16, (qBlue(p)*invAlpha + 0x8000)>>16, alpha);
|
||||
}
|
||||
#endif
|
||||
|
||||
QT_END_NAMESPACE
|
||||
|
||||
|
@ -38,6 +38,7 @@
|
||||
|
||||
#include <qcolor.h>
|
||||
#include <qdebug.h>
|
||||
#include <private/qdrawingprimitive_sse2_p.h>
|
||||
|
||||
class tst_QColor : public QObject
|
||||
{
|
||||
@ -103,6 +104,7 @@ private slots:
|
||||
void achromaticHslHue();
|
||||
|
||||
void premultiply();
|
||||
void unpremultiply_sse4();
|
||||
|
||||
#ifdef Q_DEAD_CODE_FROM_QT4_X11
|
||||
void setallowX11ColorNames();
|
||||
@ -1445,5 +1447,22 @@ void tst_QColor::premultiply()
|
||||
}
|
||||
}
|
||||
|
||||
// Cross-checks qUnpremultiply_sse4() against qUnpremultiply() over a sweep of
// pixel values. The test is skipped when the SSE4.1 branch below is not taken.
void tst_QColor::unpremultiply_sse4()
|
||||
{
|
||||
// Tests that qUnpremultiply_sse4 returns the same as qUnpremultiply.
|
||||
// NOTE(review): presumably true only when the compiler can emit SSE4.1 code
// in this translation unit -- confirm against qsimd_p.h.
#if QT_COMPILER_SUPPORTS_HERE(SSE4_1)
|
||||
// Only call the SSE4.1 function when the runtime CPU check passes.
if (qCpuHasFeature(SSE4_1)) {
|
||||
// Sweep every alpha value a in [0, 255] ...
for (uint a = 0; a < 256; a++) {
|
||||
// ... and, for each alpha, every channel value c in [0, a].
for (uint c = 0; c <= a; c++) {
|
||||
// Red and blue are c, green is a - c, so no colour channel exceeds alpha.
QRgb p = qRgba(c, a-c, c, a);
|
||||
QCOMPARE(qUnpremultiply(p), qUnpremultiply_sse4(p));
|
||||
}
|
||||
}
|
||||
// The comparison ran; do not fall through to the QSKIP below.
return;
|
||||
}
|
||||
#endif
|
||||
// Reached when the runtime CPU check fails, and also when the #if block above
// is compiled out (in which case the message's "on this CPU" is imprecise).
QSKIP("SSE4 not supported on this CPU.");
|
||||
}
|
||||
|
||||
QTEST_MAIN(tst_QColor)
|
||||
#include "tst_qcolor.moc"
|
||||
|
Loading…
Reference in New Issue
Block a user