Optimize unpremultiply on SSE4.1
Adds an SSE4.1-optimized version of qUnpremultiply and uses it in most of the drawing conversion methods. This gives a speed-up of a little over 2x. Change-Id: Ieb858a94ada1eb86d7af715ac1a100f1587f360d Reviewed-by: Gunnar Sletta <gunnar@sletta.org>
This commit is contained in:
parent
868201155f
commit
38aafe1a17
@ -45,6 +45,7 @@
|
||||
#include <private/qpaintengine_raster_p.h>
|
||||
#include <private/qpainter_p.h>
|
||||
#include <private/qdrawhelper_x86_p.h>
|
||||
#include <private/qdrawingprimitive_sse2_p.h>
|
||||
#include <private/qdrawhelper_neon_p.h>
|
||||
#if defined(QT_COMPILER_SUPPORTS_MIPS_DSP) || defined(QT_COMPILER_SUPPORTS_MIPS_DSPR2)
|
||||
#include <private/qdrawhelper_mips_dsp_p.h>
|
||||
@ -572,6 +573,18 @@ static const uint *QT_FASTCALL convertARGB32FromARGB32PM(uint *buffer, const uin
|
||||
return buffer;
|
||||
}
|
||||
|
||||
#if QT_COMPILER_SUPPORTS_HERE(SSE4_1)
|
||||
// Pixel-conversion routine built for the SSE4.1 target (QT_FUNCTION_TARGET).
// Writes qUnpremultiply_sse4(src[i]) into buffer[i] for each of the `count`
// pixels and returns `buffer`. Going by the name, this is the
// premultiplied-ARGB32 -> ARGB32 direction.
// The trailing QPixelLayout* and QRgb* parameters are unnamed and unused.
QT_FUNCTION_TARGET(SSE4_1)
|
||||
static const uint *QT_FASTCALL convertARGB32FromARGB32PM_sse4(uint *buffer, const uint *src, int count,
|
||||
const QPixelLayout *, const QRgb *)
|
||||
{
|
||||
for (int i = 0; i < count; ++i)
|
||||
buffer[i] = qUnpremultiply_sse4(src[i]); // one pixel per iteration
|
||||
return buffer;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
static const uint *QT_FASTCALL convertRGBA8888PMFromARGB32PM(uint *buffer, const uint *src, int count,
|
||||
const QPixelLayout *, const QRgb *)
|
||||
{
|
||||
@ -588,6 +601,17 @@ static const uint *QT_FASTCALL convertRGBA8888FromARGB32PM(uint *buffer, const u
|
||||
return buffer;
|
||||
}
|
||||
|
||||
#if QT_COMPILER_SUPPORTS_HERE(SSE4_1)
|
||||
// Pixel-conversion routine built for the SSE4.1 target (QT_FUNCTION_TARGET).
// For each of the `count` pixels, unpremultiplies src[i] with
// qUnpremultiply_sse4() and passes the result through ARGB2RGBA() before
// storing it in buffer[i]; returns `buffer`.
// NOTE(review): ARGB2RGBA is defined elsewhere; presumably it reorders the
// channels from ARGB to RGBA layout -- confirm against its definition.
// The trailing QPixelLayout* and QRgb* parameters are unnamed and unused.
QT_FUNCTION_TARGET(SSE4_1)
|
||||
static const uint *QT_FASTCALL convertRGBA8888FromARGB32PM_sse4(uint *buffer, const uint *src, int count,
|
||||
const QPixelLayout *, const QRgb *)
|
||||
{
|
||||
for (int i = 0; i < count; ++i)
|
||||
buffer[i] = ARGB2RGBA(qUnpremultiply_sse4(src[i])); // unpremultiply first, then reorder
|
||||
return buffer;
|
||||
}
|
||||
#endif
|
||||
|
||||
static const uint *QT_FASTCALL convertRGBXFromRGB32(uint *buffer, const uint *src, int count,
|
||||
const QPixelLayout *, const QRgb *)
|
||||
{
|
||||
@ -604,6 +628,17 @@ static const uint *QT_FASTCALL convertRGBXFromARGB32PM(uint *buffer, const uint
|
||||
return buffer;
|
||||
}
|
||||
|
||||
#if QT_COMPILER_SUPPORTS_HERE(SSE4_1)
|
||||
// Pixel-conversion routine built for the SSE4.1 target (QT_FUNCTION_TARGET).
// For each of the `count` pixels, unpremultiplies src[i] with
// qUnpremultiply_sse4(), forces the top byte of the result to 0xff, and passes
// it through ARGB2RGBA() before storing it in buffer[i]; returns `buffer`.
// NOTE(review): ARGB2RGBA is defined elsewhere; presumably it reorders the
// channels from ARGB to RGBA layout -- confirm against its definition.
// The trailing QPixelLayout* and QRgb* parameters are unnamed and unused.
QT_FUNCTION_TARGET(SSE4_1)
|
||||
static const uint *QT_FASTCALL convertRGBXFromARGB32PM_sse4(uint *buffer, const uint *src, int count,
|
||||
const QPixelLayout *, const QRgb *)
|
||||
{
|
||||
for (int i = 0; i < count; ++i)
|
||||
buffer[i] = ARGB2RGBA(0xff000000 | qUnpremultiply_sse4(src[i])); // 0xff000000 sets the top (alpha) byte to 255
|
||||
return buffer;
|
||||
}
|
||||
#endif
|
||||
|
||||
template<QtPixelOrder PixelOrder>
|
||||
static const uint *QT_FASTCALL convertA2RGB30PMToARGB32PM(uint *buffer, const uint *src, int count,
|
||||
const QPixelLayout *, const QRgb *)
|
||||
@ -6879,10 +6914,15 @@ void qInitDrawhelperAsm()
|
||||
}
|
||||
#endif // SSSE3
|
||||
|
||||
#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) && !defined(__SSE4_1__)
|
||||
#if QT_COMPILER_SUPPORTS_HERE(SSE4_1)
|
||||
if (qCpuHasFeature(SSE4_1)) {
|
||||
#if !defined(__SSE4_1__)
|
||||
qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_sse4;
|
||||
qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_sse4;
|
||||
#endif
|
||||
qPixelLayouts[QImage::Format_ARGB32].convertFromARGB32PM = convertARGB32FromARGB32PM_sse4;
|
||||
qPixelLayouts[QImage::Format_RGBA8888].convertFromARGB32PM = convertRGBA8888FromARGB32PM_sse4;
|
||||
qPixelLayouts[QImage::Format_RGBX8888].convertFromARGB32PM = convertRGBXFromARGB32PM_sse4;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -236,4 +236,26 @@ QT_END_NAMESPACE
|
||||
|
||||
#endif // __SSE2__
|
||||
|
||||
QT_BEGIN_NAMESPACE
|
||||
#if QT_COMPILER_SUPPORTS_HERE(SSE4_1)
|
||||
// SSE4.1 variant of qUnpremultiply() for a single QRgb pixel.
// The four bytes of `p` are widened to four 32-bit lanes, each lane is
// multiplied by a per-alpha factor looked up in qt_inv_premul_factor, rounded
// and shifted right by 16, and the lanes are packed back into one 32-bit value.
// The top lane is overwritten with the original alpha before packing, so the
// alpha byte of the result equals the alpha byte of `p`.
// NOTE(review): qt_inv_premul_factor is defined elsewhere; the >> 16 and the
// 0x8000 rounding term suggest it holds 16.16 fixed-point factors -- confirm.
QT_FUNCTION_TARGET(SSE4_1)
|
||||
inline QRgb qUnpremultiply_sse4(QRgb p)
|
||||
{
|
||||
const uint alpha = qAlpha(p);
|
||||
const uint invAlpha = qt_inv_premul_factor[alpha]; // table is indexed directly by the alpha value
|
||||
const __m128i via = _mm_set1_epi32(invAlpha); // same factor in all four 32-bit lanes
|
||||
const __m128i vr = _mm_set1_epi32(0x8000); // half of 1 << 16: rounding term for the shift below
|
||||
__m128i vl = _mm_unpacklo_epi8(_mm_cvtsi32_si128(p), _mm_setzero_si128()); // zero-extend the 4 bytes to 16 bits
|
||||
vl = _mm_unpacklo_epi16(vl, _mm_setzero_si128()); // zero-extend again: one byte of p per 32-bit lane
|
||||
vl = _mm_mullo_epi32(vl, via); // SSE4.1: low 32 bits of each lane product
|
||||
vl = _mm_add_epi32(vl, vr);
|
||||
vl = _mm_srai_epi32(vl, 16); // arithmetic shift right by 16
|
||||
vl = _mm_insert_epi32(vl, alpha, 3); // lane 3 (top byte of p) gets the unscaled alpha back
|
||||
vl = _mm_packus_epi32(vl, _mm_setzero_si128()); // SSE4.1: 32 -> 16 bits with unsigned saturation
|
||||
vl = _mm_packus_epi16(vl, _mm_setzero_si128()); // 16 -> 8 bits with unsigned saturation
|
||||
return _mm_cvtsi128_si32(vl); // low 32 bits hold the repacked pixel
|
||||
}
|
||||
#endif
|
||||
QT_END_NAMESPACE
|
||||
|
||||
#endif // QDRAWINGPRIMITIVE_SSE2_P_H
|
||||
|
@ -2,4 +2,4 @@ CONFIG += testcase
|
||||
CONFIG += parallel_test
|
||||
TARGET = tst_qcolor
|
||||
SOURCES += tst_qcolor.cpp
|
||||
QT += testlib
|
||||
QT += testlib gui-private core-private
|
||||
|
@ -38,6 +38,7 @@
|
||||
|
||||
#include <qcolor.h>
|
||||
#include <qdebug.h>
|
||||
#include <private/qdrawingprimitive_sse2_p.h>
|
||||
|
||||
class tst_QColor : public QObject
|
||||
{
|
||||
@ -102,6 +103,9 @@ private slots:
|
||||
|
||||
void achromaticHslHue();
|
||||
|
||||
void premultiply();
|
||||
void unpremultiply_sse4();
|
||||
|
||||
#ifdef Q_DEAD_CODE_FROM_QT4_X11
|
||||
void setallowX11ColorNames();
|
||||
#endif
|
||||
@ -1432,5 +1436,33 @@ void tst_QColor::setallowX11ColorNames()
|
||||
}
|
||||
#endif
|
||||
|
||||
// Round-trip test: for every alpha value 0..255 and every channel value c in
// 0..alpha, builds a pixel and checks that unpremultiplying and then
// premultiplying it again yields the original pixel.
void tst_QColor::premultiply()
|
||||
{
|
||||
// Tests that qPremultiply(qUnpremultiply(x)) returns x.
|
||||
for (uint a = 0; a < 256; a++) {
|
||||
for (uint c = 0; c <= a; c++) {
|
||||
QRgb p = qRgba(c, a-c, c, a); // both c and a-c stay within 0..a, so no channel exceeds alpha
|
||||
QCOMPARE(p, qPremultiply(qUnpremultiply(p)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Equivalence test between qUnpremultiply() and qUnpremultiply_sse4().
// The comparison only runs when the compiler supports SSE4.1 here and the
// running CPU reports the SSE4_1 feature; in every other case the test is
// skipped via QSKIP.
void tst_QColor::unpremultiply_sse4()
|
||||
{
|
||||
// Tests that qUnpremultiply_sse4 returns the same as qUnpremultiply.
|
||||
#if QT_COMPILER_SUPPORTS_HERE(SSE4_1)
|
||||
if (qCpuHasFeature(SSE4_1)) {
|
||||
for (uint a = 0; a < 256; a++) {
|
||||
for (uint c = 0; c <= a; c++) {
|
||||
QRgb p = qRgba(c, a-c, c, a); // same input set as premultiply(): channels never exceed alpha
|
||||
QCOMPARE(qUnpremultiply(p), qUnpremultiply_sse4(p));
|
||||
}
|
||||
}
|
||||
return; // comparison ran; do not fall through to the skip below
|
||||
}
|
||||
#endif
|
||||
QSKIP("SSE4 not supported on this CPU.");
|
||||
}
|
||||
|
||||
QTEST_MAIN(tst_QColor)
|
||||
#include "tst_qcolor.moc"
|
||||
|
Loading…
Reference in New Issue
Block a user