Optimize ARGB32->RGBA64PM better
This conversion is critical for ARGB32 painting, and no compiler optimized the premultiplication efficiently.
Change-Id: Iee137c2f7020246478d09e880a7a1bf2ed3c6fd4
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
parent
1f2c23a7ca
commit
4e6a42cdd0
@ -1086,18 +1086,8 @@ static const QRgba64 *QT_FASTCALL fetchRGB32ToRGB64(QRgba64 *buffer, const uchar
|
||||
// Converts `count` ARGB32 pixels read from `src` into premultiplied QRgba64
// values stored in `buffer`, and returns `buffer`. The QVector<QRgb> and
// QDitherInfo arguments are not used.
static const QRgba64 *QT_FASTCALL convertARGB32ToRGBA64PM(QRgba64 *buffer, const uint *src, int count,
                                                          const QVector<QRgb> *, QDitherInfo *)
{
#if defined(__SSE2__) || defined(__ARM_NEON__)
    // Let the vector helper fill the buffer first, then premultiply every
    // converted value in place. SSE2 takes precedence when both are defined.
#  if defined(__SSE2__)
    qConvertARGB32PMToRGBA64PM_sse2<false, false>(buffer, src, count);
#  else
    qConvertARGB32PMToRGBA64PM_neon<false, false>(buffer, src, count);
#  endif
    for (int n = 0; n < count; ++n) {
        QRgba64 converted = buffer[n];
        buffer[n] = converted.premultiplied();
    }
#else
    // Scalar path: convert and premultiply one pixel at a time.
    for (int n = 0; n < count; ++n) {
        QRgba64 converted = QRgba64::fromArgb32(src[n]);
        buffer[n] = converted.premultiplied();
    }
#endif
    return buffer;
}
|
||||
|
||||
@ -1149,18 +1139,8 @@ static const QRgba64 *QT_FASTCALL fetchRGBA64ToRGBA64PM(QRgba64 *buffer, const u
|
||||
// Converts `count` RGBA8888 pixels read from `src` into premultiplied QRgba64
// values stored in `buffer`, and returns `buffer`. The QVector<QRgb> and
// QDitherInfo arguments are not used.
static const QRgba64 *QT_FASTCALL convertRGBA8888ToRGBA64PM(QRgba64 *buffer, const uint *src, int count,
                                                            const QVector<QRgb> *, QDitherInfo *)
{
#if defined(__SSE2__) || defined(__ARM_NEON__)
    // Let the vector helper fill the buffer first, then premultiply every
    // converted value in place. SSE2 takes precedence when both are defined.
#  if defined(__SSE2__)
    qConvertARGB32PMToRGBA64PM_sse2<true, false>(buffer, src, count);
#  else
    qConvertARGB32PMToRGBA64PM_neon<true, false>(buffer, src, count);
#  endif
    for (int n = 0; n < count; ++n) {
        QRgba64 converted = buffer[n];
        buffer[n] = converted.premultiplied();
    }
#else
    // Scalar path: reorder to ARGB, convert and premultiply one pixel at a time.
    for (int n = 0; n < count; ++n) {
        QRgba64 converted = QRgba64::fromArgb32(RGBA2ARGB(src[n]));
        buffer[n] = converted.premultiplied();
    }
#endif
    return buffer;
}
|
||||
|
||||
@ -6514,6 +6494,14 @@ static void qInitDrawhelperFunctions()
|
||||
const QVector<QRgb> *, QDitherInfo *);
|
||||
extern const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_sse4(uint *buffer, const uchar *src, int index, int count,
|
||||
const QVector<QRgb> *, QDitherInfo *);
|
||||
extern const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_sse4(QRgba64 *buffer, const uint *src, int count,
|
||||
const QVector<QRgb> *, QDitherInfo *);
|
||||
extern const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_sse4(QRgba64 *buffer, const uint *src, int count,
|
||||
const QVector<QRgb> *, QDitherInfo *);
|
||||
extern const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_sse4(QRgba64 *buffer, const uchar *src, int index, int count,
|
||||
const QVector<QRgb> *, QDitherInfo *);
|
||||
extern const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_sse4(QRgba64 *buffer, const uchar *src, int index, int count,
|
||||
const QVector<QRgb> *, QDitherInfo *);
|
||||
extern void QT_FASTCALL storeARGB32FromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
|
||||
const QVector<QRgb> *, QDitherInfo *);
|
||||
extern void QT_FASTCALL storeRGBA8888FromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
|
||||
@ -6530,8 +6518,14 @@ static void qInitDrawhelperFunctions()
|
||||
qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_sse4;
|
||||
qPixelLayouts[QImage::Format_RGBA8888].fetchToARGB32PM = fetchRGBA8888ToARGB32PM_sse4;
|
||||
qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_sse4;
|
||||
qPixelLayouts[QImage::Format_ARGB32].fetchToRGBA64PM = fetchARGB32ToRGBA64PM_sse4;
|
||||
qPixelLayouts[QImage::Format_ARGB32].convertToRGBA64PM = convertARGB32ToRGBA64PM_sse4;
|
||||
qPixelLayouts[QImage::Format_ARGB32].storeFromARGB32PM = storeARGB32FromARGB32PM_sse4;
|
||||
qPixelLayouts[QImage::Format_RGBA8888].fetchToRGBA64PM = fetchRGBA8888ToRGBA64PM_sse4;
|
||||
qPixelLayouts[QImage::Format_RGBA8888].convertToRGBA64PM = convertRGBA8888ToRGBA64PM_sse4;
|
||||
qPixelLayouts[QImage::Format_RGBA8888].storeFromARGB32PM = storeRGBA8888FromARGB32PM_sse4;
|
||||
qPixelLayouts[QImage::Format_RGBX8888].fetchToRGBA64PM = fetchRGBA8888ToRGBA64PM_sse4;
|
||||
qPixelLayouts[QImage::Format_RGBX8888].convertToRGBA64PM = convertRGBA8888ToRGBA64PM_sse4;
|
||||
qPixelLayouts[QImage::Format_RGBX8888].storeFromARGB32PM = storeRGBXFromARGB32PM_sse4;
|
||||
qPixelLayouts[QImage::Format_A2BGR30_Premultiplied].storeFromARGB32PM = storeA2RGB30PMFromARGB32PM_sse4<PixelOrderBGR>;
|
||||
qPixelLayouts[QImage::Format_A2RGB30_Premultiplied].storeFromARGB32PM = storeA2RGB30PMFromARGB32PM_sse4<PixelOrderRGB>;
|
||||
@ -6620,6 +6614,14 @@ static void qInitDrawhelperFunctions()
|
||||
const QVector<QRgb> *, QDitherInfo *);
|
||||
extern const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_neon(uint *buffer, const uchar *src, int index, int count,
|
||||
const QVector<QRgb> *, QDitherInfo *);
|
||||
extern const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uint *src, int count,
|
||||
const QVector<QRgb> *, QDitherInfo *);
|
||||
extern const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_neon(QRgba64 *buffer, const uint *src, int count,
|
||||
const QVector<QRgb> *, QDitherInfo *);
|
||||
extern const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uchar *src, int index, int count,
|
||||
const QVector<QRgb> *, QDitherInfo *);
|
||||
extern const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_neon(QRgba64 *buffer, const uchar *src, int index, int count,
|
||||
const QVector<QRgb> *, QDitherInfo *);
|
||||
extern void QT_FASTCALL storeARGB32FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
|
||||
const QVector<QRgb> *, QDitherInfo *);
|
||||
extern void QT_FASTCALL storeRGBA8888FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
|
||||
@ -6629,10 +6631,16 @@ static void qInitDrawhelperFunctions()
|
||||
qPixelLayouts[QImage::Format_ARGB32].fetchToARGB32PM = fetchARGB32ToARGB32PM_neon;
|
||||
qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_neon;
|
||||
qPixelLayouts[QImage::Format_ARGB32].storeFromARGB32PM = storeARGB32FromARGB32PM_neon;
|
||||
qPixelLayouts[QImage::Format_ARGB32].fetchToRGBA64PM = fetchARGB32ToRGBA64PM_neon;
|
||||
qPixelLayouts[QImage::Format_ARGB32].convertToRGBA64PM = convertARGB32ToRGBA64PM_neon;
|
||||
qPixelLayouts[QImage::Format_RGBA8888].fetchToARGB32PM = fetchRGBA8888ToARGB32PM_neon;
|
||||
qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_neon;
|
||||
qPixelLayouts[QImage::Format_RGBA8888].storeFromARGB32PM = storeRGBA8888FromARGB32PM_neon;
|
||||
qPixelLayouts[QImage::Format_RGBA8888].fetchToRGBA64PM = fetchRGBA8888ToRGBA64PM_neon;
|
||||
qPixelLayouts[QImage::Format_RGBA8888].convertToRGBA64PM = convertRGBA8888ToRGBA64PM_neon;
|
||||
qPixelLayouts[QImage::Format_RGBX8888].storeFromARGB32PM = storeRGBXFromARGB32PM_neon;
|
||||
qPixelLayouts[QImage::Format_RGBX8888].fetchToRGBA64PM = fetchRGBA8888ToRGBA64PM_neon;
|
||||
qPixelLayouts[QImage::Format_RGBX8888].convertToRGBA64PM = convertRGBA8888ToRGBA64PM_neon;
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_PIXMAN_DRAWHELPERS)
|
||||
|
@ -1149,6 +1149,72 @@ static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int
|
||||
}
|
||||
}
|
||||
|
||||
template<bool RGBA>
|
||||
static inline void convertARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uint *src, int count)
|
||||
{
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7};
|
||||
const uint64x2_t blendMask = vdupq_n_u64(Q_UINT64_C(0xffff000000000000));
|
||||
|
||||
int i = 0;
|
||||
for (; i < count-3; i += 4) {
|
||||
uint32x4_t vs32 = vld1q_u32(src + i);
|
||||
uint32x4_t alphaVector = vshrq_n_u32(vs32, 24);
|
||||
#if defined(Q_PROCESSOR_ARM_64)
|
||||
uint32_t alphaSum = vaddvq_u32(alphaVector);
|
||||
#else
|
||||
// no vaddvq_u32
|
||||
uint32x2_t tmp = vpadd_u32(vget_low_u32(alphaVector), vget_high_u32(alphaVector));
|
||||
uint32_t alphaSum = vget_lane_u32(vpadd_u32(tmp, tmp), 0);
|
||||
#endif
|
||||
if (alphaSum) {
|
||||
if (!RGBA)
|
||||
vs32 = vrgba2argb(vs32);
|
||||
const uint8x16_t vs8 = vreinterpretq_u8_u32(vs32);
|
||||
const uint8x16x2_t v = vzipq_u8(vs8, vs8);
|
||||
if (alphaSum != 255 * 4) {
|
||||
const uint8x8_t s1 = vreinterpret_u8_u32(vget_low_u32(vs32));
|
||||
const uint8x8_t s2 = vreinterpret_u8_u32(vget_high_u32(vs32));
|
||||
const uint8x8_t alpha1 = vtbl1_u8(s1, shuffleMask);
|
||||
const uint8x8_t alpha2 = vtbl1_u8(s2, shuffleMask);
|
||||
uint16x8_t src1 = vmull_u8(s1, alpha1);
|
||||
uint16x8_t src2 = vmull_u8(s2, alpha2);
|
||||
// convert from 0->(255x255) to 0->(255x257)
|
||||
src1 = vsraq_n_u16(src1, src1, 7);
|
||||
src2 = vsraq_n_u16(src2, src2, 7);
|
||||
|
||||
// now restore alpha from the trivial conversion
|
||||
const uint64x2_t d1 = vbslq_u64(blendMask, vreinterpretq_u64_u8(v.val[0]), vreinterpretq_u64_u16(src1));
|
||||
const uint64x2_t d2 = vbslq_u64(blendMask, vreinterpretq_u64_u8(v.val[1]), vreinterpretq_u64_u16(src2));
|
||||
|
||||
vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u64(d1));
|
||||
buffer += 2;
|
||||
vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u64(d2));
|
||||
buffer += 2;
|
||||
} else {
|
||||
vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u8(v.val[0]));
|
||||
buffer += 2;
|
||||
vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u8(v.val[1]));
|
||||
buffer += 2;
|
||||
}
|
||||
} else {
|
||||
vst1q_u16((uint16_t *)buffer, vdupq_n_u16(0));
|
||||
buffer += 2;
|
||||
vst1q_u16((uint16_t *)buffer, vdupq_n_u16(0));
|
||||
buffer += 2;
|
||||
}
|
||||
}
|
||||
|
||||
SIMD_EPILOGUE(i, count, 3) {
|
||||
uint s = src[i];
|
||||
if (RGBA)
|
||||
s = RGBA2ARGB(s);
|
||||
*buffer++ = QRgba64::fromArgb32(s).premultiplied();
|
||||
}
|
||||
}
|
||||
|
||||
static inline float32x4_t reciprocal_mul_ps(float32x4_t a, float mul)
|
||||
{
|
||||
float32x4_t ia = vrecpeq_f32(a); // estimate 1/a
|
||||
@ -1258,6 +1324,34 @@ const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_neon(uint *buffer, const uchar *
|
||||
return buffer;
|
||||
}
|
||||
|
||||
// NEON entry point for ARGB32 -> premultiplied RGBA64: runs the templated
// worker with RGBA = false and returns `buffer`. The QVector<QRgb> and
// QDitherInfo arguments are not used.
const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uint *src, int count,
                                                         const QVector<QRgb> *, QDitherInfo *)
{
    const bool sourceIsRGBA = false;
    convertARGB32ToRGBA64PM_neon<sourceIsRGBA>(buffer, src, count);
    return buffer;
}
|
||||
|
||||
// NEON entry point for RGBA8888 -> premultiplied RGBA64: runs the templated
// worker with RGBA = true and returns `buffer`. The QVector<QRgb> and
// QDitherInfo arguments are not used.
const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_neon(QRgba64 *buffer, const uint *src, int count,
                                                           const QVector<QRgb> *, QDitherInfo *)
{
    const bool sourceIsRGBA = true;
    convertARGB32ToRGBA64PM_neon<sourceIsRGBA>(buffer, src, count);
    return buffer;
}
|
||||
|
||||
// NEON fetch for ARGB32 -> premultiplied RGBA64: treats `src` as an array of
// 32-bit pixels, converts `count` of them starting at pixel `index` into
// `buffer`, and returns `buffer`. The QVector<QRgb> and QDitherInfo arguments
// are not used.
const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uchar *src, int index, int count,
                                                      const QVector<QRgb> *, QDitherInfo *)
{
    const uint *const firstPixel = reinterpret_cast<const uint *>(src) + index;
    convertARGB32ToRGBA64PM_neon<false>(buffer, firstPixel, count);
    return buffer;
}
|
||||
|
||||
// NEON fetch for RGBA8888 -> premultiplied RGBA64: treats `src` as an array of
// 32-bit pixels, converts `count` of them starting at pixel `index` into
// `buffer`, and returns `buffer`. The QVector<QRgb> and QDitherInfo arguments
// are not used.
const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_neon(QRgba64 *buffer, const uchar *src, int index, int count,
                                                        const QVector<QRgb> *, QDitherInfo *)
{
    const uint *const firstPixel = reinterpret_cast<const uint *>(src) + index;
    convertARGB32ToRGBA64PM_neon<true>(buffer, firstPixel, count);
    return buffer;
}
|
||||
|
||||
void QT_FASTCALL storeRGB32FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
|
||||
const QVector<QRgb> *, QDitherInfo *)
|
||||
{
|
||||
|
@ -94,6 +94,55 @@ static void convertARGBToARGB32PM_sse4(uint *buffer, const uint *src, int count)
|
||||
}
|
||||
}
|
||||
|
||||
template<bool RGBA>
|
||||
static void convertARGBToRGBA64PM_sse4(QRgba64 *buffer, const uint *src, int count)
|
||||
{
|
||||
int i = 0;
|
||||
const __m128i alphaMask = _mm_set1_epi32(0xff000000);
|
||||
const __m128i rgbaMask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
|
||||
const __m128i shuffleMask = _mm_setr_epi8(6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15);
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
|
||||
for (; i < count - 3; i += 4) {
|
||||
__m128i srcVector = _mm_loadu_si128((const __m128i *)&src[i]);
|
||||
if (!_mm_testz_si128(srcVector, alphaMask)) {
|
||||
if (!_mm_testc_si128(srcVector, alphaMask)) {
|
||||
if (!RGBA)
|
||||
srcVector = _mm_shuffle_epi8(srcVector, rgbaMask);
|
||||
__m128i src1 = _mm_unpacklo_epi8(srcVector, zero);
|
||||
__m128i src2 = _mm_unpackhi_epi8(srcVector, zero);
|
||||
__m128i alpha1 = _mm_shuffle_epi8(src1, shuffleMask);
|
||||
__m128i alpha2 = _mm_shuffle_epi8(src2, shuffleMask);
|
||||
src1 = _mm_mullo_epi16(src1, alpha1);
|
||||
src2 = _mm_mullo_epi16(src2, alpha2);
|
||||
alpha1 = _mm_unpacklo_epi8(srcVector, srcVector);
|
||||
alpha2 = _mm_unpackhi_epi8(srcVector, srcVector);
|
||||
src1 = _mm_add_epi16(src1, _mm_srli_epi16(src1, 7));
|
||||
src2 = _mm_add_epi16(src2, _mm_srli_epi16(src2, 7));
|
||||
src1 = _mm_blend_epi16(src1, alpha1, 0x88);
|
||||
src2 = _mm_blend_epi16(src2, alpha2, 0x88);
|
||||
_mm_storeu_si128((__m128i *)&buffer[i], src1);
|
||||
_mm_storeu_si128((__m128i *)&buffer[i + 2], src2);
|
||||
} else {
|
||||
if (!RGBA)
|
||||
srcVector = _mm_shuffle_epi8(srcVector, rgbaMask);
|
||||
const __m128i src1 = _mm_unpacklo_epi8(srcVector, srcVector);
|
||||
const __m128i src2 = _mm_unpackhi_epi8(srcVector, srcVector);
|
||||
_mm_storeu_si128((__m128i *)&buffer[i], src1);
|
||||
_mm_storeu_si128((__m128i *)&buffer[i + 2], src2);
|
||||
}
|
||||
} else {
|
||||
_mm_storeu_si128((__m128i *)&buffer[i], zero);
|
||||
_mm_storeu_si128((__m128i *)&buffer[i + 2], zero);
|
||||
}
|
||||
}
|
||||
|
||||
SIMD_EPILOGUE(i, count, 3) {
|
||||
const uint s = RGBA ? RGBA2ARGB(src[i]) : src[i];
|
||||
buffer[i] = QRgba64::fromArgb32(s).premultiplied();
|
||||
}
|
||||
}
|
||||
|
||||
static inline __m128 Q_DECL_VECTORCALL reciprocal_mul_ps(__m128 a, float mul)
|
||||
{
|
||||
__m128 ia = _mm_rcp_ps(a); // Approximate 1/a
|
||||
@ -269,6 +318,20 @@ void QT_FASTCALL convertRGBA8888ToARGB32PM_sse4(uint *buffer, int count, const Q
|
||||
convertARGBToARGB32PM_sse4<true>(buffer, buffer, count);
|
||||
}
|
||||
|
||||
// SSE4.1 entry point for ARGB32 -> premultiplied RGBA64: runs the templated
// worker with RGBA = false and returns `buffer`. The QVector<QRgb> and
// QDitherInfo arguments are not used.
const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_sse4(QRgba64 *buffer, const uint *src, int count,
                                                         const QVector<QRgb> *, QDitherInfo *)
{
    const bool sourceIsRGBA = false;
    convertARGBToRGBA64PM_sse4<sourceIsRGBA>(buffer, src, count);
    return buffer;
}
|
||||
|
||||
// SSE4.1 entry point for RGBA8888 -> premultiplied RGBA64: runs the templated
// worker with RGBA = true and returns `buffer`. The QVector<QRgb> and
// QDitherInfo arguments are not used.
const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_sse4(QRgba64 *buffer, const uint *src, int count,
                                                           const QVector<QRgb> *, QDitherInfo *)
{
    const bool sourceIsRGBA = true;
    convertARGBToRGBA64PM_sse4<sourceIsRGBA>(buffer, src, count);
    return buffer;
}
|
||||
|
||||
const uint *QT_FASTCALL fetchARGB32ToARGB32PM_sse4(uint *buffer, const uchar *src, int index, int count,
|
||||
const QVector<QRgb> *, QDitherInfo *)
|
||||
{
|
||||
@ -283,6 +346,20 @@ const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_sse4(uint *buffer, const uchar *
|
||||
return buffer;
|
||||
}
|
||||
|
||||
// SSE4.1 fetch for ARGB32 -> premultiplied RGBA64: treats `src` as an array of
// 32-bit pixels, converts `count` of them starting at pixel `index` into
// `buffer`, and returns `buffer`. The QVector<QRgb> and QDitherInfo arguments
// are not used.
const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_sse4(QRgba64 *buffer, const uchar *src, int index, int count,
                                                      const QVector<QRgb> *, QDitherInfo *)
{
    const uint *const firstPixel = reinterpret_cast<const uint *>(src) + index;
    convertARGBToRGBA64PM_sse4<false>(buffer, firstPixel, count);
    return buffer;
}
|
||||
|
||||
// SSE4.1 fetch for RGBA8888 -> premultiplied RGBA64: treats `src` as an array
// of 32-bit pixels, converts `count` of them starting at pixel `index` into
// `buffer`, and returns `buffer`. The QVector<QRgb> and QDitherInfo arguments
// are not used.
const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_sse4(QRgba64 *buffer, const uchar *src, int index, int count,
                                                        const QVector<QRgb> *, QDitherInfo *)
{
    const uint *const firstPixel = reinterpret_cast<const uint *>(src) + index;
    convertARGBToRGBA64PM_sse4<true>(buffer, firstPixel, count);
    return buffer;
}
|
||||
|
||||
void QT_FASTCALL storeRGB32FromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
|
||||
const QVector<QRgb> *, QDitherInfo *)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user