Optimize RGBA64 toArgb32

Adds NEON and SSE2 optimized toArgb32 functions.

Change-Id: Icfd280a353bfc2ac5d6983dc37aae5ed03e05ad5
Reviewed-by: Eirik Aavitsland <eirik.aavitsland@qt.io>
This commit is contained in:
Allan Sandfeld Jensen 2016-11-23 15:25:37 +01:00
parent ce14439ecb
commit 5d35eea3e1
2 changed files with 52 additions and 3 deletions

View File

@ -1313,7 +1313,7 @@ static void QT_FASTCALL destStore(QRasterBuffer *rasterBuffer, int x, int y, con
static void QT_FASTCALL convertFromRgb64(uint *dest, const QRgba64 *src, int length) static void QT_FASTCALL convertFromRgb64(uint *dest, const QRgba64 *src, int length)
{ {
for (int i = 0; i < length; ++i) { for (int i = 0; i < length; ++i) {
dest[i] = src[i].toArgb32(); dest[i] = toArgb32(src[i]);
} }
} }
@ -1404,7 +1404,7 @@ static void QT_FASTCALL destStore64ARGB32(QRasterBuffer *rasterBuffer, int x, in
{ {
uint *dest = (uint*)rasterBuffer->scanLine(y) + x; uint *dest = (uint*)rasterBuffer->scanLine(y) + x;
for (int i = 0; i < length; ++i) { for (int i = 0; i < length; ++i) {
dest[i] = buffer[i].unpremultiplied().toArgb32(); dest[i] = toArgb32(buffer[i].unpremultiplied());
} }
} }
@ -1412,7 +1412,7 @@ static void QT_FASTCALL destStore64RGBA8888(QRasterBuffer *rasterBuffer, int x,
{ {
uint *dest = (uint*)rasterBuffer->scanLine(y) + x; uint *dest = (uint*)rasterBuffer->scanLine(y) + x;
for (int i = 0; i < length; ++i) { for (int i = 0; i < length; ++i) {
dest[i] = ARGB2RGBA(buffer[i].unpremultiplied().toArgb32()); dest[i] = toRgba8888(buffer[i].unpremultiplied());
} }
} }

View File

@ -185,6 +185,55 @@ inline QRgba64 addWithSaturation(QRgba64 a, QRgba64 b)
qMin(a.alpha() + b.alpha(), 65535)); qMin(a.alpha() + b.alpha(), 65535));
} }
#if defined __SSE2__
// Narrows the four 16-bit values held in the low 64 bits of \a v to 8 bits
// each and returns them packed into one 32-bit value. Lane order is kept:
// 16-bit lane 0 ends up in the least significant byte of the result. No
// channel reordering happens here; callers shuffle beforehand if they need to.
Q_ALWAYS_INLINE uint toArgb32(__m128i v)
{
    // Zero-extend the four low 16-bit lanes to 32 bits so the arithmetic
    // below has headroom.
    const __m128i wide = _mm_unpacklo_epi16(v, _mm_setzero_si128());
    // Per lane: (x + 128 - ((x + 128) >> 8)) >> 8, which maps 0..65535
    // onto 0..255 with rounding.
    const __m128i biased = _mm_add_epi32(wide, _mm_set1_epi32(128));
    const __m128i reduced = _mm_sub_epi32(biased, _mm_srli_epi32(biased, 8));
    const __m128i scaled = _mm_srli_epi32(reduced, 8);
    // Pack 32 -> 16 -> 8 bits; every value is already within 0..255, so the
    // saturating packs do not alter them.
    const __m128i words = _mm_packs_epi32(scaled, scaled);
    const __m128i bytes = _mm_packus_epi16(words, words);
    return _mm_cvtsi128_si32(bytes);
}
#elif defined __ARM_NEON__
// Narrows the four 16-bit lanes of \a v to 8 bits each and returns them as
// 32-bit lane 0 of the narrowed vector. Lane order is kept; no channel
// reordering happens here, callers shuffle beforehand if they need to.
Q_ALWAYS_INLINE uint toArgb32(uint16x4_t v)
{
    // Per lane: t = x - ((x + 128) >> 8), then (t + 128) >> 8; vrshr_n is
    // the rounding right shift, so both steps round.
    const uint16x4_t reduced = vsub_u16(v, vrshr_n_u16(v, 8));
    const uint16x4_t scaled = vrshr_n_u16(reduced, 8);
    // vmovn needs a 128-bit input, so the 64-bit vector is duplicated; only
    // the first four narrowed bytes are read back.
    const uint8x8_t narrowed = vmovn_u16(vcombine_u16(scaled, scaled));
    return vget_lane_u32(vreinterpret_u32_u8(narrowed), 0);
}
#endif
// Converts a 16-bit-per-channel QRgba64 to a packed 8-bit-per-channel ARGB32
// value. The SIMD paths load the four 16-bit channels, permute them into the
// order ARGB32 expects, and hand them to the vector toArgb32() overload for
// the 16 -> 8 bit narrowing. Without SSE2 or NEON this falls back to
// QRgba64::toArgb32().
inline uint toArgb32(QRgba64 rgba64)
{
#if defined __SSE2__
    __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&rgba64));
    // Swap 16-bit lanes 0 and 2 and keep lanes 1 and 3 in place
    // (presumably red <-> blue; confirm against QRgba64's channel layout).
    v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3, 0, 1, 2));
    return toArgb32(v);
#elif defined __ARM_NEON__
    uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64)));
#if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
    // Same permutation as the SSE2 branch: swap 16-bit lanes 0 and 2. A
    // vext rotation cannot express that swap, so use a byte table lookup.
    const uint8x8_t shuffleMask = { 4, 5, 2, 3, 0, 1, 6, 7 };
    v = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(v), shuffleMask));
#else
    // NOTE(review): assumes the channels sit in memory as R, G, B, A on
    // big-endian targets, so rotating the last lane to the front yields
    // A, R, G, B; verify on a big-endian NEON build.
    v = vext_u16(v, v, 3);
#endif
    return toArgb32(v);
#else
    return rgba64.toArgb32();
#endif
}
// Converts a 16-bit-per-channel QRgba64 to a packed 8-bit-per-channel value
// in RGBA8888 form. The SIMD paths narrow the channels in their stored order
// with no shuffle; the generic path converts to ARGB32 first and then
// reorders with ARGB2RGBA().
inline uint toRgba8888(QRgba64 rgba64)
{
#if defined __SSE2__
    // Load the 64-bit color into the low half of the register and narrow it
    // as-is.
    const __m128i channels = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&rgba64));
    return toArgb32(channels);
#elif defined __ARM_NEON__
    const uint64_t *raw = reinterpret_cast<const uint64_t *>(&rgba64);
    const uint16x4_t channels = vreinterpret_u16_u64(vld1_u64(raw));
    return toArgb32(channels);
#else
    const uint argb = toArgb32(rgba64);
    return ARGB2RGBA(argb);
#endif
}
#if defined(__SSE2__) #if defined(__SSE2__)
Q_ALWAYS_INLINE __m128i addWithSaturation(__m128i a, __m128i b) Q_ALWAYS_INLINE __m128i addWithSaturation(__m128i a, __m128i b)
{ {