Minor improvements in qrgba64_p.h

Adds SIMD acceleration for the blend_pixel and raw interpolate methods,
and cleans up other SIMD code.

Gives minor speedups in text rendering and various fallbacks.

Change-Id: Ib0ad8b408450e4e73f3c1d50e9caaed0098acb94
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Allan Sandfeld Jensen 2020-08-23 09:20:01 +02:00
parent 53b11882c8
commit d16ac88cb0


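The QRgba64-typed helpers this change adds all follow the same shape: load the 64-bit value into the low half of an SSE2 (or NEON) register, run the existing vector helper, and store the low 64 bits back, keeping the old scalar code as the fallback. A minimal standalone sketch of that round-trip, using uint64_t in place of QRgba64 and a made-up per-channel operation (halving); this assumes SSE2 and is illustrative only, not part of the patch:

#include <emmintrin.h>   // SSE2 intrinsics
#include <cstdint>

// Illustrative only: halve every 16-bit channel of a packed RGBA64 value.
// QRgba64 stores four contiguous quint16s, so the real helpers can
// reinterpret_cast the struct exactly the way uint64_t is reinterpreted here.
static inline uint64_t halveChannels(uint64_t rgba64)
{
    const __m128i v  = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&rgba64));
    const __m128i vr = _mm_srli_epi16(v, 1);  // shift each 16-bit lane right by one
    uint64_t r;
    _mm_storel_epi64(reinterpret_cast<__m128i *>(&r), vr);
    return r;
}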
@@ -1,6 +1,6 @@
/****************************************************************************
**
** Copyright (C) 2016 The Qt Company Ltd.
** Copyright (C) 2020 The Qt Company Ltd.
** Contact: https://www.qt.io/licensing/
**
** This file is part of the QtGui module of the Qt Toolkit.
@@ -64,15 +64,7 @@ inline QRgba64 combineAlpha256(QRgba64 rgba64, uint alpha256)
return QRgba64::fromRgba64(rgba64.red(), rgba64.green(), rgba64.blue(), (rgba64.alpha() * alpha256) >> 8);
}
inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535)
{
return QRgba64::fromRgba64(qt_div_65535(rgba64.red() * alpha65535),
qt_div_65535(rgba64.green() * alpha65535),
qt_div_65535(rgba64.blue() * alpha65535),
qt_div_65535(rgba64.alpha() * alpha65535));
}
#ifdef __SSE2__
#if defined(__SSE2__)
static inline __m128i Q_DECL_VECTORCALL multiplyAlpha65535(__m128i rgba64, __m128i va)
{
__m128i vs = rgba64;
@@ -80,7 +72,7 @@ static inline __m128i Q_DECL_VECTORCALL multiplyAlpha65535(__m128i rgba64, __m12
vs = _mm_add_epi32(vs, _mm_srli_epi32(vs, 16));
vs = _mm_add_epi32(vs, _mm_set1_epi32(0x8000));
vs = _mm_srai_epi32(vs, 16);
vs = _mm_packs_epi32(vs, _mm_setzero_si128());
vs = _mm_packs_epi32(vs, vs);
return vs;
}
static inline __m128i Q_DECL_VECTORCALL multiplyAlpha65535(__m128i rgba64, uint alpha65535)
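The multiplyAlpha65535 vector helper above (and rgbBlend further down) relies on the standard trick for dividing a 32-bit product by 65535 with rounding but without an actual divide: add the high word of the product back in, add the 0x8000 rounding bias, and keep the high word. A scalar sketch of the same computation with a small self-check against a real rounded division; this is hypothetical standalone code, not part of the patch:

#include <cstdint>
#include <cassert>
#include <initializer_list>

// (x + (x >> 16) + 0x8000) >> 16 approximates a rounded x / 65535; the vector
// code performs exactly this per 32-bit lane before packing back to 16 bits.
static inline uint32_t div65535(uint32_t x)   // x = channel * alpha, both <= 65535
{
    return (x + (x >> 16) + 0x8000u) >> 16;
}

int main()
{
    // Spot-check a few channel/alpha pairs against a real rounded divide.
    for (uint32_t c : {0u, 1u, 255u, 32768u, 65535u})
        for (uint32_t a : {0u, 1u, 257u, 32768u, 65535u})
            assert(div65535(c * a) == (c * a + 32767u) / 65535u);
    return 0;
}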
@@ -103,6 +95,28 @@ static inline uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint alpha65535)
}
#endif
static inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535)
{
#if defined(__SSE2__)
const __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&rgba64));
const __m128i vr = multiplyAlpha65535(v, alpha65535);
QRgba64 r;
_mm_storel_epi64(reinterpret_cast<__m128i *>(&r), vr);
return r;
#elif defined(__ARM_NEON__)
const uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64)));
const uint16x4_t vr = multiplyAlpha65535(v, alpha65535);
QRgba64 r;
vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr));
return r;
#else
return QRgba64::fromRgba64(qt_div_65535(rgba64.red() * alpha65535),
qt_div_65535(rgba64.green() * alpha65535),
qt_div_65535(rgba64.blue() * alpha65535),
qt_div_65535(rgba64.alpha() * alpha65535));
#endif
}
template<typename T>
static inline T Q_DECL_VECTORCALL multiplyAlpha255(T rgba64, uint alpha255)
{
@@ -116,15 +130,10 @@ static inline T Q_DECL_VECTORCALL multiplyAlpha255(T rgba64, uint alpha255)
#endif
}
inline QRgba64 interpolate255(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
{
return QRgba64::fromRgba64(multiplyAlpha255(x, alpha1) + multiplyAlpha255(y, alpha2));
}
#if defined __SSE2__
static inline __m128i Q_DECL_VECTORCALL interpolate255(__m128i x, uint alpha1, __m128i y, uint alpha2)
{
return _mm_add_epi32(multiplyAlpha255(x, alpha1), multiplyAlpha255(y, alpha2));
return _mm_add_epi16(multiplyAlpha255(x, alpha1), multiplyAlpha255(y, alpha2));
}
#endif
@@ -135,20 +144,36 @@ inline uint16x4_t interpolate255(uint16x4_t x, uint alpha1, uint16x4_t y, uint a
}
#endif
inline QRgba64 interpolate65535(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
static inline QRgba64 interpolate255(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
{
return QRgba64::fromRgba64(multiplyAlpha65535(x, alpha1) + multiplyAlpha65535(y, alpha2));
#if defined(__SSE2__)
const __m128i vx = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&x));
const __m128i vy = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&y));
const __m128i vr = interpolate255(vx, alpha1, vy, alpha2);
QRgba64 r;
_mm_storel_epi64(reinterpret_cast<__m128i *>(&r), vr);
return r;
#elif defined(__ARM_NEON__)
const uint16x4_t vx = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&x)));
const uint16x4_t vy = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&y)));
const uint16x4_t vr = interpolate255(vx, alpha1, vy, alpha2);
QRgba64 r;
vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr));
return r;
#else
return QRgba64::fromRgba64(multiplyAlpha255(x, alpha1) + multiplyAlpha255(y, alpha2));
#endif
}
#if defined __SSE2__
static inline __m128i Q_DECL_VECTORCALL interpolate65535(__m128i x, uint alpha1, __m128i y, uint alpha2)
{
return _mm_add_epi32(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
return _mm_add_epi16(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
}
// alpha2 below is const-ref because otherwise MSVC2015 complains that it can't 16-byte align the argument.
static inline __m128i Q_DECL_VECTORCALL interpolate65535(__m128i x, __m128i alpha1, __m128i y, const __m128i &alpha2)
static inline __m128i Q_DECL_VECTORCALL interpolate65535(__m128i x, __m128i alpha1, __m128i y, __m128i alpha2)
{
return _mm_add_epi32(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
return _mm_add_epi16(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
}
#endif
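interpolate65535 above is the 16-bit-per-channel weighted blend: each channel of the result is x*alpha1/65535 + y*alpha2/65535. Callers generally keep alpha1 + alpha2 <= 65535 (typically alpha2 = 65535 - alpha1), so the per-channel sum still fits in a quint16 and a 16-bit lane add suffices after the multiplies. A scalar sketch of a single channel, using the same rounding as the vector helpers; the name is illustrative, not from the patch:

#include <cstdint>

// One channel of interpolate65535: blend x and y with 16-bit weights.
// Assumes alpha1 + alpha2 <= 65535 so the sum fits in a quint16.
static inline uint16_t interpolateChannel65535(uint16_t x, uint32_t alpha1,
                                               uint16_t y, uint32_t alpha2)
{
    const uint32_t px = x * alpha1;
    const uint32_t py = y * alpha2;
    // Rounded division by 65535, as in the vector code above.
    return uint16_t(((px + (px >> 16) + 0x8000u) >> 16)
                  + ((py + (py >> 16) + 0x8000u) >> 16));
}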
@@ -163,12 +188,42 @@ inline uint16x4_t interpolate65535(uint16x4_t x, uint16x4_t alpha1, uint16x4_t y
}
#endif
inline QRgba64 addWithSaturation(QRgba64 a, QRgba64 b)
static inline QRgba64 interpolate65535(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
{
#if defined(__SSE2__)
const __m128i vx = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&x));
const __m128i vy = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&y));
const __m128i vr = interpolate65535(vx, alpha1, vy, alpha2);
QRgba64 r;
_mm_storel_epi64(reinterpret_cast<__m128i *>(&r), vr);
return r;
#elif defined(__ARM_NEON__)
const uint16x4_t vx = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&x)));
const uint16x4_t vy = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&y)));
const uint16x4_t vr = interpolate65535(vx, alpha1, vy, alpha2);
QRgba64 r;
vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr));
return r;
#else
return QRgba64::fromRgba64(multiplyAlpha65535(x, alpha1) + multiplyAlpha65535(y, alpha2));
#endif
}
static inline QRgba64 addWithSaturation(QRgba64 a, QRgba64 b)
{
#if defined(__SSE2__)
const __m128i va = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&a));
const __m128i vb = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&b));
const __m128i vr = _mm_adds_epu16(va, vb);
QRgba64 r;
_mm_storel_epi64(reinterpret_cast<__m128i *>(&r), vr);
return r;
#else
return QRgba64::fromRgba64(qMin(a.red() + b.red(), 65535),
qMin(a.green() + b.green(), 65535),
qMin(a.blue() + b.blue(), 65535),
qMin(a.alpha() + b.alpha(), 65535));
#endif
}
#if QT_COMPILER_SUPPORTS_HERE(SSE2)
@@ -196,7 +251,7 @@ static inline uint toArgb32(uint16x4_t v)
static inline uint toArgb32(QRgba64 rgba64)
{
#if defined __SSE2__
__m128i v = _mm_loadl_epi64((const __m128i *)&rgba64);
__m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&rgba64));
v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3, 0, 1, 2));
return toArgb32(v);
#elif defined __ARM_NEON__
@@ -216,7 +271,7 @@ static inline uint toArgb32(QRgba64 rgba64)
static inline uint toRgba8888(QRgba64 rgba64)
{
#if defined __SSE2__
__m128i v = _mm_loadl_epi64((const __m128i *)&rgba64);
__m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&rgba64));
return toArgb32(v);
#elif defined __ARM_NEON__
uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64)));
@@ -230,8 +285,8 @@ static inline QRgba64 rgbBlend(QRgba64 d, QRgba64 s, uint rgbAlpha)
{
QRgba64 blend;
#if defined(__SSE2__)
__m128i vd = _mm_loadl_epi64((const __m128i *)&d);
__m128i vs = _mm_loadl_epi64((const __m128i *)&s);
__m128i vd = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&d));
__m128i vs = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&s));
__m128i va = _mm_cvtsi32_si128(rgbAlpha);
va = _mm_unpacklo_epi8(va, va);
va = _mm_shufflelo_epi16(va, _MM_SHUFFLE(3, 0, 1, 2));
@@ -243,9 +298,9 @@ static inline QRgba64 rgbBlend(QRgba64 d, QRgba64 s, uint rgbAlpha)
vd = _mm_add_epi32(vd, _mm_srli_epi32(vd, 16));
vd = _mm_add_epi32(vd, _mm_set1_epi32(0x8000));
vd = _mm_srai_epi32(vd, 16);
vd = _mm_packs_epi32(vd, _mm_setzero_si128());
vd = _mm_packs_epi32(vd, vd);
_mm_storel_epi64((__m128i *)&blend, vd);
_mm_storel_epi64(reinterpret_cast<__m128i *>(&blend), vd);
#elif defined(__ARM_NEON__)
uint16x4_t vd = vreinterpret_u16_u64(vmov_n_u64(d));
uint16x4_t vs = vreinterpret_u16_u64(vmov_n_u64(s));
@@ -276,8 +331,17 @@ static inline void blend_pixel(QRgba64 &dst, QRgba64 src)
{
if (src.isOpaque())
dst = src;
else if (!src.isTransparent())
else if (!src.isTransparent()) {
#if defined(__SSE2__)
const __m128i vd = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&dst));
const __m128i vs = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&src));
const __m128i via = _mm_xor_si128(_mm_set1_epi16(-1), _mm_shufflelo_epi16(vs, _MM_SHUFFLE(3, 3, 3, 3)));
const __m128i vr = _mm_add_epi16(vs, multiplyAlpha65535(vd, via));
_mm_storel_epi64(reinterpret_cast<__m128i *>(&dst), vr);
#else
dst = src + multiplyAlpha65535(dst, 65535 - src.alpha());
#endif
}
}
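The SSE2 path added to blend_pixel is the usual source-over composite on premultiplied RGBA64: dst = src + dst * (65535 - src.alpha) / 65535 per channel. The inverse alpha vector is built by broadcasting the alpha lane with _mm_shufflelo_epi16(vs, _MM_SHUFFLE(3, 3, 3, 3)) and XOR-ing with all-ones, since 0xFFFF ^ a == 65535 - a for a 16-bit a. A scalar sketch of one channel; the helper is illustrative, not part of the patch:

#include <cstdint>

// One channel of source-over on premultiplied RGBA64:
//   out = src + dst * (65535 - srcAlpha) / 65535   (rounded)
// Premultiplication guarantees src <= srcAlpha, so the sum cannot overflow.
static inline uint16_t blendChannel(uint16_t dst, uint16_t src, uint16_t srcAlpha)
{
    const uint32_t p = dst * uint32_t(65535 - srcAlpha);
    return uint16_t(src + ((p + (p >> 16) + 0x8000u) >> 16));
}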
static inline void blend_pixel(QRgba64 &dst, QRgba64 src, const int const_alpha)
@@ -285,8 +349,17 @@ static inline void blend_pixel(QRgba64 &dst, QRgba64 src, const int const_alpha)
if (const_alpha == 255)
return blend_pixel(dst, src);
if (!src.isTransparent()) {
#if defined(__SSE2__)
const __m128i vd = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&dst));
__m128i vs = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&src));
vs = multiplyAlpha255(vs, const_alpha);
const __m128i via = _mm_xor_si128(_mm_set1_epi16(-1), _mm_shufflelo_epi16(vs, _MM_SHUFFLE(3, 3, 3, 3)));
const __m128i vr = _mm_add_epi16(vs, multiplyAlpha65535(vd, via));
_mm_storel_epi64(reinterpret_cast<__m128i *>(&dst), vr);
#else
src = multiplyAlpha255(src, const_alpha);
dst = src + multiplyAlpha65535(dst, 65535 - src.alpha());
#endif
}
}
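The const_alpha overload above first scales the premultiplied source (all four channels, alpha included) by const_alpha/255 and then performs the same source-over blend with the scaled alpha. A scalar sketch of one channel, assuming the 0..255 weight is widened to 0..65535 by multiplying with 257 (so that 255 maps exactly to 65535); the helper name and the widening step are illustrative assumptions, not taken from the patch:

#include <cstdint>

// One channel of blend_pixel(dst, src, const_alpha): scale src and srcAlpha by
// const_alpha first, then composite source-over as in the plain blend_pixel.
static inline uint16_t blendChannelConstAlpha(uint16_t dst, uint16_t src,
                                              uint16_t srcAlpha, int constAlpha)
{
    const auto div65535 = [](uint32_t x) { return (x + (x >> 16) + 0x8000u) >> 16; };
    const uint32_t w = uint32_t(constAlpha) * 257;     // 0..255 -> 0..65535
    const uint32_t s = div65535(src * w);              // scaled source channel
    const uint32_t a = div65535(srcAlpha * w);         // scaled source alpha
    return uint16_t(s + div65535(dst * (65535u - a)));
}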