Minor improvements in qrgba64_p.h

Adds SIMD acceleration for the blend_pixel and raw interpolate methods,
and cleans up other SIMD code.

Gives minor speedups in text rendering and various fallbacks.

Change-Id: Ib0ad8b408450e4e73f3c1d50e9caaed0098acb94
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Allan Sandfeld Jensen 2020-08-23 09:20:01 +02:00
parent 53b11882c8
commit d16ac88cb0


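The QRgba64-typed helpers this change adds all follow the same shape: load the 64-bit value into the low half of an SSE2 (or NEON) register, run the existing vector helper, and store the low 64 bits back, keeping the old scalar code as the fallback. A minimal standalone sketch of that round-trip, using uint64_t in place of QRgba64 and a made-up per-channel operation (halving); this assumes SSE2 and is illustrative only, not part of the patch:

#include <emmintrin.h>   // SSE2 intrinsics
#include <cstdint>

// Illustrative only: halve every 16-bit channel of a packed RGBA64 value.
// QRgba64 stores four contiguous quint16s, so the real helpers can
// reinterpret_cast the struct exactly the way uint64_t is reinterpreted here.
static inline uint64_t halveChannels(uint64_t rgba64)
{
    const __m128i v  = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&rgba64));
    const __m128i vr = _mm_srli_epi16(v, 1);  // shift each 16-bit lane right by one
    uint64_t r;
    _mm_storel_epi64(reinterpret_cast<__m128i *>(&r), vr);
    return r;
}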
@@ -1,6 +1,6 @@
/****************************************************************************
**
** Copyright (C) 2016 The Qt Company Ltd.
** Copyright (C) 2020 The Qt Company Ltd.
** Contact: https://www.qt.io/licensing/
**
** This file is part of the QtGui module of the Qt Toolkit.
@@ -64,15 +64,7 @@ inline QRgba64 combineAlpha256(QRgba64 rgba64, uint alpha256)
return QRgba64::fromRgba64(rgba64.red(), rgba64.green(), rgba64.blue(), (rgba64.alpha() * alpha256) >> 8);
}
inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535)
{
return QRgba64::fromRgba64(qt_div_65535(rgba64.red() * alpha65535),
qt_div_65535(rgba64.green() * alpha65535),
qt_div_65535(rgba64.blue() * alpha65535),
qt_div_65535(rgba64.alpha() * alpha65535));
}
#ifdef __SSE2__
#if defined(__SSE2__)
static inline __m128i Q_DECL_VECTORCALL multiplyAlpha65535(__m128i rgba64, __m128i va)
{
__m128i vs = rgba64;
@@ -80,7 +72,7 @@ static inline __m128i Q_DECL_VECTORCALL multiplyAlpha65535(__m128i rgba64, __m12
vs = _mm_add_epi32(vs, _mm_srli_epi32(vs, 16));
vs = _mm_add_epi32(vs, _mm_set1_epi32(0x8000));
vs = _mm_srai_epi32(vs, 16);
vs = _mm_packs_epi32(vs, _mm_setzero_si128());
vs = _mm_packs_epi32(vs, vs);
return vs;
}
static inline __m128i Q_DECL_VECTORCALL multiplyAlpha65535(__m128i rgba64, uint alpha65535)
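The multiplyAlpha65535 vector helper above (and rgbBlend further down) relies on the standard trick for dividing a 32-bit product by 65535 with rounding but without an actual divide: add the high word of the product back in, add the 0x8000 rounding bias, and keep the high word. A scalar sketch of the same computation with a small self-check against a real rounded division; this is hypothetical standalone code, not part of the patch:

#include <cstdint>
#include <cassert>
#include <initializer_list>

// (x + (x >> 16) + 0x8000) >> 16 approximates a rounded x / 65535; the vector
// code performs exactly this per 32-bit lane before packing back to 16 bits.
static inline uint32_t div65535(uint32_t x)   // x = channel * alpha, both <= 65535
{
    return (x + (x >> 16) + 0x8000u) >> 16;
}

int main()
{
    // Spot-check a few channel/alpha pairs against a real rounded divide.
    for (uint32_t c : {0u, 1u, 255u, 32768u, 65535u})
        for (uint32_t a : {0u, 1u, 257u, 32768u, 65535u})
            assert(div65535(c * a) == (c * a + 32767u) / 65535u);
    return 0;
}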
@@ -103,6 +95,28 @@ static inline uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint alpha65535)
}
#endif
static inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535)
{
#if defined(__SSE2__)
const __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&rgba64));
const __m128i vr = multiplyAlpha65535(v, alpha65535);
QRgba64 r;
_mm_storel_epi64(reinterpret_cast<__m128i *>(&r), vr);
return r;
#elif defined(__ARM_NEON__)
const uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64)));
const uint16x4_t vr = multiplyAlpha65535(v, alpha65535);
QRgba64 r;
vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr));
return r;
#else
return QRgba64::fromRgba64(qt_div_65535(rgba64.red() * alpha65535),
qt_div_65535(rgba64.green() * alpha65535),
qt_div_65535(rgba64.blue() * alpha65535),
qt_div_65535(rgba64.alpha() * alpha65535));
#endif
}
template<typename T>
static inline T Q_DECL_VECTORCALL multiplyAlpha255(T rgba64, uint alpha255)
{
@@ -116,15 +130,10 @@ static inline T Q_DECL_VECTORCALL multiplyAlpha255(T rgba64, uint alpha255)
#endif
}
inline QRgba64 interpolate255(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
{
return QRgba64::fromRgba64(multiplyAlpha255(x, alpha1) + multiplyAlpha255(y, alpha2));
}
#if defined __SSE2__
static inline __m128i Q_DECL_VECTORCALL interpolate255(__m128i x, uint alpha1, __m128i y, uint alpha2)
{
return _mm_add_epi32(multiplyAlpha255(x, alpha1), multiplyAlpha255(y, alpha2));
return _mm_add_epi16(multiplyAlpha255(x, alpha1), multiplyAlpha255(y, alpha2));
}
#endif
@@ -135,20 +144,36 @@ inline uint16x4_t interpolate255(uint16x4_t x, uint alpha1, uint16x4_t y, uint a
}
#endif
inline QRgba64 interpolate65535(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
static inline QRgba64 interpolate255(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
{
return QRgba64::fromRgba64(multiplyAlpha65535(x, alpha1) + multiplyAlpha65535(y, alpha2));
#if defined(__SSE2__)
const __m128i vx = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&x));
const __m128i vy = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&y));
const __m128i vr = interpolate255(vx, alpha1, vy, alpha2);
QRgba64 r;
_mm_storel_epi64(reinterpret_cast<__m128i *>(&r), vr);
return r;
#elif defined(__ARM_NEON__)
const uint16x4_t vx = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&x)));
const uint16x4_t vy = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&y)));
const uint16x4_t vr = interpolate255(vx, alpha1, vy, alpha2);
QRgba64 r;
vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr));
return r;
#else
return QRgba64::fromRgba64(multiplyAlpha255(x, alpha1) + multiplyAlpha255(y, alpha2));
#endif
}
#if defined __SSE2__
static inline __m128i Q_DECL_VECTORCALL interpolate65535(__m128i x, uint alpha1, __m128i y, uint alpha2)
{
return _mm_add_epi32(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
return _mm_add_epi16(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
}
// alpha2 below is const-ref because otherwise MSVC2015 complains that it can't 16-byte align the argument.
static inline __m128i Q_DECL_VECTORCALL interpolate65535(__m128i x, __m128i alpha1, __m128i y, const __m128i &alpha2)
static inline __m128i Q_DECL_VECTORCALL interpolate65535(__m128i x, __m128i alpha1, __m128i y, __m128i alpha2)
{
return _mm_add_epi32(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
return _mm_add_epi16(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
}
#endif
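interpolate65535 above is the 16-bit-per-channel weighted blend: each channel of the result is x*alpha1/65535 + y*alpha2/65535. Callers generally keep alpha1 + alpha2 <= 65535 (typically alpha2 = 65535 - alpha1), so the per-channel sum still fits in a quint16 and a 16-bit lane add suffices after the multiplies. A scalar sketch of a single channel, using the same rounding as the vector helpers; the name is illustrative, not from the patch:

#include <cstdint>

// One channel of interpolate65535: blend x and y with 16-bit weights.
// Assumes alpha1 + alpha2 <= 65535 so the sum fits in a quint16.
static inline uint16_t interpolateChannel65535(uint16_t x, uint32_t alpha1,
                                               uint16_t y, uint32_t alpha2)
{
    const uint32_t px = x * alpha1;
    const uint32_t py = y * alpha2;
    // Rounded division by 65535, as in the vector code above.
    return uint16_t(((px + (px >> 16) + 0x8000u) >> 16)
                  + ((py + (py >> 16) + 0x8000u) >> 16));
}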
@@ -163,12 +188,42 @@ inline uint16x4_t interpolate65535(uint16x4_t x, uint16x4_t alpha1, uint16x4_t y
}
#endif
inline QRgba64 addWithSaturation(QRgba64 a, QRgba64 b)
static inline QRgba64 interpolate65535(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
{
#if defined(__SSE2__)
const __m128i vx = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&x));
const __m128i vy = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&y));
const __m128i vr = interpolate65535(vx, alpha1, vy, alpha2);
QRgba64 r;
_mm_storel_epi64(reinterpret_cast<__m128i *>(&r), vr);
return r;
#elif defined(__ARM_NEON__)
const uint16x4_t vx = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&x)));
const uint16x4_t vy = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&y)));
const uint16x4_t vr = interpolate65535(vx, alpha1, vy, alpha2);
QRgba64 r;
vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr));
return r;
#else
return QRgba64::fromRgba64(multiplyAlpha65535(x, alpha1) + multiplyAlpha65535(y, alpha2));
#endif
}
static inline QRgba64 addWithSaturation(QRgba64 a, QRgba64 b)
{
#if defined(__SSE2__)
const __m128i va = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&a));
const __m128i vb = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&b));
const __m128i vr = _mm_adds_epu16(va, vb);
QRgba64 r;
_mm_storel_epi64(reinterpret_cast<__m128i *>(&r), vr);
return r;
#else
return QRgba64::fromRgba64(qMin(a.red() + b.red(), 65535),
qMin(a.green() + b.green(), 65535),
qMin(a.blue() + b.blue(), 65535),
qMin(a.alpha() + b.alpha(), 65535));
#endif
}
#if QT_COMPILER_SUPPORTS_HERE(SSE2)
@@ -196,7 +251,7 @@ static inline uint toArgb32(uint16x4_t v)
static inline uint toArgb32(QRgba64 rgba64)
{
#if defined __SSE2__
__m128i v = _mm_loadl_epi64((const __m128i *)&rgba64);
__m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&rgba64));
v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3, 0, 1, 2));
return toArgb32(v);
#elif defined __ARM_NEON__
@@ -216,7 +271,7 @@ static inline uint toArgb32(QRgba64 rgba64)
static inline uint toRgba8888(QRgba64 rgba64)
{
#if defined __SSE2__
__m128i v = _mm_loadl_epi64((const __m128i *)&rgba64);
__m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&rgba64));
return toArgb32(v);
#elif defined __ARM_NEON__
uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64)));
@@ -230,8 +285,8 @@ static inline QRgba64 rgbBlend(QRgba64 d, QRgba64 s, uint rgbAlpha)
{
QRgba64 blend;
#if defined(__SSE2__)
__m128i vd = _mm_loadl_epi64((const __m128i *)&d);
__m128i vs = _mm_loadl_epi64((const __m128i *)&s);
__m128i vd = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&d));
__m128i vs = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&s));
__m128i va = _mm_cvtsi32_si128(rgbAlpha);
va = _mm_unpacklo_epi8(va, va);
va = _mm_shufflelo_epi16(va, _MM_SHUFFLE(3, 0, 1, 2));
@@ -243,9 +298,9 @@ static inline QRgba64 rgbBlend(QRgba64 d, QRgba64 s, uint rgbAlpha)
vd = _mm_add_epi32(vd, _mm_srli_epi32(vd, 16));
vd = _mm_add_epi32(vd, _mm_set1_epi32(0x8000));
vd = _mm_srai_epi32(vd, 16);
vd = _mm_packs_epi32(vd, _mm_setzero_si128());
vd = _mm_packs_epi32(vd, vd);
_mm_storel_epi64((__m128i *)&blend, vd);
_mm_storel_epi64(reinterpret_cast<__m128i *>(&blend), vd);
#elif defined(__ARM_NEON__)
uint16x4_t vd = vreinterpret_u16_u64(vmov_n_u64(d));
uint16x4_t vs = vreinterpret_u16_u64(vmov_n_u64(s));
@@ -276,8 +331,17 @@ static inline void blend_pixel(QRgba64 &dst, QRgba64 src)
{
if (src.isOpaque())
dst = src;
else if (!src.isTransparent())
else if (!src.isTransparent()) {
#if defined(__SSE2__)
const __m128i vd = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&dst));
const __m128i vs = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&src));
const __m128i via = _mm_xor_si128(_mm_set1_epi16(-1), _mm_shufflelo_epi16(vs, _MM_SHUFFLE(3, 3, 3, 3)));
const __m128i vr = _mm_add_epi16(vs, multiplyAlpha65535(vd, via));
_mm_storel_epi64(reinterpret_cast<__m128i *>(&dst), vr);
#else
dst = src + multiplyAlpha65535(dst, 65535 - src.alpha());
#endif
}
}
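The SSE2 path added to blend_pixel is the usual source-over composite on premultiplied RGBA64: dst = src + dst * (65535 - src.alpha) / 65535 per channel. The inverse alpha vector is built by broadcasting the alpha lane with _mm_shufflelo_epi16(vs, _MM_SHUFFLE(3, 3, 3, 3)) and XOR-ing with all-ones, since 0xFFFF ^ a == 65535 - a for a 16-bit a. A scalar sketch of one channel; the helper is illustrative, not part of the patch:

#include <cstdint>

// One channel of source-over on premultiplied RGBA64:
//   out = src + dst * (65535 - srcAlpha) / 65535   (rounded)
// Premultiplication guarantees src <= srcAlpha, so the sum cannot overflow.
static inline uint16_t blendChannel(uint16_t dst, uint16_t src, uint16_t srcAlpha)
{
    const uint32_t p = dst * uint32_t(65535 - srcAlpha);
    return uint16_t(src + ((p + (p >> 16) + 0x8000u) >> 16));
}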
static inline void blend_pixel(QRgba64 &dst, QRgba64 src, const int const_alpha)
@@ -285,8 +349,17 @@ static inline void blend_pixel(QRgba64 &dst, QRgba64 src, const int const_alpha)
if (const_alpha == 255)
return blend_pixel(dst, src);
if (!src.isTransparent()) {
#if defined(__SSE2__)
const __m128i vd = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&dst));
__m128i vs = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&src));
vs = multiplyAlpha255(vs, const_alpha);
const __m128i via = _mm_xor_si128(_mm_set1_epi16(-1), _mm_shufflelo_epi16(vs, _MM_SHUFFLE(3, 3, 3, 3)));
const __m128i vr = _mm_add_epi16(vs, multiplyAlpha65535(vd, via));
_mm_storel_epi64(reinterpret_cast<__m128i *>(&dst), vr);
#else
src = multiplyAlpha255(src, const_alpha);
dst = src + multiplyAlpha65535(dst, 65535 - src.alpha());
#endif
}
}
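The const_alpha overload above first scales the premultiplied source (all four channels, alpha included) by const_alpha/255 and then performs the same source-over blend with the scaled alpha. A scalar sketch of one channel, assuming the 0..255 weight is widened to 0..65535 by multiplying with 257 (so that 255 maps exactly to 65535); the helper name and the widening step are illustrative assumptions, not taken from the patch:

#include <cstdint>

// One channel of blend_pixel(dst, src, const_alpha): scale src and srcAlpha by
// const_alpha first, then composite source-over as in the plain blend_pixel.
static inline uint16_t blendChannelConstAlpha(uint16_t dst, uint16_t src,
                                              uint16_t srcAlpha, int constAlpha)
{
    const auto div65535 = [](uint32_t x) { return (x + (x >> 16) + 0x8000u) >> 16; };
    const uint32_t w = uint32_t(constAlpha) * 257;     // 0..255 -> 0..65535
    const uint32_t s = div65535(src * w);              // scaled source channel
    const uint32_t a = div65535(srcAlpha * w);         // scaled source alpha
    return uint16_t(s + div65535(dst * (65535u - a)));
}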