From b91f86a2128093ad7c65fa30b63ef87a9e55a4e0 Mon Sep 17 00:00:00 2001
From: Allan Sandfeld Jensen <allan.jensen@theqtcompany.com>
Date: Wed, 6 Jul 2016 17:11:59 +0200
Subject: [PATCH] Improve accuracy in fast path bilinear sampling

Adds rounding before using the optimized low-accuracy interpolation.
This reduces the magnitude of error in the scaled result from ~4 bits
to just 2 bits.

Change-Id: Ie4e618bf5b1f4a74367aa419ebbd534cc6a846b3
Reviewed-by: Eirik Aavitsland <eirik.aavitsland@qt.io>
---
 src/gui/painting/qdrawhelper.cpp           | 32 ++++++----
 tests/auto/gui/image/qimage/tst_qimage.cpp | 69 ++++++++++++++--------
 2 files changed, 62 insertions(+), 39 deletions(-)
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index b452019251..f7b81944c5 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -1827,9 +1827,9 @@ static const QRgba64 *QT_FASTCALL fetchTransformed64(QRgba64 *buffer, const Oper
 
 /** \internal
   interpolate 4 argb pixels with the distx and disty factor.
-  distx and disty bust be between 0 and 16
+  distx and disty must be between 0 and 16
  */
-static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, int distx, int disty)
+static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, uint distx, uint disty)
 {
     uint distxy = distx * disty;
     //idistx * disty = (16-distx) * disty = 16*disty - distxy
@@ -2176,7 +2176,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                 fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
                 const uint *s1 = (const uint *)data->texture.scanLine(y1);
                 const uint *s2 = (const uint *)data->texture.scanLine(y2);
-                int disty = (fy & 0x0000ffff) >> 12;
+                int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
 
                 if (blendType != BlendTransformedBilinearTiled) {
 #define BILINEAR_DOWNSCALE_BOUNDS_PROLOG \
@@ -2190,7 +2190,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                         uint tr = s1[x2]; \
                         uint bl = s2[x1]; \
                         uint br = s2[x2]; \
-                        int distx = (fx & 0x0000ffff) >> 12; \
+                        int distx = ((fx & 0x0000ffff) + 0x0800) >> 12; \
                         *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty); \
                         fx += fdx; \
                         ++b; \
@@ -2209,6 +2209,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                     const __m128i v_256 = _mm_set1_epi16(256);
                     const __m128i v_disty = _mm_set1_epi16(disty);
                     const __m128i v_fdx = _mm_set1_epi32(fdx*4);
+                    const __m128i v_fx_r = _mm_set1_epi32(0x8);
                     __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
 
                     while (b < boundedEnd) {
@@ -2222,7 +2223,8 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                         const __m128i bl = _mm_setr_epi32(s2[offset0], s2[offset1], s2[offset2], s2[offset3]);
                         const __m128i br = _mm_setr_epi32(s2[offset0 + 1], s2[offset1 + 1], s2[offset2 + 1], s2[offset3 + 1]);
 
-                        __m128i v_distx = _mm_srli_epi16(v_fx, 12);
+                        __m128i v_distx = _mm_srli_epi16(v_fx, 8);
+                        v_distx = _mm_srli_epi16(_mm_add_epi32(v_distx, v_fx_r), 4);
                         v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
                         v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
 
@@ -2252,6 +2254,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                     }
 
                     const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff);
+                    const int32x4_t v_fx_r = vdupq_n_s32(0x0800);
 
                     while (b < boundedEnd) {
 
@@ -2260,7 +2263,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                         Vect_buffer v_fx_shifted;
                         v_fx_shifted.vect = vshrq_n_s32(v_fx.vect, 16);
 
-                        int32x4_t v_distx = vshrq_n_s32(vandq_s32(v_fx.vect, v_ffff_mask), 12);
+                        int32x4_t v_distx = vshrq_n_s32(vaddq_s32(vandq_s32(v_fx.vect, v_ffff_mask), v_fx_r), 12);
 
                         for (int i = 0; i < 4; i++) {
                             int x1 = v_fx_shifted.i[i];
@@ -2290,7 +2293,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                     uint tr = s1[x2];
                     uint bl = s2[x1];
                     uint br = s2[x2];
-                    int distx = (fx & 0x0000ffff) >> 12;
+                    int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
                     *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
                     fx += fdx;
                     ++b;
@@ -2362,6 +2365,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                     const __m128i v_256 = _mm_set1_epi16(256);
                     const __m128i v_fdx = _mm_set1_epi32(fdx*4);
                     const __m128i v_fdy = _mm_set1_epi32(fdy*4);
+                    const __m128i v_fxy_r = _mm_set1_epi32(0x8);
                     __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
                     __m128i v_fy = _mm_setr_epi32(fy, fy + fdy, fy + fdy + fdy, fy + fdy + fdy + fdy);
 
@@ -2396,6 +2400,8 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
 
                         __m128i v_distx = _mm_srli_epi16(v_fx, 12);
                         __m128i v_disty = _mm_srli_epi16(v_fy, 12);
+                        v_distx = _mm_srli_epi16(_mm_add_epi32(_mm_srli_epi16(v_fx, 8), v_fxy_r), 4); // ((frac >> 8) + 8) >> 4: rounded weight in [0, 16]
+                        v_disty = _mm_srli_epi16(_mm_add_epi32(_mm_srli_epi16(v_fy, 8), v_fxy_r), 4); // same rounding as the scalar ((f & 0xffff) + 0x0800) >> 12
                         v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
                         v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
                         v_disty = _mm_shufflehi_epi16(v_disty, _MM_SHUFFLE(2,2,0,0));
@@ -2434,8 +2440,8 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                     int disty = (fy & 0x0000ffff) >> 8;
                     *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
 #else
-                    int distx = (fx & 0x0000ffff) >> 12;
-                    int disty = (fy & 0x0000ffff) >> 12;
+                    int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
+                    int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
                     *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
 #endif
 
@@ -2664,13 +2670,13 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
                             fracX += fdx;
                         }
                     } else { //scale down
-                        int disty = (fy & 0x0000ffff) >> 12;
+                        int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
                         for (int i = 0; i < len; ++i) {
                             uint tl = buf1[i * 2 + 0];
                             uint tr = buf1[i * 2 + 1];
                             uint bl = buf2[i * 2 + 0];
                             uint br = buf2[i * 2 + 1];
-                            int distx = (fracX & 0x0000ffff) >> 12;
+                            int distx = ((fracX & 0x0000ffff) + 0x0800) >> 12;
                             b[i] = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
                             fracX += fdx;
                         }
@@ -2736,8 +2742,8 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
                         uint bl = buf2[i * 2 + 0];
                         uint br = buf2[i * 2 + 1];
 
-                        int distx = (fracX & 0x0000ffff) >> 12;
-                        int disty = (fracY & 0x0000ffff) >> 12;
+                        int distx = ((fracX & 0x0000ffff) + 0x0800) >> 12;
+                        int disty = ((fracY & 0x0000ffff) + 0x0800) >> 12;
 
                         b[i] = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
                         fracX += fdx;
diff --git a/tests/auto/gui/image/qimage/tst_qimage.cpp b/tests/auto/gui/image/qimage/tst_qimage.cpp
index 1bc4ec2ae7..7e1a02d716 100644
--- a/tests/auto/gui/image/qimage/tst_qimage.cpp
+++ b/tests/auto/gui/image/qimage/tst_qimage.cpp
@@ -106,6 +106,7 @@ private slots:
     void smoothScale();
     void smoothScale2_data();
     void smoothScale2();
+    void smoothScale3_data();
     void smoothScale3();
     void smoothScale4();
 
@@ -1715,9 +1716,12 @@ static inline int rand8()
     return int(256. * (qrand() / (RAND_MAX + 1.0)));
 }
 
-// compares img.scale against the bilinear filtering used by QPainter
-void tst_QImage::smoothScale3()
+void tst_QImage::smoothScale3_data()
 {
+    QTest::addColumn<QImage>("img");
+    QTest::addColumn<qreal>("scale_x");
+    QTest::addColumn<qreal>("scale_y");
+
     QImage img(128, 128, QImage::Format_RGB32);
     for (int y = 0; y < img.height(); ++y) {
         for (int x = 0; x < img.width(); ++x) {
@@ -1730,36 +1734,49 @@ void tst_QImage::smoothScale3()
         }
     }
 
-    qreal scales[2] = { .5, 2 };
+    QTest::newRow("(0.5, 0.5)") << img << qreal(0.5) << qreal(0.5);
+    QTest::newRow("(0.5, 1.0)") << img << qreal(0.5) << qreal(1.0);
+    QTest::newRow("(1.0, 0.5)") << img << qreal(1.0) << qreal(0.5);
+    QTest::newRow("(0.5, 2.0)") << img << qreal(0.5) << qreal(2.0);
+    QTest::newRow("(1.0, 2.0)") << img << qreal(1.0) << qreal(2.0);
+    QTest::newRow("(2.0, 0.5)") << img << qreal(2.0) << qreal(0.5);
+    QTest::newRow("(2.0, 1.0)") << img << qreal(2.0) << qreal(1.0);
+    QTest::newRow("(2.0, 2.0)") << img << qreal(2) << qreal(2);
+}
+// compares img.scale against the bilinear filtering used by QPainter
+void tst_QImage::smoothScale3()
+{
+    QFETCH(QImage, img);
+    QFETCH(qreal, scale_x);
+    QFETCH(qreal, scale_y);
 
-    for (int i = 0; i < 2; ++i) {
-        QImage a = img.scaled(img.size() * scales[i], Qt::IgnoreAspectRatio, Qt::SmoothTransformation);
-        QImage b(a.size(), a.format());
-        b.fill(0x0);
+    QImage a = img.scaled(img.width() * scale_x, img.height() * scale_y, Qt::IgnoreAspectRatio, Qt::SmoothTransformation);
+    QImage b(a.size(), a.format());
+    b.fill(0x0);
 
-        QPainter p(&b);
-        p.setRenderHint(QPainter::SmoothPixmapTransform);
-        p.scale(scales[i], scales[i]);
-        p.drawImage(0, 0, img);
-        p.end();
-        int err = 0;
+    QPainter p(&b);
+    p.setRenderHint(QPainter::SmoothPixmapTransform);
+    p.scale(scale_x, scale_y);
+    p.drawImage(0, 0, img);
+    p.end();
+    int err = 0;
 
-        for (int y = 0; y < a.height(); ++y) {
-            for (int x = 0; x < a.width(); ++x) {
-                QRgb ca = a.pixel(x, y);
-                QRgb cb = b.pixel(x, y);
+    for (int y = 0; y < a.height(); ++y) {
+        for (int x = 0; x < a.width(); ++x) {
+            QRgb ca = a.pixel(x, y);
+            QRgb cb = b.pixel(x, y);
 
-                // tolerate a little bit of rounding errors
-                bool r = true;
-                r &= qAbs(qRed(ca) - qRed(cb)) <= 18;
-                r &= qAbs(qGreen(ca) - qGreen(cb)) <= 18;
-                r &= qAbs(qBlue(ca) - qBlue(cb)) <= 18;
-                if (!r)
-                    err++;
-            }
+            // tolerate a little bit of rounding errors
+            int tolerance = 3;
+            bool r = true;
+            r &= qAbs(qRed(ca) - qRed(cb)) <= tolerance;
+            r &= qAbs(qGreen(ca) - qGreen(cb)) <= tolerance;
+            r &= qAbs(qBlue(ca) - qBlue(cb)) <= tolerance;
+            if (!r)
+                err++;
         }
-        QCOMPARE(err, 0);
     }
+    QCOMPARE(err, 0);
 }
 
 // Tests smooth upscale is smooth