Improve accuracy in fast path bilinear sampling

Adds rounding before using the optimized low-accuracy interpolation. This reduces the magnitude of error in the scaled result from ~4 bits to just 2 bits.

Change-Id: Ie4e618bf5b1f4a74367aa419ebbd534cc6a846b3
Reviewed-by: Eirik Aavitsland <eirik.aavitsland@qt.io>
2016-07-06 17:11:59 +02:00 · 2016-07-06 17:11:59 +02:00 · b91f86a212
commit b91f86a212
parent 529b1c9e2a
2 changed files with 62 additions and 39 deletions
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@ -1827,9 +1827,9 @@ static const QRgba64 *QT_FASTCALL fetchTransformed64(QRgba64 *buffer, const Oper

 /** \internal
  interpolate 4 argb pixels with the distx and disty factor.
-  distx and disty bust be between 0 and 16
+  distx and disty must be between 0 and 16
 */
-static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, int distx, int disty)
+static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, uint distx, uint disty)
 {
    uint distxy = distx * disty;
    //idistx * disty = (16-distx) * disty = 16*disty - distxy
@ -2176,7 +2176,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
                const uint *s1 = (const uint *)data->texture.scanLine(y1);
                const uint *s2 = (const uint *)data->texture.scanLine(y2);
-                int disty = (fy & 0x0000ffff) >> 12;
+                int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;

                if (blendType != BlendTransformedBilinearTiled) {
 #define BILINEAR_DOWNSCALE_BOUNDS_PROLOG \
@ -2190,7 +2190,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                        uint tr = s1[x2]; \
                        uint bl = s2[x1]; \
                        uint br = s2[x2]; \
-                        int distx = (fx & 0x0000ffff) >> 12; \
+                        int distx = ((fx & 0x0000ffff) + 0x0800) >> 12; \
                        *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty); \
                        fx += fdx; \
                        ++b; \
@ -2209,6 +2209,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                    const __m128i v_256 = _mm_set1_epi16(256);
                    const __m128i v_disty = _mm_set1_epi16(disty);
                    const __m128i v_fdx = _mm_set1_epi32(fdx*4);
+                    const __m128i v_fx_r = _mm_set1_epi32(0x8);
                    __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);

                    while (b < boundedEnd) {
@ -2222,7 +2223,8 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                        const __m128i bl = _mm_setr_epi32(s2[offset0], s2[offset1], s2[offset2], s2[offset3]);
                        const __m128i br = _mm_setr_epi32(s2[offset0 + 1], s2[offset1 + 1], s2[offset2 + 1], s2[offset3 + 1]);

-                        __m128i v_distx = _mm_srli_epi16(v_fx, 12);
+                        __m128i v_distx = _mm_srli_epi16(v_fx, 8);
+                        v_distx = _mm_srli_epi16(_mm_add_epi32(v_distx, v_fx_r), 4);
                        v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
                        v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));

@ -2252,6 +2254,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                    }

                    const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff);
+                    const int32x4_t v_fx_r = vdupq_n_s32(0x0800);

                    while (b < boundedEnd) {

@ -2260,7 +2263,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                        Vect_buffer v_fx_shifted;
                        v_fx_shifted.vect = vshrq_n_s32(v_fx.vect, 16);

-                        int32x4_t v_distx = vshrq_n_s32(vandq_s32(v_fx.vect, v_ffff_mask), 12);
+                        int32x4_t v_distx = vshrq_n_s32(vaddq_s32(vandq_s32(v_fx.vect, v_ffff_mask), v_fx_r), 12);

                        for (int i = 0; i < 4; i++) {
                            int x1 = v_fx_shifted.i[i];
@ -2290,7 +2293,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                    uint tr = s1[x2];
                    uint bl = s2[x1];
                    uint br = s2[x2];
-                    int distx = (fx & 0x0000ffff) >> 12;
+                    int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
                    *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
                    fx += fdx;
                    ++b;
@ -2362,6 +2365,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                    const __m128i v_256 = _mm_set1_epi16(256);
                    const __m128i v_fdx = _mm_set1_epi32(fdx*4);
                    const __m128i v_fdy = _mm_set1_epi32(fdy*4);
+                    const __m128i v_fxy_r = _mm_set1_epi32(0x8);
                    __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
                    __m128i v_fy = _mm_setr_epi32(fy, fy + fdy, fy + fdy + fdy, fy + fdy + fdy + fdy);

@ -2396,6 +2400,8 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c

                        __m128i v_distx = _mm_srli_epi16(v_fx, 12);
                        __m128i v_disty = _mm_srli_epi16(v_fy, 12);
+                        v_distx = _mm_srli_epi16(_mm_add_epi32(v_fx, v_fxy_r), 4);
+                        v_disty = _mm_srli_epi16(_mm_add_epi32(v_fy, v_fxy_r), 4);
                        v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
                        v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
                        v_disty = _mm_shufflehi_epi16(v_disty, _MM_SHUFFLE(2,2,0,0));
@ -2434,8 +2440,8 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                    int disty = (fy & 0x0000ffff) >> 8;
                    *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
 #else
-                    int distx = (fx & 0x0000ffff) >> 12;
-                    int disty = (fy & 0x0000ffff) >> 12;
+                    int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
+                    int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
                    *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
 #endif

@ -2664,13 +2670,13 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
                            fracX += fdx;
                        }
                    } else { //scale down
-                        int disty = (fy & 0x0000ffff) >> 12;
+                        int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
                        for (int i = 0; i < len; ++i) {
                            uint tl = buf1[i * 2 + 0];
                            uint tr = buf1[i * 2 + 1];
                            uint bl = buf2[i * 2 + 0];
                            uint br = buf2[i * 2 + 1];
-                            int distx = (fracX & 0x0000ffff) >> 12;
+                            int distx = ((fracX & 0x0000ffff) + 0x0800) >> 12;
                            b[i] = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
                            fracX += fdx;
                        }
@ -2736,8 +2742,8 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
                        uint bl = buf2[i * 2 + 0];
                        uint br = buf2[i * 2 + 1];

-                        int distx = (fracX & 0x0000ffff) >> 12;
-                        int disty = (fracY & 0x0000ffff) >> 12;
+                        int distx = ((fracX & 0x0000ffff) + 0x0800) >> 12;
+                        int disty = ((fracY & 0x0000ffff) + 0x0800) >> 12;

                        b[i] = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
                        fracX += fdx;
--- a/tests/auto/gui/image/qimage/tst_qimage.cpp
+++ b/tests/auto/gui/image/qimage/tst_qimage.cpp
@ -106,6 +106,7 @@ private slots:
    void smoothScale();
    void smoothScale2_data();
    void smoothScale2();
+    void smoothScale3_data();
    void smoothScale3();
    void smoothScale4();

@ -1715,9 +1716,12 @@ static inline int rand8()
    return int(256. * (qrand() / (RAND_MAX + 1.0)));
 }

-// compares img.scale against the bilinear filtering used by QPainter
-void tst_QImage::smoothScale3()
+void tst_QImage::smoothScale3_data()
 {
+    QTest::addColumn<QImage>("img");
+    QTest::addColumn<qreal>("scale_x");
+    QTest::addColumn<qreal>("scale_y");
+
    QImage img(128, 128, QImage::Format_RGB32);
    for (int y = 0; y < img.height(); ++y) {
        for (int x = 0; x < img.width(); ++x) {
@ -1730,36 +1734,49 @@ void tst_QImage::smoothScale3()
        }
    }

-    qreal scales[2] = { .5, 2 };
+    QTest::newRow("(0.5, 0.5)") << img << qreal(0.5) << qreal(0.5);
+    QTest::newRow("(0.5, 1.0)") << img << qreal(0.5) << qreal(1.0);
+    QTest::newRow("(1.0, 0.5)") << img << qreal(1.0) << qreal(0.5);
+    QTest::newRow("(0.5, 2.0)") << img << qreal(0.5) << qreal(2.0);
+    QTest::newRow("(1.0, 2.0)") << img << qreal(1.0) << qreal(2.0);
+    QTest::newRow("(2.0, 0.5)") << img << qreal(2.0) << qreal(0.5);
+    QTest::newRow("(2.0, 1.0)") << img << qreal(2.0) << qreal(1.0);
+    QTest::newRow("(2.0, 2.0)") << img << qreal(2) << qreal(2);
+}
+// compares img.scale against the bilinear filtering used by QPainter
+void tst_QImage::smoothScale3()
+{
+    QFETCH(QImage, img);
+    QFETCH(qreal, scale_x);
+    QFETCH(qreal, scale_y);

-    for (int i = 0; i < 2; ++i) {
-        QImage a = img.scaled(img.size() * scales[i], Qt::IgnoreAspectRatio, Qt::SmoothTransformation);
-        QImage b(a.size(), a.format());
-        b.fill(0x0);
+    QImage a = img.scaled(img.width() * scale_x, img.height() * scale_y, Qt::IgnoreAspectRatio, Qt::SmoothTransformation);
+    QImage b(a.size(), a.format());
+    b.fill(0x0);

-        QPainter p(&b);
-        p.setRenderHint(QPainter::SmoothPixmapTransform);
-        p.scale(scales[i], scales[i]);
-        p.drawImage(0, 0, img);
-        p.end();
-        int err = 0;
+    QPainter p(&b);
+    p.setRenderHint(QPainter::SmoothPixmapTransform);
+    p.scale(scale_x, scale_y);
+    p.drawImage(0, 0, img);
+    p.end();
+    int err = 0;

-        for (int y = 0; y < a.height(); ++y) {
-            for (int x = 0; x < a.width(); ++x) {
-                QRgb ca = a.pixel(x, y);
-                QRgb cb = b.pixel(x, y);
+    for (int y = 0; y < a.height(); ++y) {
+        for (int x = 0; x < a.width(); ++x) {
+            QRgb ca = a.pixel(x, y);
+            QRgb cb = b.pixel(x, y);

-                // tolerate a little bit of rounding errors
-                bool r = true;
-                r &= qAbs(qRed(ca) - qRed(cb)) <= 18;
-                r &= qAbs(qGreen(ca) - qGreen(cb)) <= 18;
-                r &= qAbs(qBlue(ca) - qBlue(cb)) <= 18;
-                if (!r)
-                    err++;
-            }
+            // tolerate a little bit of rounding errors
+            int tolerance = 3;
+            bool r = true;
+            r &= qAbs(qRed(ca) - qRed(cb)) <= tolerance;
+            r &= qAbs(qGreen(ca) - qGreen(cb)) <= tolerance;
+            r &= qAbs(qBlue(ca) - qBlue(cb)) <= tolerance;
+            if (!r)
+                err++;
        }
-        QCOMPARE(err, 0);
    }
+    QCOMPARE(err, 0);
 }

 // Tests smooth upscale is smooth