Split fetchTransformedBilinear and fetchTransformedBilinear64

Split out basic fetching to share it between the two. Change-Id: I6c27a7cea3a5c10b511232edc68bd32490514a27 Reviewed-by: Eirik Aavitsland <eirik.aavitsland@qt.io>
2016-11-29 12:20:04 +01:00 · 2016-11-29 12:20:04 +01:00 · 715fd425ef
commit 715fd425ef
parent f1a23a5467
1 changed files with 290 additions and 357 deletions
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@ -2657,6 +2657,241 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
    return buffer;
 }

+template<TextureBlendType blendType, QPixelLayout::BPP bpp>
+static void QT_FASTCALL fetchTransformedBilinear_simple_upscale_helper(uint *b, uint *end, const QTextureData &image,
+                                                                       int &fx, int &fy, int fdx, int /*fdy*/)
+{
+    const QPixelLayout *layout = &qPixelLayouts[image.format];
+    const QVector<QRgb> *clut = image.colorTable;
+    Q_ASSERT(bpp == QPixelLayout::BPPNone || bpp == layout->bpp);
+    // When templated 'fetch' should be inlined at compile time:
+    const FetchPixelsFunc fetch = (bpp == QPixelLayout::BPPNone) ? qFetchPixels[layout->bpp] : fetchPixels<bpp>;
+    const ConvertFunc convertToARGB32PM = layout->convertToARGB32PM;
+
+    int y1 = (fy >> 16);
+    int y2;
+    fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
+    const uchar *s1 = image.scanLine(y1);
+    const uchar *s2 = image.scanLine(y2);
+
+    int disty = (fy & 0x0000ffff) >> 8;
+    int idisty = 256 - disty;
+    int x = fx >> 16;
+    int length = end - b;
+
+    // The idea is first to do the interpolation between the row s1 and the row s2
+    // into an intermediate buffer, then we interpolate between two pixel of this buffer.
+    // +1 for the last pixel to interpolate with, and +1 for rounding errors.
+    uint buf1[buffer_size + 2];
+    uint buf2[buffer_size + 2];
+    const uint *ptr1;
+    const uint *ptr2;
+
+    int count = (qint64(length) * fdx + fixed_scale - 1) / fixed_scale + 2;
+    Q_ASSERT(count <= buffer_size + 2); //length is supposed to be <= buffer_size and data->m11 < 1 in this case
+
+    if (blendType == BlendTransformedBilinearTiled) {
+        x %= image.width;
+        if (x < 0)
+            x += image.width;
+        int len1 = qMin(count, image.width - x);
+        int len2 = qMin(x, count - len1);
+
+        ptr1 = fetch(buf1, s1, x, len1);
+        ptr1 = convertToARGB32PM(buf1, ptr1, len1, clut, 0);
+        ptr2 = fetch(buf2, s2, x, len1);
+        ptr2 = convertToARGB32PM(buf2, ptr2, len1, clut, 0);
+        for (int i = 0; i < len1; ++i) {
+            uint t = ptr1[i];
+            uint b = ptr2[i];
+            buf1[i] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
+            buf2[i] = ((((t >> 8) & 0xff00ff) * idisty + ((b >> 8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
+        }
+
+        if (len2) {
+            ptr1 = fetch(buf1 + len1, s1, 0, len2);
+            ptr1 = convertToARGB32PM(buf1 + len1, ptr1, len2, clut, 0);
+            ptr2 = fetch(buf2 + len1, s2, 0, len2);
+            ptr2 = convertToARGB32PM(buf2 + len1, ptr2, len2, clut, 0);
+            for (int i = 0; i < len2; ++i) {
+                uint t = ptr1[i];
+                uint b = ptr2[i];
+                buf1[i + len1] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
+                buf2[i + len1] = ((((t >> 8) & 0xff00ff) * idisty + ((b >> 8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
+            }
+        }
+        for (int i = image.width; i < count; ++i) {
+            buf1[i] = buf1[i - image.width];
+            buf2[i] = buf2[i - image.width];
+        }
+    } else {
+        int start = qMax(x, image.x1);
+        int end = qMin(x + count, image.x2);
+        int len = qMax(1, end - start);
+        int leading = start - x;
+
+        ptr1 = fetch(buf1 + leading, s1, start, len);
+        ptr1 = convertToARGB32PM(buf1 + leading, ptr1, len, clut, 0);
+        ptr2 = fetch(buf2 + leading, s2, start, len);
+        ptr2 = convertToARGB32PM(buf2 + leading, ptr2, len, clut, 0);
+
+        for (int i = 0; i < len; ++i) {
+            uint t = ptr1[i];
+            uint b = ptr2[i];
+            buf1[i + leading] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
+            buf2[i + leading] = ((((t >> 8) & 0xff00ff) * idisty + ((b >> 8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
+        }
+
+        for (int i = 0; i < leading; ++i) {
+            buf1[i] = buf1[leading];
+            buf2[i] = buf2[leading];
+        }
+        for (int i = leading + len; i < count; ++i) {
+            buf1[i] = buf1[i - 1];
+            buf2[i] = buf2[i - 1];
+        }
+    }
+
+    // Now interpolate the values from the intermediate_buffer to get the final result.
+    fx &= fixed_scale - 1;
+    Q_ASSERT((fx >> 16) == 0);
+    while (b < end) {
+        int x1 = (fx >> 16);
+        int x2 = x1 + 1;
+        Q_ASSERT(x1 >= 0);
+        Q_ASSERT(x2 < count);
+
+        int distx = (fx & 0x0000ffff) >> 8;
+        int idistx = 256 - distx;
+        int rb = ((buf1[x1] * idistx + buf1[x2] * distx) >> 8) & 0xff00ff;
+        int ag = (buf2[x1] * idistx + buf2[x2] * distx) & 0xff00ff00;
+        *b++ = rb | ag;
+        fx += fdx;
+    }
+}
+
+
+typedef void (QT_FASTCALL *BilinearFastTransformFetcher)(uint *buf1, uint *buf2, const int len, const QTextureData &image,
+                                                         int fx, int fy, const int fdx, const int fdy);
+
+template<TextureBlendType blendType, QPixelLayout::BPP bpp>
+static void QT_FASTCALL fetchTransformedBilinear_fetcher(uint *buf1, uint *buf2, const int len, const QTextureData &image,
+                                                         int fx, int fy, const int fdx, const int fdy)
+{
+    const QPixelLayout &layout = qPixelLayouts[image.format];
+    Q_ASSERT(bpp == QPixelLayout::BPPNone || bpp == layout.bpp);
+    // When templated 'fetch1' should be inlined at compile time:
+    const FetchPixelFunc fetch1 = (bpp == QPixelLayout::BPPNone) ? qFetchPixel[layout.bpp] : fetchPixel<bpp>;
+    if (fdy == 0) {
+        int y1 = (fy >> 16);
+        int y2;
+        fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
+        const uchar *s1 = image.scanLine(y1);
+        const uchar *s2 = image.scanLine(y2);
+
+        int i = 0;
+        if (blendType == BlendTransformedBilinear) {
+            for (; i < len; ++i) {
+                int x1 = (fx >> 16);
+                int x2;
+                fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+                if (x1 != x2)
+                    break;
+                buf1[i * 2 + 0] = buf1[i * 2 + 1] = fetch1(s1, x1);
+                buf2[i * 2 + 0] = buf2[i * 2 + 1] = fetch1(s2, x1);
+                fx += fdx;
+            }
+            int fastLen = len;
+            if (fdx > 0)
+                fastLen = qMin(fastLen, int((qint64(image.x2 - 1) * fixed_scale - fx) / fdx));
+            else if (fdx < 0)
+                fastLen = qMin(fastLen, int((qint64(image.x1) * fixed_scale - fx) / fdx));
+
+            for (; i < fastLen; ++i) {
+                int x = (fx >> 16);
+                buf1[i * 2 + 0] = fetch1(s1, x);
+                buf1[i * 2 + 1] = fetch1(s1, x + 1);
+                buf2[i * 2 + 0] = fetch1(s2, x);
+                buf2[i * 2 + 1] = fetch1(s2, x + 1);
+                fx += fdx;
+            }
+        }
+
+        for (; i < len; ++i) {
+            int x1 = (fx >> 16);
+            int x2;
+            fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+            buf1[i * 2 + 0] = fetch1(s1, x1);
+            buf1[i * 2 + 1] = fetch1(s1, x2);
+            buf2[i * 2 + 0] = fetch1(s2, x1);
+            buf2[i * 2 + 1] = fetch1(s2, x2);
+            fx += fdx;
+        }
+    } else {
+        int i = 0;
+        if (blendType == BlendTransformedBilinear) {
+            for (; i < len; ++i) {
+                int x1 = (fx >> 16);
+                int x2;
+                int y1 = (fy >> 16);
+                int y2;
+                fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+                fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
+                if (x1 != x2 && y1 != y2)
+                    break;
+                const uchar *s1 = image.scanLine(y1);
+                const uchar *s2 = image.scanLine(y2);
+                buf1[i * 2 + 0] = fetch1(s1, x1);
+                buf1[i * 2 + 1] = fetch1(s1, x2);
+                buf2[i * 2 + 0] = fetch1(s2, x1);
+                buf2[i * 2 + 1] = fetch1(s2, x2);
+                fx += fdx;
+                fy += fdy;
+            }
+            int fastLen = len;
+            if (fdx > 0)
+                fastLen = qMin(fastLen, int((qint64(image.x2 - 1) * fixed_scale - fx) / fdx));
+            else if (fdx < 0)
+                fastLen = qMin(fastLen, int((qint64(image.x1) * fixed_scale - fx) / fdx));
+            if (fdy > 0)
+                fastLen = qMin(fastLen, int((qint64(image.y2 - 1) * fixed_scale - fy) / fdy));
+            else if (fdy < 0)
+                fastLen = qMin(fastLen, int((qint64(image.y1) * fixed_scale - fy) / fdy));
+
+            for (; i < fastLen; ++i) {
+                int x = (fx >> 16);
+                int y = (fy >> 16);
+                const uchar *s1 = image.scanLine(y);
+                const uchar *s2 = s1 + image.bytesPerLine;
+                buf1[i * 2 + 0] = fetch1(s1, x);
+                buf1[i * 2 + 1] = fetch1(s1, x + 1);
+                buf2[i * 2 + 0] = fetch1(s2, x);
+                buf2[i * 2 + 1] = fetch1(s2, x + 1);
+                fx += fdx;
+                fy += fdy;
+            }
+        }
+
+        for (; i < len; ++i) {
+            int x1 = (fx >> 16);
+            int x2;
+            int y1 = (fy >> 16);
+            int y2;
+            fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+            fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
+
+            const uchar *s1 = image.scanLine(y1);
+            const uchar *s2 = image.scanLine(y2);
+            buf1[i * 2 + 0] = fetch1(s1, x1);
+            buf1[i * 2 + 1] = fetch1(s1, x2);
+            buf2[i * 2 + 0] = fetch1(s2, x1);
+            buf2[i * 2 + 1] = fetch1(s2, x2);
+            fx += fdx;
+            fy += fdy;
+        }
+    }
+}
+
 // blendType = BlendTransformedBilinear or BlendTransformedBilinearTiled
 template<TextureBlendType blendType, QPixelLayout::BPP bpp>
 static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Operator *,
@ -2664,19 +2899,7 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
 {
    const QPixelLayout *layout = &qPixelLayouts[data->texture.format];
    const QVector<QRgb> *clut = data->texture.colorTable;
-    if (bpp != QPixelLayout::BPPNone) // Like this to not ICE on GCC 5.3.1
-        Q_ASSERT(layout->bpp == bpp);
-    // When templated 'fetch' should be inlined at compile time:
-    const FetchPixelsFunc fetch = (bpp == QPixelLayout::BPPNone) ? qFetchPixels[layout->bpp] : FetchPixelsFunc(fetchPixels<bpp>);
-    const FetchPixelFunc fetch1 = (bpp == QPixelLayout::BPPNone) ? qFetchPixel[layout->bpp] : FetchPixelFunc(fetchPixel<bpp>);
-
-    int image_width = data->texture.width;
-    int image_height = data->texture.height;
-
-    int image_x1 = data->texture.x1;
-    int image_y1 = data->texture.y1;
-    int image_x2 = data->texture.x2 - 1;
-    int image_y2 = data->texture.y2 - 1;
+    Q_ASSERT(bpp == QPixelLayout::BPPNone || layout->bpp == bpp);

    const qreal cx = x + qreal(0.5);
    const qreal cy = y + qreal(0.5);
@ -2692,203 +2915,80 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
        fx -= half_point;
        fy -= half_point;

-        if (fdy == 0) { //simple scale, no rotation
-            int y1 = (fy >> 16);
-            int y2;
-            fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
-            const uchar *s1 = data->texture.scanLine(y1);
-            const uchar *s2 = data->texture.scanLine(y2);
-
+        if (fdy == 0) { // simple scale, no rotation or shear
            if (fdx <= fixed_scale && fdx > 0) { // scale up on X
-                int disty = (fy & 0x0000ffff) >> 8;
-                int idisty = 256 - disty;
-                int x = fx >> 16;
-
-                // The idea is first to do the interpolation between the row s1 and the row s2
-                // into an intermediate buffer, then we interpolate between two pixel of this buffer.
-                // +1 for the last pixel to interpolate with, and +1 for rounding errors.
-                uint buf1[buffer_size + 2];
-                uint buf2[buffer_size + 2];
-                const uint *ptr1;
-                const uint *ptr2;
-
-                int count = (qint64(length) * fdx + fixed_scale - 1) / fixed_scale + 2;
-                Q_ASSERT(count <= buffer_size + 2); //length is supposed to be <= buffer_size and data->m11 < 1 in this case
-
-                if (blendType == BlendTransformedBilinearTiled) {
-                    x %= image_width;
-                    if (x < 0)
-                        x += image_width;
-                    int len1 = qMin(count, image_width - x);
-                    int len2 = qMin(x, count - len1);
-
-                    ptr1 = fetch(buf1, s1, x, len1);
-                    ptr1 = layout->convertToARGB32PM(buf1, ptr1, len1, clut, 0);
-                    ptr2 = fetch(buf2, s2, x, len1);
-                    ptr2 = layout->convertToARGB32PM(buf2, ptr2, len1, clut, 0);
-                    for (int i = 0; i < len1; ++i) {
-                        uint t = ptr1[i];
-                        uint b = ptr2[i];
-                        buf1[i] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
-                        buf2[i] = ((((t >> 8) & 0xff00ff) * idisty + ((b >> 8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
-                    }
-
-                    if (len2) {
-                        ptr1 = fetch(buf1 + len1, s1, 0, len2);
-                        ptr1 = layout->convertToARGB32PM(buf1 + len1, ptr1, len2, clut, 0);
-                        ptr2 = fetch(buf2 + len1, s2, 0, len2);
-                        ptr2 = layout->convertToARGB32PM(buf2 + len1, ptr2, len2, clut, 0);
-                        for (int i = 0; i < len2; ++i) {
-                            uint t = ptr1[i];
-                            uint b = ptr2[i];
-                            buf1[i + len1] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
-                            buf2[i + len1] = ((((t >> 8) & 0xff00ff) * idisty + ((b >> 8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
-                        }
-                    }
-                    for (int i = image_width; i < count; ++i) {
-                        buf1[i] = buf1[i - image_width];
-                        buf2[i] = buf2[i - image_width];
-                    }
-                } else {
-                    int start = qMax(x, image_x1);
-                    int end = qMin(x + count, image_x2 + 1);
-                    int len = qMax(1, end - start);
-                    int leading = start - x;
-
-                    ptr1 = fetch(buf1 + leading, s1, start, len);
-                    ptr1 = layout->convertToARGB32PM(buf1 + leading, ptr1, len, clut, 0);
-                    ptr2 = fetch(buf2 + leading, s2, start, len);
-                    ptr2 = layout->convertToARGB32PM(buf2 + leading, ptr2, len, clut, 0);
-
-                    for (int i = 0; i < len; ++i) {
-                        uint t = ptr1[i];
-                        uint b = ptr2[i];
-                        buf1[i + leading] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
-                        buf2[i + leading] = ((((t >> 8) & 0xff00ff) * idisty + ((b >> 8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
-                    }
-
-                    for (int i = 0; i < leading; ++i) {
-                        buf1[i] = buf1[leading];
-                        buf2[i] = buf2[leading];
-                    }
-                    for (int i = leading + len; i < count; ++i) {
-                        buf1[i] = buf1[i - 1];
-                        buf2[i] = buf2[i - 1];
-                    }
-                }
-
-                // Now interpolate the values from the intermediate_buffer to get the final result.
-                fx &= fixed_scale - 1;
-                Q_ASSERT((fx >> 16) == 0);
-                for (int i = 0; i < length; ++i) {
-                    int x1 = (fx >> 16);
-                    int x2 = x1 + 1;
-                    Q_ASSERT(x1 >= 0);
-                    Q_ASSERT(x2 < count);
-
-                    int distx = (fx & 0x0000ffff) >> 8;
-                    int idistx = 256 - distx;
-                    int rb = ((buf1[x1] * idistx + buf1[x2] * distx) >> 8) & 0xff00ff;
-                    int ag = (buf2[x1] * idistx + buf2[x2] * distx) & 0xff00ff00;
-                    buffer[i] = rb | ag;
-                    fx += fdx;
-                }
+                fetchTransformedBilinear_simple_upscale_helper<blendType, bpp>(buffer, buffer + length, data->texture, fx, fy, fdx, fdy);
            } else {
+                const BilinearFastTransformFetcher fetcher = fetchTransformedBilinear_fetcher<blendType,bpp>;
+
                uint buf1[buffer_size];
                uint buf2[buffer_size];
                uint *b = buffer;
                while (length) {
                    int len = qMin(length, buffer_size / 2);
-                    int fracX = fx;
-                    for (int i = 0; i < len; ++i) {
-                        int x1 = (fx >> 16);
-                        int x2;
-                        fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
-                        buf1[i * 2 + 0] = fetch1(s1, x1);
-                        buf1[i * 2 + 1] = fetch1(s1, x2);
-                        buf2[i * 2 + 0] = fetch1(s2, x1);
-                        buf2[i * 2 + 1] = fetch1(s2, x2);
-                        fx += fdx;
-                    }
+                    fetcher(buf1, buf2, len, data->texture, fx, fy, fdx, 0);
                    layout->convertToARGB32PM(buf1, buf1, len * 2, clut, 0);
                    layout->convertToARGB32PM(buf2, buf2, len * 2, clut, 0);

                    if ((fdx < 0 && fdx > -(fixed_scale / 8)) || qAbs(data->m22) < qreal(1./8.)) { // scale up more than 8x
                        int disty = (fy & 0x0000ffff) >> 8;
                        for (int i = 0; i < len; ++i) {
-                            int distx = (fracX & 0x0000ffff) >> 8;
+                            int distx = (fx & 0x0000ffff) >> 8;
                            b[i] = interpolate_4_pixels(buf1 + i * 2, buf2 + i * 2, distx, disty);
-                            fracX += fdx;
+                            fx += fdx;
                        }
-                    } else { //scale down
+                    } else {
                        int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
                        for (int i = 0; i < len; ++i) {
                            uint tl = buf1[i * 2 + 0];
                            uint tr = buf1[i * 2 + 1];
                            uint bl = buf2[i * 2 + 0];
                            uint br = buf2[i * 2 + 1];
-                            int distx = ((fracX & 0x0000ffff) + 0x0800) >> 12;
+                            int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
                            b[i] = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
-                            fracX += fdx;
+                            fx += fdx;
                        }
                    }
                    length -= len;
                    b += len;
                }
            }
-        } else { //rotation
+        } else { // rotation or shear
+            const BilinearFastTransformFetcher fetcher = fetchTransformedBilinear_fetcher<blendType,bpp>;
+
            uint buf1[buffer_size];
            uint buf2[buffer_size];
            uint *b = buffer;
-
            while (length) {
                int len = qMin(length, buffer_size / 2);
-                int fracX = fx;
-                int fracY = fy;
-                for (int i = 0; i < len; ++i) {
-                    int x1 = (fx >> 16);
-                    int x2;
-                    int y1 = (fy >> 16);
-                    int y2;
-                    fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
-                    fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
-
-                    const uchar *s1 = data->texture.scanLine(y1);
-                    const uchar *s2 = data->texture.scanLine(y2);
-                    buf1[i * 2 + 0] = fetch1(s1, x1);
-                    buf1[i * 2 + 1] = fetch1(s1, x2);
-                    buf2[i * 2 + 0] = fetch1(s2, x1);
-                    buf2[i * 2 + 1] = fetch1(s2, x2);
-                    fx += fdx;
-                    fy += fdy;
-                }
+                fetcher(buf1, buf2, len, data->texture, fx, fy, fdx, fdy);
                layout->convertToARGB32PM(buf1, buf1, len * 2, clut, 0);
                layout->convertToARGB32PM(buf2, buf2, len * 2, clut, 0);

-                if (qAbs(data->m11) < qreal(1./8.) || qAbs(data->m22) < qreal(1./8.) ) {
-                    //if we are zooming more than 8 times, we use 8bit precision for the position.
+                if (qAbs(data->m11) < qreal(1./8.)|| qAbs(data->m22) < qreal(1./8.)) {
+                    // If we are zooming more than 8 times, we use 8bit precision for the position.
                    for (int i = 0; i < len; ++i) {
-                        int distx = (fracX & 0x0000ffff) >> 8;
-                        int disty = (fracY & 0x0000ffff) >> 8;
+                        int distx = (fx & 0x0000ffff) >> 8;
+                        int disty = (fy & 0x0000ffff) >> 8;

                        b[i] = interpolate_4_pixels(buf1 + i * 2, buf2 + i * 2, distx, disty);
-                        fracX += fdx;
-                        fracY += fdy;
+                        fx += fdx;
+                        fy += fdy;
                    }
                } else {
-                    //we are zooming less than 8x, use 4bit precision
+                    // We are zooming less than 8x, use 4bit precision
                    for (int i = 0; i < len; ++i) {
                        uint tl = buf1[i * 2 + 0];
                        uint tr = buf1[i * 2 + 1];
                        uint bl = buf2[i * 2 + 0];
                        uint br = buf2[i * 2 + 1];

-                        int distx = ((fracX & 0x0000ffff) + 0x0800) >> 12;
-                        int disty = ((fracY & 0x0000ffff) + 0x0800) >> 12;
+                        int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
+                        int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;

                        b[i] = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
-                        fracX += fdx;
-                        fracY += fdy;
+                        fx += fdx;
+                        fy += fdy;
                    }
                }

@ -2897,6 +2997,11 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
            }
        }
    } else {
+        // When templated 'fetch' should be inlined at compile time:
+        const FetchPixelFunc fetch1 = (bpp == QPixelLayout::BPPNone) ? qFetchPixel[layout->bpp] : fetchPixel<bpp>;
+
+        const QTextureData &image = data->texture;
+
        const qreal fdx = data->m11;
        const qreal fdy = data->m12;
        const qreal fdw = data->m13;
@ -2927,8 +3032,8 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
                distxs[i] = int((px - x1) * 256);
                distys[i] = int((py - y1) * 256);

-                fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
-                fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
+                fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+                fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);

                const uchar *s1 = data->texture.scanLine(y1);
                const uchar *s2 = data->texture.scanLine(y2);
@ -2969,21 +3074,9 @@ static const QRgba64 *QT_FASTCALL fetchTransformedBilinear64(QRgba64 *buffer, co
    const QPixelLayout *layout = &qPixelLayouts[data->texture.format];
    const QVector<QRgb> *clut = data->texture.colorTable;

-    int image_width = data->texture.width;
-    int image_height = data->texture.height;
-
-    int image_x1 = data->texture.x1;
-    int image_y1 = data->texture.y1;
-    int image_x2 = data->texture.x2 - 1;
-    int image_y2 = data->texture.y2 - 1;
-
    const qreal cx = x + qreal(0.5);
    const qreal cy = y + qreal(0.5);

-    const qreal fdx = data->m11;
-    const qreal fdy = data->m12;
-    const qreal fdw = data->m13;
-
    if (data->fast_matrix) {
        // The increment pr x in the scanline
        int fdx = (int)(data->m11 * fixed_scale);
@ -2995,14 +3088,13 @@ static const QRgba64 *QT_FASTCALL fetchTransformedBilinear64(QRgba64 *buffer, co
        fx -= half_point;
        fy -= half_point;

-        if (fdy == 0) { //simple scale, no rotation
-            int y1 = (fy >> 16);
-            int y2;
-            fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
-            const uchar *s1 = data->texture.scanLine(y1);
-            const uchar *s2 = data->texture.scanLine(y2);
+        const BilinearFastTransformFetcher fetcher =
+                (layout->bpp == QPixelLayout::BPP32)
+                        ? fetchTransformedBilinear_fetcher<blendType, QPixelLayout::BPP32>
+                        : fetchTransformedBilinear_fetcher<blendType, QPixelLayout::BPPNone>;
+
+        if (fdy == 0) { //simple scale, no rotation

-            FetchPixelFunc fetch = qFetchPixel[layout->bpp];
            uint sbuf1[buffer_size];
            uint sbuf2[buffer_size];
            quint64 buf1[buffer_size];
@ -3010,84 +3102,19 @@ static const QRgba64 *QT_FASTCALL fetchTransformedBilinear64(QRgba64 *buffer, co
            QRgba64 *b = buffer;
            while (length) {
                int len = qMin(length, buffer_size / 2);
-                int fracX = fx;
-                int i = 0;
                int disty = (fy & 0x0000ffff);
 #if defined(__SSE2__)
                const __m128i vdy = _mm_set1_epi16(disty);
                const __m128i vidy = _mm_set1_epi16(0x10000 - disty);
-                if (blendType != BlendTransformedBilinearTiled && layout->bpp == QPixelLayout::BPP32) {
-                    for (; i < len; ++i) {
-                        int x1 = (fx >> 16);
-                        int x2;
-                        fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
-                        if (x1 != x2)
-                            break;
-                        sbuf1[i * 2 + 0] = sbuf1[i * 2 + 1] = ((const uint*)s1)[x1];
-                        sbuf2[i * 2 + 0] = sbuf2[i * 2 + 1] = ((const uint*)s2)[x1];
-                        fx += fdx;
-                    }
-                    int fastLen;
-                    if (fdx > 0)
-                        fastLen = qMin(len, int((image_x2 - (fx >> 16)) / data->m11));
-                    else
-                        fastLen = qMin(len, int((image_x1 - (fx >> 16)) / data->m11));
-                    fastLen -= 3;
-
-                    const __m128i v_fdx = _mm_set1_epi32(fdx*4);
-                    __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
-                    for (; i < fastLen; i += 4) {
-                        int offset = _mm_extract_epi16(v_fx, 1);
-                        sbuf1[i * 2 + 0] = ((const uint*)s1)[offset];
-                        sbuf1[i * 2 + 1] = ((const uint*)s1)[offset + 1];
-                        sbuf2[i * 2 + 0] = ((const uint*)s2)[offset];
-                        sbuf2[i * 2 + 1] = ((const uint*)s2)[offset + 1];
-                        offset = _mm_extract_epi16(v_fx, 3);
-                        sbuf1[i * 2 + 2] = ((const uint*)s1)[offset];
-                        sbuf1[i * 2 + 3] = ((const uint*)s1)[offset + 1];
-                        sbuf2[i * 2 + 2] = ((const uint*)s2)[offset];
-                        sbuf2[i * 2 + 3] = ((const uint*)s2)[offset + 1];
-                        offset = _mm_extract_epi16(v_fx, 5);
-                        sbuf1[i * 2 + 4] = ((const uint*)s1)[offset];
-                        sbuf1[i * 2 + 5] = ((const uint*)s1)[offset + 1];
-                        sbuf2[i * 2 + 4] = ((const uint*)s2)[offset];
-                        sbuf2[i * 2 + 5] = ((const uint*)s2)[offset + 1];
-                        offset = _mm_extract_epi16(v_fx, 7);
-                        sbuf1[i * 2 + 6] = ((const uint*)s1)[offset];
-                        sbuf1[i * 2 + 7] = ((const uint*)s1)[offset + 1];
-                        sbuf2[i * 2 + 6] = ((const uint*)s2)[offset];
-                        sbuf2[i * 2 + 7] = ((const uint*)s2)[offset + 1];
-                        v_fx = _mm_add_epi32(v_fx, v_fdx);
-                    }
-                    fx = _mm_cvtsi128_si32(v_fx);
-                }
 #endif
-                for (; i < len; ++i) {
-                    int x1 = (fx >> 16);
-                    int x2;
-                    fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
+                fetcher(sbuf1, sbuf2, len, data->texture, fx, fy, fdx, fdy);

-                    if (layout->bpp == QPixelLayout::BPP32) {
-                        sbuf1[i * 2 + 0] = ((const uint*)s1)[x1];
-                        sbuf1[i * 2 + 1] = ((const uint*)s1)[x2];
-                        sbuf2[i * 2 + 0] = ((const uint*)s2)[x1];
-                        sbuf2[i * 2 + 1] = ((const uint*)s2)[x2];
-
-                    } else {
-                        sbuf1[i * 2 + 0] = fetch(s1, x1);
-                        sbuf1[i * 2 + 1] = fetch(s1, x2);
-                        sbuf2[i * 2 + 0] = fetch(s2, x1);
-                        sbuf2[i * 2 + 1] = fetch(s2, x2);
-                    }
-
-                    fx += fdx;
-                }
                layout->convertToARGB64PM((QRgba64 *)buf1, sbuf1, len * 2, clut, 0);
                if (disty)
                    layout->convertToARGB64PM((QRgba64 *)buf2, sbuf2, len * 2, clut, 0);

                for (int i = 0; i < len; ++i) {
-                    int distx = (fracX & 0x0000ffff);
+                    int distx = (fx & 0x0000ffff);
 #if defined(__SSE2__)
                    const __m128i vdistx = _mm_shufflelo_epi16(_mm_cvtsi32_si128(distx), _MM_SHUFFLE(0, 0, 0, 0));
                    const __m128i vidistx = _mm_shufflelo_epi16(_mm_cvtsi32_si128(0x10000 - distx), _MM_SHUFFLE(0, 0, 0, 0));
@ -3104,13 +3131,12 @@ static const QRgba64 *QT_FASTCALL fetchTransformedBilinear64(QRgba64 *buffer, co
 #else
                    b[i] = interpolate_4_pixels_rgb64((QRgba64 *)buf1 + i*2, (QRgba64 *)buf2 + i*2, distx, disty);
 #endif
-                    fracX += fdx;
+                    fx += fdx;
                }
                length -= len;
                b += len;
            }
        } else { //rotation
-            FetchPixelFunc fetch = qFetchPixel[layout->bpp];
            uint sbuf1[buffer_size];
            uint sbuf2[buffer_size];
            quint64 buf1[buffer_size];
@ -3120,117 +3146,18 @@ static const QRgba64 *QT_FASTCALL fetchTransformedBilinear64(QRgba64 *buffer, co

            while (b < end) {
                int len = qMin(length, buffer_size / 2);
-                int fracX = fx;
-                int fracY = fy;
-                int i = 0;
-#if defined(__SSE2__)
-                if (blendType != BlendTransformedBilinearTiled && layout->bpp == QPixelLayout::BPP32) {
-                    for (; i < len; ++i) {
-                        int x1 = (fx >> 16);
-                        int x2;
-                        int y1 = (fy >> 16);
-                        int y2;
-                        fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
-                        fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
-                        if (x1 != x2 && y1 != y2)
-                            break;
-                        const uchar *s1 = data->texture.scanLine(y1);
-                        const uchar *s2 = data->texture.scanLine(y2);
-                        sbuf1[i * 2 + 0] = ((const uint*)s1)[x1];
-                        sbuf1[i * 2 + 1] = ((const uint*)s1)[x2];
-                        sbuf2[i * 2 + 0] = ((const uint*)s2)[x1];
-                        sbuf2[i * 2 + 1] = ((const uint*)s2)[x2];
-                        fx += fdx;
-                        fy += fdy;
-                    }
-                    int fastLen = len;
-                    if (fdx > 0)
-                        fastLen = qMin(fastLen, int((qint64(image_x2) * fixed_scale - fx) / fdx));
-                    else if (fdx < 0)
-                        fastLen = qMin(fastLen, int((qint64(image_x1) * fixed_scale - fx) / fdx));
-                    if (fdy > 0)
-                        fastLen = qMin(fastLen, int((qint64(image_y2) * fixed_scale - fy) / fdy));
-                    else if (fdy < 0)
-                        fastLen = qMin(fastLen, int((qint64(image_y1) * fixed_scale - fy) / fdy));
-                    fastLen -= 3;

-                    const __m128i v_fdx = _mm_set1_epi32(fdx*4);
-                    const __m128i v_fdy = _mm_set1_epi32(fdy*4);
-                    __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
-                    __m128i v_fy = _mm_setr_epi32(fy, fy + fdy, fy + fdy + fdy, fy + fdy + fdy + fdy);
-                    const int bytesPerLine = data->texture.bytesPerLine;
-                    const uchar *s1 = data->texture.imageData;
-                    const uchar *s2 = s1 + bytesPerLine;
-                    const __m128i vbpl = _mm_shufflelo_epi16(_mm_cvtsi32_si128(bytesPerLine/4), _MM_SHUFFLE(0, 0, 0, 0));
-                    for (; i < fastLen; i += 4) {
-                        const __m128i vy = _mm_packs_epi32(_mm_srai_epi32(v_fy, 16), _mm_setzero_si128());
-                        __m128i voffset = _mm_unpacklo_epi16(_mm_mullo_epi16(vy, vbpl), _mm_mulhi_epu16(vy, vbpl));
-                        voffset = _mm_add_epi32(voffset, _mm_srli_epi32(v_fx, 16));
+                fetcher(sbuf1, sbuf2, len, data->texture, fx, fy, fdx, fdy);

-                        int offset = _mm_cvtsi128_si32(voffset); voffset = _mm_srli_si128(voffset, 4);
-                        sbuf1[i * 2 + 0] = ((const uint*)s1)[offset];
-                        sbuf1[i * 2 + 1] = ((const uint*)s1)[offset + 1];
-                        sbuf2[i * 2 + 0] = ((const uint*)s2)[offset];
-                        sbuf2[i * 2 + 1] = ((const uint*)s2)[offset + 1];
-                        offset = _mm_cvtsi128_si32(voffset); voffset = _mm_srli_si128(voffset, 4);
-                        sbuf1[i * 2 + 2] = ((const uint*)s1)[offset];
-                        sbuf1[i * 2 + 3] = ((const uint*)s1)[offset + 1];
-                        sbuf2[i * 2 + 2] = ((const uint*)s2)[offset];
-                        sbuf2[i * 2 + 3] = ((const uint*)s2)[offset + 1];
-                        offset = _mm_cvtsi128_si32(voffset); voffset = _mm_srli_si128(voffset, 4);
-                        sbuf1[i * 2 + 4] = ((const uint*)s1)[offset];
-                        sbuf1[i * 2 + 5] = ((const uint*)s1)[offset + 1];
-                        sbuf2[i * 2 + 4] = ((const uint*)s2)[offset];
-                        sbuf2[i * 2 + 5] = ((const uint*)s2)[offset + 1];
-                        offset = _mm_cvtsi128_si32(voffset);
-                        sbuf1[i * 2 + 6] = ((const uint*)s1)[offset];
-                        sbuf1[i * 2 + 7] = ((const uint*)s1)[offset + 1];
-                        sbuf2[i * 2 + 6] = ((const uint*)s2)[offset];
-                        sbuf2[i * 2 + 7] = ((const uint*)s2)[offset + 1];
-
-                        v_fx = _mm_add_epi32(v_fx, v_fdx);
-                        v_fy = _mm_add_epi32(v_fy, v_fdy);
-                    }
-                    fx = _mm_cvtsi128_si32(v_fx);
-                    fy = _mm_cvtsi128_si32(v_fy);
-                }
-#endif
-                for (; i < len; ++i) {
-                    int x1 = (fx >> 16);
-                    int x2;
-                    int y1 = (fy >> 16);
-                    int y2;
-                    fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
-                    fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
-
-                    const uchar *s1 = data->texture.scanLine(y1);
-                    const uchar *s2 = data->texture.scanLine(y2);
-
-                    if (layout->bpp == QPixelLayout::BPP32) {
-                        sbuf1[i * 2 + 0] = ((const uint*)s1)[x1];
-                        sbuf1[i * 2 + 1] = ((const uint*)s1)[x2];
-                        sbuf2[i * 2 + 0] = ((const uint*)s2)[x1];
-                        sbuf2[i * 2 + 1] = ((const uint*)s2)[x2];
-
-                    } else {
-                        sbuf1[i * 2 + 0] = fetch(s1, x1);
-                        sbuf1[i * 2 + 1] = fetch(s1, x2);
-                        sbuf2[i * 2 + 0] = fetch(s2, x1);
-                        sbuf2[i * 2 + 1] = fetch(s2, x2);
-                    }
-
-                    fx += fdx;
-                    fy += fdy;
-                }
                layout->convertToARGB64PM((QRgba64 *)buf1, sbuf1, len * 2, clut, 0);
                layout->convertToARGB64PM((QRgba64 *)buf2, sbuf2, len * 2, clut, 0);

                for (int i = 0; i < len; ++i) {
-                    int distx = (fracX & 0x0000ffff);
-                    int disty = (fracY & 0x0000ffff);
+                    int distx = (fx & 0x0000ffff);
+                    int disty = (fy & 0x0000ffff);
                    b[i] = interpolate_4_pixels_rgb64((QRgba64 *)buf1 + i*2, (QRgba64 *)buf2 + i*2, distx, disty);
-                    fracX += fdx;
-                    fracY += fdy;
+                    fx += fdx;
+                    fy += fdy;
                }

                length -= len;
@ -3238,6 +3165,12 @@ static const QRgba64 *QT_FASTCALL fetchTransformedBilinear64(QRgba64 *buffer, co
            }
        }
    } else {
+        const QTextureData &image = data->texture;
+
+        const qreal fdx = data->m11;
+        const qreal fdy = data->m12;
+        const qreal fdw = data->m13;
+
        qreal fx = data->m21 * cy + data->m11 * cx + data->dx;
        qreal fy = data->m22 * cy + data->m12 * cx + data->dy;
        qreal fw = data->m23 * cy + data->m13 * cx + data->m33;
@ -3267,8 +3200,8 @@ static const QRgba64 *QT_FASTCALL fetchTransformedBilinear64(QRgba64 *buffer, co
                distxs[i] = int((px - x1) * (1<<16));
                distys[i] = int((py - y1) * (1<<16));

-                fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
-                fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
+                fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+                fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);

                const uchar *s1 = data->texture.scanLine(y1);
                const uchar *s2 = data->texture.scanLine(y2);