Split fetchTransformedBilinear and fetchTransformedBilinear64

Split out basic fetching to share it between the two.

Change-Id: I6c27a7cea3a5c10b511232edc68bd32490514a27
Reviewed-by: Eirik Aavitsland <eirik.aavitsland@qt.io>
This commit is contained in:
Allan Sandfeld Jensen 2016-11-29 12:20:04 +01:00
parent f1a23a5467
commit 715fd425ef

View File

@ -2657,6 +2657,241 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
return buffer;
}
template<TextureBlendType blendType, QPixelLayout::BPP bpp>
static void QT_FASTCALL fetchTransformedBilinear_simple_upscale_helper(uint *b, uint *end, const QTextureData &image,
int &fx, int &fy, int fdx, int /*fdy*/)
{
const QPixelLayout *layout = &qPixelLayouts[image.format];
const QVector<QRgb> *clut = image.colorTable;
Q_ASSERT(bpp == QPixelLayout::BPPNone || bpp == layout->bpp);
// When templated 'fetch' should be inlined at compile time:
const FetchPixelsFunc fetch = (bpp == QPixelLayout::BPPNone) ? qFetchPixels[layout->bpp] : fetchPixels<bpp>;
const ConvertFunc convertToARGB32PM = layout->convertToARGB32PM;
int y1 = (fy >> 16);
int y2;
fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
const uchar *s1 = image.scanLine(y1);
const uchar *s2 = image.scanLine(y2);
int disty = (fy & 0x0000ffff) >> 8;
int idisty = 256 - disty;
int x = fx >> 16;
int length = end - b;
// The idea is first to do the interpolation between the row s1 and the row s2
// into an intermediate buffer, then we interpolate between two pixel of this buffer.
// +1 for the last pixel to interpolate with, and +1 for rounding errors.
uint buf1[buffer_size + 2];
uint buf2[buffer_size + 2];
const uint *ptr1;
const uint *ptr2;
int count = (qint64(length) * fdx + fixed_scale - 1) / fixed_scale + 2;
Q_ASSERT(count <= buffer_size + 2); //length is supposed to be <= buffer_size and data->m11 < 1 in this case
if (blendType == BlendTransformedBilinearTiled) {
x %= image.width;
if (x < 0)
x += image.width;
int len1 = qMin(count, image.width - x);
int len2 = qMin(x, count - len1);
ptr1 = fetch(buf1, s1, x, len1);
ptr1 = convertToARGB32PM(buf1, ptr1, len1, clut, 0);
ptr2 = fetch(buf2, s2, x, len1);
ptr2 = convertToARGB32PM(buf2, ptr2, len1, clut, 0);
for (int i = 0; i < len1; ++i) {
uint t = ptr1[i];
uint b = ptr2[i];
buf1[i] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
buf2[i] = ((((t >> 8) & 0xff00ff) * idisty + ((b >> 8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
}
if (len2) {
ptr1 = fetch(buf1 + len1, s1, 0, len2);
ptr1 = convertToARGB32PM(buf1 + len1, ptr1, len2, clut, 0);
ptr2 = fetch(buf2 + len1, s2, 0, len2);
ptr2 = convertToARGB32PM(buf2 + len1, ptr2, len2, clut, 0);
for (int i = 0; i < len2; ++i) {
uint t = ptr1[i];
uint b = ptr2[i];
buf1[i + len1] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
buf2[i + len1] = ((((t >> 8) & 0xff00ff) * idisty + ((b >> 8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
}
}
for (int i = image.width; i < count; ++i) {
buf1[i] = buf1[i - image.width];
buf2[i] = buf2[i - image.width];
}
} else {
int start = qMax(x, image.x1);
int end = qMin(x + count, image.x2);
int len = qMax(1, end - start);
int leading = start - x;
ptr1 = fetch(buf1 + leading, s1, start, len);
ptr1 = convertToARGB32PM(buf1 + leading, ptr1, len, clut, 0);
ptr2 = fetch(buf2 + leading, s2, start, len);
ptr2 = convertToARGB32PM(buf2 + leading, ptr2, len, clut, 0);
for (int i = 0; i < len; ++i) {
uint t = ptr1[i];
uint b = ptr2[i];
buf1[i + leading] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
buf2[i + leading] = ((((t >> 8) & 0xff00ff) * idisty + ((b >> 8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
}
for (int i = 0; i < leading; ++i) {
buf1[i] = buf1[leading];
buf2[i] = buf2[leading];
}
for (int i = leading + len; i < count; ++i) {
buf1[i] = buf1[i - 1];
buf2[i] = buf2[i - 1];
}
}
// Now interpolate the values from the intermediate_buffer to get the final result.
fx &= fixed_scale - 1;
Q_ASSERT((fx >> 16) == 0);
while (b < end) {
int x1 = (fx >> 16);
int x2 = x1 + 1;
Q_ASSERT(x1 >= 0);
Q_ASSERT(x2 < count);
int distx = (fx & 0x0000ffff) >> 8;
int idistx = 256 - distx;
int rb = ((buf1[x1] * idistx + buf1[x2] * distx) >> 8) & 0xff00ff;
int ag = (buf2[x1] * idistx + buf2[x2] * distx) & 0xff00ff00;
*b++ = rb | ag;
fx += fdx;
}
}
typedef void (QT_FASTCALL *BilinearFastTransformFetcher)(uint *buf1, uint *buf2, const int len, const QTextureData &image,
int fx, int fy, const int fdx, const int fdy);
template<TextureBlendType blendType, QPixelLayout::BPP bpp>
static void QT_FASTCALL fetchTransformedBilinear_fetcher(uint *buf1, uint *buf2, const int len, const QTextureData &image,
int fx, int fy, const int fdx, const int fdy)
{
const QPixelLayout &layout = qPixelLayouts[image.format];
Q_ASSERT(bpp == QPixelLayout::BPPNone || bpp == layout.bpp);
// When templated 'fetch1' should be inlined at compile time:
const FetchPixelFunc fetch1 = (bpp == QPixelLayout::BPPNone) ? qFetchPixel[layout.bpp] : fetchPixel<bpp>;
if (fdy == 0) {
int y1 = (fy >> 16);
int y2;
fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
const uchar *s1 = image.scanLine(y1);
const uchar *s2 = image.scanLine(y2);
int i = 0;
if (blendType == BlendTransformedBilinear) {
for (; i < len; ++i) {
int x1 = (fx >> 16);
int x2;
fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
if (x1 != x2)
break;
buf1[i * 2 + 0] = buf1[i * 2 + 1] = fetch1(s1, x1);
buf2[i * 2 + 0] = buf2[i * 2 + 1] = fetch1(s2, x1);
fx += fdx;
}
int fastLen = len;
if (fdx > 0)
fastLen = qMin(fastLen, int((qint64(image.x2 - 1) * fixed_scale - fx) / fdx));
else if (fdx < 0)
fastLen = qMin(fastLen, int((qint64(image.x1) * fixed_scale - fx) / fdx));
for (; i < fastLen; ++i) {
int x = (fx >> 16);
buf1[i * 2 + 0] = fetch1(s1, x);
buf1[i * 2 + 1] = fetch1(s1, x + 1);
buf2[i * 2 + 0] = fetch1(s2, x);
buf2[i * 2 + 1] = fetch1(s2, x + 1);
fx += fdx;
}
}
for (; i < len; ++i) {
int x1 = (fx >> 16);
int x2;
fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
buf1[i * 2 + 0] = fetch1(s1, x1);
buf1[i * 2 + 1] = fetch1(s1, x2);
buf2[i * 2 + 0] = fetch1(s2, x1);
buf2[i * 2 + 1] = fetch1(s2, x2);
fx += fdx;
}
} else {
int i = 0;
if (blendType == BlendTransformedBilinear) {
for (; i < len; ++i) {
int x1 = (fx >> 16);
int x2;
int y1 = (fy >> 16);
int y2;
fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
if (x1 != x2 && y1 != y2)
break;
const uchar *s1 = image.scanLine(y1);
const uchar *s2 = image.scanLine(y2);
buf1[i * 2 + 0] = fetch1(s1, x1);
buf1[i * 2 + 1] = fetch1(s1, x2);
buf2[i * 2 + 0] = fetch1(s2, x1);
buf2[i * 2 + 1] = fetch1(s2, x2);
fx += fdx;
fy += fdy;
}
int fastLen = len;
if (fdx > 0)
fastLen = qMin(fastLen, int((qint64(image.x2 - 1) * fixed_scale - fx) / fdx));
else if (fdx < 0)
fastLen = qMin(fastLen, int((qint64(image.x1) * fixed_scale - fx) / fdx));
if (fdy > 0)
fastLen = qMin(fastLen, int((qint64(image.y2 - 1) * fixed_scale - fy) / fdy));
else if (fdy < 0)
fastLen = qMin(fastLen, int((qint64(image.y1) * fixed_scale - fy) / fdy));
for (; i < fastLen; ++i) {
int x = (fx >> 16);
int y = (fy >> 16);
const uchar *s1 = image.scanLine(y);
const uchar *s2 = s1 + image.bytesPerLine;
buf1[i * 2 + 0] = fetch1(s1, x);
buf1[i * 2 + 1] = fetch1(s1, x + 1);
buf2[i * 2 + 0] = fetch1(s2, x);
buf2[i * 2 + 1] = fetch1(s2, x + 1);
fx += fdx;
fy += fdy;
}
}
for (; i < len; ++i) {
int x1 = (fx >> 16);
int x2;
int y1 = (fy >> 16);
int y2;
fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
const uchar *s1 = image.scanLine(y1);
const uchar *s2 = image.scanLine(y2);
buf1[i * 2 + 0] = fetch1(s1, x1);
buf1[i * 2 + 1] = fetch1(s1, x2);
buf2[i * 2 + 0] = fetch1(s2, x1);
buf2[i * 2 + 1] = fetch1(s2, x2);
fx += fdx;
fy += fdy;
}
}
}
// blendType = BlendTransformedBilinear or BlendTransformedBilinearTiled
template<TextureBlendType blendType, QPixelLayout::BPP bpp>
static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Operator *,
@ -2664,19 +2899,7 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
{
const QPixelLayout *layout = &qPixelLayouts[data->texture.format];
const QVector<QRgb> *clut = data->texture.colorTable;
if (bpp != QPixelLayout::BPPNone) // Like this to not ICE on GCC 5.3.1
Q_ASSERT(layout->bpp == bpp);
// When templated 'fetch' should be inlined at compile time:
const FetchPixelsFunc fetch = (bpp == QPixelLayout::BPPNone) ? qFetchPixels[layout->bpp] : FetchPixelsFunc(fetchPixels<bpp>);
const FetchPixelFunc fetch1 = (bpp == QPixelLayout::BPPNone) ? qFetchPixel[layout->bpp] : FetchPixelFunc(fetchPixel<bpp>);
int image_width = data->texture.width;
int image_height = data->texture.height;
int image_x1 = data->texture.x1;
int image_y1 = data->texture.y1;
int image_x2 = data->texture.x2 - 1;
int image_y2 = data->texture.y2 - 1;
Q_ASSERT(bpp == QPixelLayout::BPPNone || layout->bpp == bpp);
const qreal cx = x + qreal(0.5);
const qreal cy = y + qreal(0.5);
@ -2692,203 +2915,80 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
fx -= half_point;
fy -= half_point;
if (fdy == 0) { //simple scale, no rotation
int y1 = (fy >> 16);
int y2;
fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
const uchar *s1 = data->texture.scanLine(y1);
const uchar *s2 = data->texture.scanLine(y2);
if (fdy == 0) { // simple scale, no rotation or shear
if (fdx <= fixed_scale && fdx > 0) { // scale up on X
int disty = (fy & 0x0000ffff) >> 8;
int idisty = 256 - disty;
int x = fx >> 16;
// The idea is first to do the interpolation between the row s1 and the row s2
// into an intermediate buffer, then we interpolate between two pixel of this buffer.
// +1 for the last pixel to interpolate with, and +1 for rounding errors.
uint buf1[buffer_size + 2];
uint buf2[buffer_size + 2];
const uint *ptr1;
const uint *ptr2;
int count = (qint64(length) * fdx + fixed_scale - 1) / fixed_scale + 2;
Q_ASSERT(count <= buffer_size + 2); //length is supposed to be <= buffer_size and data->m11 < 1 in this case
if (blendType == BlendTransformedBilinearTiled) {
x %= image_width;
if (x < 0)
x += image_width;
int len1 = qMin(count, image_width - x);
int len2 = qMin(x, count - len1);
ptr1 = fetch(buf1, s1, x, len1);
ptr1 = layout->convertToARGB32PM(buf1, ptr1, len1, clut, 0);
ptr2 = fetch(buf2, s2, x, len1);
ptr2 = layout->convertToARGB32PM(buf2, ptr2, len1, clut, 0);
for (int i = 0; i < len1; ++i) {
uint t = ptr1[i];
uint b = ptr2[i];
buf1[i] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
buf2[i] = ((((t >> 8) & 0xff00ff) * idisty + ((b >> 8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
}
if (len2) {
ptr1 = fetch(buf1 + len1, s1, 0, len2);
ptr1 = layout->convertToARGB32PM(buf1 + len1, ptr1, len2, clut, 0);
ptr2 = fetch(buf2 + len1, s2, 0, len2);
ptr2 = layout->convertToARGB32PM(buf2 + len1, ptr2, len2, clut, 0);
for (int i = 0; i < len2; ++i) {
uint t = ptr1[i];
uint b = ptr2[i];
buf1[i + len1] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
buf2[i + len1] = ((((t >> 8) & 0xff00ff) * idisty + ((b >> 8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
}
}
for (int i = image_width; i < count; ++i) {
buf1[i] = buf1[i - image_width];
buf2[i] = buf2[i - image_width];
}
} else {
int start = qMax(x, image_x1);
int end = qMin(x + count, image_x2 + 1);
int len = qMax(1, end - start);
int leading = start - x;
ptr1 = fetch(buf1 + leading, s1, start, len);
ptr1 = layout->convertToARGB32PM(buf1 + leading, ptr1, len, clut, 0);
ptr2 = fetch(buf2 + leading, s2, start, len);
ptr2 = layout->convertToARGB32PM(buf2 + leading, ptr2, len, clut, 0);
for (int i = 0; i < len; ++i) {
uint t = ptr1[i];
uint b = ptr2[i];
buf1[i + leading] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
buf2[i + leading] = ((((t >> 8) & 0xff00ff) * idisty + ((b >> 8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
}
for (int i = 0; i < leading; ++i) {
buf1[i] = buf1[leading];
buf2[i] = buf2[leading];
}
for (int i = leading + len; i < count; ++i) {
buf1[i] = buf1[i - 1];
buf2[i] = buf2[i - 1];
}
}
// Now interpolate the values from the intermediate_buffer to get the final result.
fx &= fixed_scale - 1;
Q_ASSERT((fx >> 16) == 0);
for (int i = 0; i < length; ++i) {
int x1 = (fx >> 16);
int x2 = x1 + 1;
Q_ASSERT(x1 >= 0);
Q_ASSERT(x2 < count);
int distx = (fx & 0x0000ffff) >> 8;
int idistx = 256 - distx;
int rb = ((buf1[x1] * idistx + buf1[x2] * distx) >> 8) & 0xff00ff;
int ag = (buf2[x1] * idistx + buf2[x2] * distx) & 0xff00ff00;
buffer[i] = rb | ag;
fx += fdx;
}
fetchTransformedBilinear_simple_upscale_helper<blendType, bpp>(buffer, buffer + length, data->texture, fx, fy, fdx, fdy);
} else {
const BilinearFastTransformFetcher fetcher = fetchTransformedBilinear_fetcher<blendType,bpp>;
uint buf1[buffer_size];
uint buf2[buffer_size];
uint *b = buffer;
while (length) {
int len = qMin(length, buffer_size / 2);
int fracX = fx;
for (int i = 0; i < len; ++i) {
int x1 = (fx >> 16);
int x2;
fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
buf1[i * 2 + 0] = fetch1(s1, x1);
buf1[i * 2 + 1] = fetch1(s1, x2);
buf2[i * 2 + 0] = fetch1(s2, x1);
buf2[i * 2 + 1] = fetch1(s2, x2);
fx += fdx;
}
fetcher(buf1, buf2, len, data->texture, fx, fy, fdx, 0);
layout->convertToARGB32PM(buf1, buf1, len * 2, clut, 0);
layout->convertToARGB32PM(buf2, buf2, len * 2, clut, 0);
if ((fdx < 0 && fdx > -(fixed_scale / 8)) || qAbs(data->m22) < qreal(1./8.)) { // scale up more than 8x
int disty = (fy & 0x0000ffff) >> 8;
for (int i = 0; i < len; ++i) {
int distx = (fracX & 0x0000ffff) >> 8;
int distx = (fx & 0x0000ffff) >> 8;
b[i] = interpolate_4_pixels(buf1 + i * 2, buf2 + i * 2, distx, disty);
fracX += fdx;
fx += fdx;
}
} else { //scale down
} else {
int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
for (int i = 0; i < len; ++i) {
uint tl = buf1[i * 2 + 0];
uint tr = buf1[i * 2 + 1];
uint bl = buf2[i * 2 + 0];
uint br = buf2[i * 2 + 1];
int distx = ((fracX & 0x0000ffff) + 0x0800) >> 12;
int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
b[i] = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
fracX += fdx;
fx += fdx;
}
}
length -= len;
b += len;
}
}
} else { //rotation
} else { // rotation or shear
const BilinearFastTransformFetcher fetcher = fetchTransformedBilinear_fetcher<blendType,bpp>;
uint buf1[buffer_size];
uint buf2[buffer_size];
uint *b = buffer;
while (length) {
int len = qMin(length, buffer_size / 2);
int fracX = fx;
int fracY = fy;
for (int i = 0; i < len; ++i) {
int x1 = (fx >> 16);
int x2;
int y1 = (fy >> 16);
int y2;
fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
const uchar *s1 = data->texture.scanLine(y1);
const uchar *s2 = data->texture.scanLine(y2);
buf1[i * 2 + 0] = fetch1(s1, x1);
buf1[i * 2 + 1] = fetch1(s1, x2);
buf2[i * 2 + 0] = fetch1(s2, x1);
buf2[i * 2 + 1] = fetch1(s2, x2);
fx += fdx;
fy += fdy;
}
fetcher(buf1, buf2, len, data->texture, fx, fy, fdx, fdy);
layout->convertToARGB32PM(buf1, buf1, len * 2, clut, 0);
layout->convertToARGB32PM(buf2, buf2, len * 2, clut, 0);
if (qAbs(data->m11) < qreal(1./8.) || qAbs(data->m22) < qreal(1./8.) ) {
//if we are zooming more than 8 times, we use 8bit precision for the position.
if (qAbs(data->m11) < qreal(1./8.)|| qAbs(data->m22) < qreal(1./8.)) {
// If we are zooming more than 8 times, we use 8bit precision for the position.
for (int i = 0; i < len; ++i) {
int distx = (fracX & 0x0000ffff) >> 8;
int disty = (fracY & 0x0000ffff) >> 8;
int distx = (fx & 0x0000ffff) >> 8;
int disty = (fy & 0x0000ffff) >> 8;
b[i] = interpolate_4_pixels(buf1 + i * 2, buf2 + i * 2, distx, disty);
fracX += fdx;
fracY += fdy;
fx += fdx;
fy += fdy;
}
} else {
//we are zooming less than 8x, use 4bit precision
// We are zooming less than 8x, use 4bit precision
for (int i = 0; i < len; ++i) {
uint tl = buf1[i * 2 + 0];
uint tr = buf1[i * 2 + 1];
uint bl = buf2[i * 2 + 0];
uint br = buf2[i * 2 + 1];
int distx = ((fracX & 0x0000ffff) + 0x0800) >> 12;
int disty = ((fracY & 0x0000ffff) + 0x0800) >> 12;
int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
b[i] = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
fracX += fdx;
fracY += fdy;
fx += fdx;
fy += fdy;
}
}
@ -2897,6 +2997,11 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
}
}
} else {
// When templated 'fetch' should be inlined at compile time:
const FetchPixelFunc fetch1 = (bpp == QPixelLayout::BPPNone) ? qFetchPixel[layout->bpp] : fetchPixel<bpp>;
const QTextureData &image = data->texture;
const qreal fdx = data->m11;
const qreal fdy = data->m12;
const qreal fdw = data->m13;
@ -2927,8 +3032,8 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
distxs[i] = int((px - x1) * 256);
distys[i] = int((py - y1) * 256);
fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
const uchar *s1 = data->texture.scanLine(y1);
const uchar *s2 = data->texture.scanLine(y2);
@ -2969,21 +3074,9 @@ static const QRgba64 *QT_FASTCALL fetchTransformedBilinear64(QRgba64 *buffer, co
const QPixelLayout *layout = &qPixelLayouts[data->texture.format];
const QVector<QRgb> *clut = data->texture.colorTable;
int image_width = data->texture.width;
int image_height = data->texture.height;
int image_x1 = data->texture.x1;
int image_y1 = data->texture.y1;
int image_x2 = data->texture.x2 - 1;
int image_y2 = data->texture.y2 - 1;
const qreal cx = x + qreal(0.5);
const qreal cy = y + qreal(0.5);
const qreal fdx = data->m11;
const qreal fdy = data->m12;
const qreal fdw = data->m13;
if (data->fast_matrix) {
// The increment pr x in the scanline
int fdx = (int)(data->m11 * fixed_scale);
@ -2995,14 +3088,13 @@ static const QRgba64 *QT_FASTCALL fetchTransformedBilinear64(QRgba64 *buffer, co
fx -= half_point;
fy -= half_point;
if (fdy == 0) { //simple scale, no rotation
int y1 = (fy >> 16);
int y2;
fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
const uchar *s1 = data->texture.scanLine(y1);
const uchar *s2 = data->texture.scanLine(y2);
const BilinearFastTransformFetcher fetcher =
(layout->bpp == QPixelLayout::BPP32)
? fetchTransformedBilinear_fetcher<blendType, QPixelLayout::BPP32>
: fetchTransformedBilinear_fetcher<blendType, QPixelLayout::BPPNone>;
if (fdy == 0) { //simple scale, no rotation
FetchPixelFunc fetch = qFetchPixel[layout->bpp];
uint sbuf1[buffer_size];
uint sbuf2[buffer_size];
quint64 buf1[buffer_size];
@ -3010,84 +3102,19 @@ static const QRgba64 *QT_FASTCALL fetchTransformedBilinear64(QRgba64 *buffer, co
QRgba64 *b = buffer;
while (length) {
int len = qMin(length, buffer_size / 2);
int fracX = fx;
int i = 0;
int disty = (fy & 0x0000ffff);
#if defined(__SSE2__)
const __m128i vdy = _mm_set1_epi16(disty);
const __m128i vidy = _mm_set1_epi16(0x10000 - disty);
if (blendType != BlendTransformedBilinearTiled && layout->bpp == QPixelLayout::BPP32) {
for (; i < len; ++i) {
int x1 = (fx >> 16);
int x2;
fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
if (x1 != x2)
break;
sbuf1[i * 2 + 0] = sbuf1[i * 2 + 1] = ((const uint*)s1)[x1];
sbuf2[i * 2 + 0] = sbuf2[i * 2 + 1] = ((const uint*)s2)[x1];
fx += fdx;
}
int fastLen;
if (fdx > 0)
fastLen = qMin(len, int((image_x2 - (fx >> 16)) / data->m11));
else
fastLen = qMin(len, int((image_x1 - (fx >> 16)) / data->m11));
fastLen -= 3;
const __m128i v_fdx = _mm_set1_epi32(fdx*4);
__m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
for (; i < fastLen; i += 4) {
int offset = _mm_extract_epi16(v_fx, 1);
sbuf1[i * 2 + 0] = ((const uint*)s1)[offset];
sbuf1[i * 2 + 1] = ((const uint*)s1)[offset + 1];
sbuf2[i * 2 + 0] = ((const uint*)s2)[offset];
sbuf2[i * 2 + 1] = ((const uint*)s2)[offset + 1];
offset = _mm_extract_epi16(v_fx, 3);
sbuf1[i * 2 + 2] = ((const uint*)s1)[offset];
sbuf1[i * 2 + 3] = ((const uint*)s1)[offset + 1];
sbuf2[i * 2 + 2] = ((const uint*)s2)[offset];
sbuf2[i * 2 + 3] = ((const uint*)s2)[offset + 1];
offset = _mm_extract_epi16(v_fx, 5);
sbuf1[i * 2 + 4] = ((const uint*)s1)[offset];
sbuf1[i * 2 + 5] = ((const uint*)s1)[offset + 1];
sbuf2[i * 2 + 4] = ((const uint*)s2)[offset];
sbuf2[i * 2 + 5] = ((const uint*)s2)[offset + 1];
offset = _mm_extract_epi16(v_fx, 7);
sbuf1[i * 2 + 6] = ((const uint*)s1)[offset];
sbuf1[i * 2 + 7] = ((const uint*)s1)[offset + 1];
sbuf2[i * 2 + 6] = ((const uint*)s2)[offset];
sbuf2[i * 2 + 7] = ((const uint*)s2)[offset + 1];
v_fx = _mm_add_epi32(v_fx, v_fdx);
}
fx = _mm_cvtsi128_si32(v_fx);
}
#endif
for (; i < len; ++i) {
int x1 = (fx >> 16);
int x2;
fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
fetcher(sbuf1, sbuf2, len, data->texture, fx, fy, fdx, fdy);
if (layout->bpp == QPixelLayout::BPP32) {
sbuf1[i * 2 + 0] = ((const uint*)s1)[x1];
sbuf1[i * 2 + 1] = ((const uint*)s1)[x2];
sbuf2[i * 2 + 0] = ((const uint*)s2)[x1];
sbuf2[i * 2 + 1] = ((const uint*)s2)[x2];
} else {
sbuf1[i * 2 + 0] = fetch(s1, x1);
sbuf1[i * 2 + 1] = fetch(s1, x2);
sbuf2[i * 2 + 0] = fetch(s2, x1);
sbuf2[i * 2 + 1] = fetch(s2, x2);
}
fx += fdx;
}
layout->convertToARGB64PM((QRgba64 *)buf1, sbuf1, len * 2, clut, 0);
if (disty)
layout->convertToARGB64PM((QRgba64 *)buf2, sbuf2, len * 2, clut, 0);
for (int i = 0; i < len; ++i) {
int distx = (fracX & 0x0000ffff);
int distx = (fx & 0x0000ffff);
#if defined(__SSE2__)
const __m128i vdistx = _mm_shufflelo_epi16(_mm_cvtsi32_si128(distx), _MM_SHUFFLE(0, 0, 0, 0));
const __m128i vidistx = _mm_shufflelo_epi16(_mm_cvtsi32_si128(0x10000 - distx), _MM_SHUFFLE(0, 0, 0, 0));
@ -3104,13 +3131,12 @@ static const QRgba64 *QT_FASTCALL fetchTransformedBilinear64(QRgba64 *buffer, co
#else
b[i] = interpolate_4_pixels_rgb64((QRgba64 *)buf1 + i*2, (QRgba64 *)buf2 + i*2, distx, disty);
#endif
fracX += fdx;
fx += fdx;
}
length -= len;
b += len;
}
} else { //rotation
FetchPixelFunc fetch = qFetchPixel[layout->bpp];
uint sbuf1[buffer_size];
uint sbuf2[buffer_size];
quint64 buf1[buffer_size];
@ -3120,117 +3146,18 @@ static const QRgba64 *QT_FASTCALL fetchTransformedBilinear64(QRgba64 *buffer, co
while (b < end) {
int len = qMin(length, buffer_size / 2);
int fracX = fx;
int fracY = fy;
int i = 0;
#if defined(__SSE2__)
if (blendType != BlendTransformedBilinearTiled && layout->bpp == QPixelLayout::BPP32) {
for (; i < len; ++i) {
int x1 = (fx >> 16);
int x2;
int y1 = (fy >> 16);
int y2;
fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
if (x1 != x2 && y1 != y2)
break;
const uchar *s1 = data->texture.scanLine(y1);
const uchar *s2 = data->texture.scanLine(y2);
sbuf1[i * 2 + 0] = ((const uint*)s1)[x1];
sbuf1[i * 2 + 1] = ((const uint*)s1)[x2];
sbuf2[i * 2 + 0] = ((const uint*)s2)[x1];
sbuf2[i * 2 + 1] = ((const uint*)s2)[x2];
fx += fdx;
fy += fdy;
}
int fastLen = len;
if (fdx > 0)
fastLen = qMin(fastLen, int((qint64(image_x2) * fixed_scale - fx) / fdx));
else if (fdx < 0)
fastLen = qMin(fastLen, int((qint64(image_x1) * fixed_scale - fx) / fdx));
if (fdy > 0)
fastLen = qMin(fastLen, int((qint64(image_y2) * fixed_scale - fy) / fdy));
else if (fdy < 0)
fastLen = qMin(fastLen, int((qint64(image_y1) * fixed_scale - fy) / fdy));
fastLen -= 3;
const __m128i v_fdx = _mm_set1_epi32(fdx*4);
const __m128i v_fdy = _mm_set1_epi32(fdy*4);
__m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
__m128i v_fy = _mm_setr_epi32(fy, fy + fdy, fy + fdy + fdy, fy + fdy + fdy + fdy);
const int bytesPerLine = data->texture.bytesPerLine;
const uchar *s1 = data->texture.imageData;
const uchar *s2 = s1 + bytesPerLine;
const __m128i vbpl = _mm_shufflelo_epi16(_mm_cvtsi32_si128(bytesPerLine/4), _MM_SHUFFLE(0, 0, 0, 0));
for (; i < fastLen; i += 4) {
const __m128i vy = _mm_packs_epi32(_mm_srai_epi32(v_fy, 16), _mm_setzero_si128());
__m128i voffset = _mm_unpacklo_epi16(_mm_mullo_epi16(vy, vbpl), _mm_mulhi_epu16(vy, vbpl));
voffset = _mm_add_epi32(voffset, _mm_srli_epi32(v_fx, 16));
fetcher(sbuf1, sbuf2, len, data->texture, fx, fy, fdx, fdy);
int offset = _mm_cvtsi128_si32(voffset); voffset = _mm_srli_si128(voffset, 4);
sbuf1[i * 2 + 0] = ((const uint*)s1)[offset];
sbuf1[i * 2 + 1] = ((const uint*)s1)[offset + 1];
sbuf2[i * 2 + 0] = ((const uint*)s2)[offset];
sbuf2[i * 2 + 1] = ((const uint*)s2)[offset + 1];
offset = _mm_cvtsi128_si32(voffset); voffset = _mm_srli_si128(voffset, 4);
sbuf1[i * 2 + 2] = ((const uint*)s1)[offset];
sbuf1[i * 2 + 3] = ((const uint*)s1)[offset + 1];
sbuf2[i * 2 + 2] = ((const uint*)s2)[offset];
sbuf2[i * 2 + 3] = ((const uint*)s2)[offset + 1];
offset = _mm_cvtsi128_si32(voffset); voffset = _mm_srli_si128(voffset, 4);
sbuf1[i * 2 + 4] = ((const uint*)s1)[offset];
sbuf1[i * 2 + 5] = ((const uint*)s1)[offset + 1];
sbuf2[i * 2 + 4] = ((const uint*)s2)[offset];
sbuf2[i * 2 + 5] = ((const uint*)s2)[offset + 1];
offset = _mm_cvtsi128_si32(voffset);
sbuf1[i * 2 + 6] = ((const uint*)s1)[offset];
sbuf1[i * 2 + 7] = ((const uint*)s1)[offset + 1];
sbuf2[i * 2 + 6] = ((const uint*)s2)[offset];
sbuf2[i * 2 + 7] = ((const uint*)s2)[offset + 1];
v_fx = _mm_add_epi32(v_fx, v_fdx);
v_fy = _mm_add_epi32(v_fy, v_fdy);
}
fx = _mm_cvtsi128_si32(v_fx);
fy = _mm_cvtsi128_si32(v_fy);
}
#endif
for (; i < len; ++i) {
int x1 = (fx >> 16);
int x2;
int y1 = (fy >> 16);
int y2;
fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
const uchar *s1 = data->texture.scanLine(y1);
const uchar *s2 = data->texture.scanLine(y2);
if (layout->bpp == QPixelLayout::BPP32) {
sbuf1[i * 2 + 0] = ((const uint*)s1)[x1];
sbuf1[i * 2 + 1] = ((const uint*)s1)[x2];
sbuf2[i * 2 + 0] = ((const uint*)s2)[x1];
sbuf2[i * 2 + 1] = ((const uint*)s2)[x2];
} else {
sbuf1[i * 2 + 0] = fetch(s1, x1);
sbuf1[i * 2 + 1] = fetch(s1, x2);
sbuf2[i * 2 + 0] = fetch(s2, x1);
sbuf2[i * 2 + 1] = fetch(s2, x2);
}
fx += fdx;
fy += fdy;
}
layout->convertToARGB64PM((QRgba64 *)buf1, sbuf1, len * 2, clut, 0);
layout->convertToARGB64PM((QRgba64 *)buf2, sbuf2, len * 2, clut, 0);
for (int i = 0; i < len; ++i) {
int distx = (fracX & 0x0000ffff);
int disty = (fracY & 0x0000ffff);
int distx = (fx & 0x0000ffff);
int disty = (fy & 0x0000ffff);
b[i] = interpolate_4_pixels_rgb64((QRgba64 *)buf1 + i*2, (QRgba64 *)buf2 + i*2, distx, disty);
fracX += fdx;
fracY += fdy;
fx += fdx;
fy += fdy;
}
length -= len;
@ -3238,6 +3165,12 @@ static const QRgba64 *QT_FASTCALL fetchTransformedBilinear64(QRgba64 *buffer, co
}
}
} else {
const QTextureData &image = data->texture;
const qreal fdx = data->m11;
const qreal fdy = data->m12;
const qreal fdw = data->m13;
qreal fx = data->m21 * cy + data->m11 * cx + data->dx;
qreal fy = data->m22 * cy + data->m12 * cx + data->dy;
qreal fw = data->m23 * cy + data->m13 * cx + data->m33;
@ -3267,8 +3200,8 @@ static const QRgba64 *QT_FASTCALL fetchTransformedBilinear64(QRgba64 *buffer, co
distxs[i] = int((px - x1) * (1<<16));
distys[i] = int((py - y1) * (1<<16));
fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
const uchar *s1 = data->texture.scanLine(y1);
const uchar *s2 = data->texture.scanLine(y2);