Improve accuracy in fast path bilinear sampling

Adds rounding before using the optimized low-accuracy interpolation.
This reduces the magnitude of the error in the scaled result from
~4 bits to just 2 bits.

Change-Id: Ie4e618bf5b1f4a74367aa419ebbd534cc6a846b3
Reviewed-by: Eirik Aavitsland <eirik.aavitsland@qt.io>
This commit is contained in:
Allan Sandfeld Jensen 2016-07-06 17:11:59 +02:00 committed by Allan Sandfeld Jensen
parent 529b1c9e2a
commit b91f86a212
2 changed files with 62 additions and 39 deletions

View File

@ -1827,9 +1827,9 @@ static const QRgba64 *QT_FASTCALL fetchTransformed64(QRgba64 *buffer, const Oper
/** \internal /** \internal
interpolate 4 argb pixels with the distx and disty factor. interpolate 4 argb pixels with the distx and disty factor.
distx and disty bust be between 0 and 16 distx and disty must be between 0 and 16
*/ */
static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, int distx, int disty) static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, uint distx, uint disty)
{ {
uint distxy = distx * disty; uint distxy = distx * disty;
//idistx * disty = (16-distx) * disty = 16*disty - distxy //idistx * disty = (16-distx) * disty = 16*disty - distxy
@ -2176,7 +2176,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2); fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
const uint *s1 = (const uint *)data->texture.scanLine(y1); const uint *s1 = (const uint *)data->texture.scanLine(y1);
const uint *s2 = (const uint *)data->texture.scanLine(y2); const uint *s2 = (const uint *)data->texture.scanLine(y2);
int disty = (fy & 0x0000ffff) >> 12; int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
if (blendType != BlendTransformedBilinearTiled) { if (blendType != BlendTransformedBilinearTiled) {
#define BILINEAR_DOWNSCALE_BOUNDS_PROLOG \ #define BILINEAR_DOWNSCALE_BOUNDS_PROLOG \
@ -2190,7 +2190,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
uint tr = s1[x2]; \ uint tr = s1[x2]; \
uint bl = s2[x1]; \ uint bl = s2[x1]; \
uint br = s2[x2]; \ uint br = s2[x2]; \
int distx = (fx & 0x0000ffff) >> 12; \ int distx = ((fx & 0x0000ffff) + 0x0800) >> 12; \
*b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty); \ *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty); \
fx += fdx; \ fx += fdx; \
++b; \ ++b; \
@ -2209,6 +2209,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
const __m128i v_256 = _mm_set1_epi16(256); const __m128i v_256 = _mm_set1_epi16(256);
const __m128i v_disty = _mm_set1_epi16(disty); const __m128i v_disty = _mm_set1_epi16(disty);
const __m128i v_fdx = _mm_set1_epi32(fdx*4); const __m128i v_fdx = _mm_set1_epi32(fdx*4);
const __m128i v_fx_r = _mm_set1_epi32(0x8);
__m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx); __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
while (b < boundedEnd) { while (b < boundedEnd) {
@ -2222,7 +2223,8 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
const __m128i bl = _mm_setr_epi32(s2[offset0], s2[offset1], s2[offset2], s2[offset3]); const __m128i bl = _mm_setr_epi32(s2[offset0], s2[offset1], s2[offset2], s2[offset3]);
const __m128i br = _mm_setr_epi32(s2[offset0 + 1], s2[offset1 + 1], s2[offset2 + 1], s2[offset3 + 1]); const __m128i br = _mm_setr_epi32(s2[offset0 + 1], s2[offset1 + 1], s2[offset2 + 1], s2[offset3 + 1]);
__m128i v_distx = _mm_srli_epi16(v_fx, 12); __m128i v_distx = _mm_srli_epi16(v_fx, 8);
v_distx = _mm_srli_epi16(_mm_add_epi32(v_distx, v_fx_r), 4);
v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0)); v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0)); v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
@ -2252,6 +2254,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
} }
const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff); const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff);
const int32x4_t v_fx_r = vdupq_n_s32(0x0800);
while (b < boundedEnd) { while (b < boundedEnd) {
@ -2260,7 +2263,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
Vect_buffer v_fx_shifted; Vect_buffer v_fx_shifted;
v_fx_shifted.vect = vshrq_n_s32(v_fx.vect, 16); v_fx_shifted.vect = vshrq_n_s32(v_fx.vect, 16);
int32x4_t v_distx = vshrq_n_s32(vandq_s32(v_fx.vect, v_ffff_mask), 12); int32x4_t v_distx = vshrq_n_s32(vaddq_s32(vandq_s32(v_fx.vect, v_ffff_mask), v_fx_r), 12);
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
int x1 = v_fx_shifted.i[i]; int x1 = v_fx_shifted.i[i];
@ -2290,7 +2293,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
uint tr = s1[x2]; uint tr = s1[x2];
uint bl = s2[x1]; uint bl = s2[x1];
uint br = s2[x2]; uint br = s2[x2];
int distx = (fx & 0x0000ffff) >> 12; int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
*b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty); *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
fx += fdx; fx += fdx;
++b; ++b;
@ -2362,6 +2365,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
const __m128i v_256 = _mm_set1_epi16(256); const __m128i v_256 = _mm_set1_epi16(256);
const __m128i v_fdx = _mm_set1_epi32(fdx*4); const __m128i v_fdx = _mm_set1_epi32(fdx*4);
const __m128i v_fdy = _mm_set1_epi32(fdy*4); const __m128i v_fdy = _mm_set1_epi32(fdy*4);
const __m128i v_fxy_r = _mm_set1_epi32(0x8);
__m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx); __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
__m128i v_fy = _mm_setr_epi32(fy, fy + fdy, fy + fdy + fdy, fy + fdy + fdy + fdy); __m128i v_fy = _mm_setr_epi32(fy, fy + fdy, fy + fdy + fdy, fy + fdy + fdy + fdy);
@ -2396,6 +2400,8 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
__m128i v_distx = _mm_srli_epi16(v_fx, 12); __m128i v_distx = _mm_srli_epi16(v_fx, 12);
__m128i v_disty = _mm_srli_epi16(v_fy, 12); __m128i v_disty = _mm_srli_epi16(v_fy, 12);
v_distx = _mm_srli_epi16(_mm_add_epi32(v_fx, v_fxy_r), 4);
v_disty = _mm_srli_epi16(_mm_add_epi32(v_fy, v_fxy_r), 4);
v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0)); v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0)); v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
v_disty = _mm_shufflehi_epi16(v_disty, _MM_SHUFFLE(2,2,0,0)); v_disty = _mm_shufflehi_epi16(v_disty, _MM_SHUFFLE(2,2,0,0));
@ -2434,8 +2440,8 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
int disty = (fy & 0x0000ffff) >> 8; int disty = (fy & 0x0000ffff) >> 8;
*b = interpolate_4_pixels(tl, tr, bl, br, distx, disty); *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
#else #else
int distx = (fx & 0x0000ffff) >> 12; int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
int disty = (fy & 0x0000ffff) >> 12; int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
*b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty); *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
#endif #endif
@ -2664,13 +2670,13 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
fracX += fdx; fracX += fdx;
} }
} else { //scale down } else { //scale down
int disty = (fy & 0x0000ffff) >> 12; int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
for (int i = 0; i < len; ++i) { for (int i = 0; i < len; ++i) {
uint tl = buf1[i * 2 + 0]; uint tl = buf1[i * 2 + 0];
uint tr = buf1[i * 2 + 1]; uint tr = buf1[i * 2 + 1];
uint bl = buf2[i * 2 + 0]; uint bl = buf2[i * 2 + 0];
uint br = buf2[i * 2 + 1]; uint br = buf2[i * 2 + 1];
int distx = (fracX & 0x0000ffff) >> 12; int distx = ((fracX & 0x0000ffff) + 0x0800) >> 12;
b[i] = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty); b[i] = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
fracX += fdx; fracX += fdx;
} }
@ -2736,8 +2742,8 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
uint bl = buf2[i * 2 + 0]; uint bl = buf2[i * 2 + 0];
uint br = buf2[i * 2 + 1]; uint br = buf2[i * 2 + 1];
int distx = (fracX & 0x0000ffff) >> 12; int distx = ((fracX & 0x0000ffff) + 0x0800) >> 12;
int disty = (fracY & 0x0000ffff) >> 12; int disty = ((fracY & 0x0000ffff) + 0x0800) >> 12;
b[i] = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty); b[i] = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
fracX += fdx; fracX += fdx;

View File

@ -106,6 +106,7 @@ private slots:
void smoothScale(); void smoothScale();
void smoothScale2_data(); void smoothScale2_data();
void smoothScale2(); void smoothScale2();
void smoothScale3_data();
void smoothScale3(); void smoothScale3();
void smoothScale4(); void smoothScale4();
@ -1715,9 +1716,12 @@ static inline int rand8()
return int(256. * (qrand() / (RAND_MAX + 1.0))); return int(256. * (qrand() / (RAND_MAX + 1.0)));
} }
// compares img.scale against the bilinear filtering used by QPainter void tst_QImage::smoothScale3_data()
void tst_QImage::smoothScale3()
{ {
QTest::addColumn<QImage>("img");
QTest::addColumn<qreal>("scale_x");
QTest::addColumn<qreal>("scale_y");
QImage img(128, 128, QImage::Format_RGB32); QImage img(128, 128, QImage::Format_RGB32);
for (int y = 0; y < img.height(); ++y) { for (int y = 0; y < img.height(); ++y) {
for (int x = 0; x < img.width(); ++x) { for (int x = 0; x < img.width(); ++x) {
@ -1730,16 +1734,29 @@ void tst_QImage::smoothScale3()
} }
} }
qreal scales[2] = { .5, 2 }; QTest::newRow("(0.5, 0.5)") << img << qreal(0.5) << qreal(0.5);
QTest::newRow("(0.5, 1.0)") << img << qreal(0.5) << qreal(1.0);
QTest::newRow("(1.0, 0.5)") << img << qreal(1.0) << qreal(0.5);
QTest::newRow("(0.5, 2.0)") << img << qreal(0.5) << qreal(2.0);
QTest::newRow("(1.0, 2.0)") << img << qreal(1.0) << qreal(2.0);
QTest::newRow("(2.0, 0.5)") << img << qreal(2.0) << qreal(0.5);
QTest::newRow("(2.0, 1.0)") << img << qreal(2.0) << qreal(1.0);
QTest::newRow("(2.0, 2.0)") << img << qreal(2) << qreal(2);
}
// compares img.scale against the bilinear filtering used by QPainter
void tst_QImage::smoothScale3()
{
QFETCH(QImage, img);
QFETCH(qreal, scale_x);
QFETCH(qreal, scale_y);
for (int i = 0; i < 2; ++i) { QImage a = img.scaled(img.width() * scale_x, img.height() * scale_y, Qt::IgnoreAspectRatio, Qt::SmoothTransformation);
QImage a = img.scaled(img.size() * scales[i], Qt::IgnoreAspectRatio, Qt::SmoothTransformation);
QImage b(a.size(), a.format()); QImage b(a.size(), a.format());
b.fill(0x0); b.fill(0x0);
QPainter p(&b); QPainter p(&b);
p.setRenderHint(QPainter::SmoothPixmapTransform); p.setRenderHint(QPainter::SmoothPixmapTransform);
p.scale(scales[i], scales[i]); p.scale(scale_x, scale_y);
p.drawImage(0, 0, img); p.drawImage(0, 0, img);
p.end(); p.end();
int err = 0; int err = 0;
@ -1750,17 +1767,17 @@ void tst_QImage::smoothScale3()
QRgb cb = b.pixel(x, y); QRgb cb = b.pixel(x, y);
// tolerate a little bit of rounding errors // tolerate a little bit of rounding errors
int tolerance = 3;
bool r = true; bool r = true;
r &= qAbs(qRed(ca) - qRed(cb)) <= 18; r &= qAbs(qRed(ca) - qRed(cb)) <= tolerance;
r &= qAbs(qGreen(ca) - qGreen(cb)) <= 18; r &= qAbs(qGreen(ca) - qGreen(cb)) <= tolerance;
r &= qAbs(qBlue(ca) - qBlue(cb)) <= 18; r &= qAbs(qBlue(ca) - qBlue(cb)) <= tolerance;
if (!r) if (!r)
err++; err++;
} }
} }
QCOMPARE(err, 0); QCOMPARE(err, 0);
} }
}
// Tests smooth upscale is smooth // Tests smooth upscale is smooth
void tst_QImage::smoothScale4() void tst_QImage::smoothScale4()