Remove last uses of interpolate_4_pixels_16 on SSE2 and NEON

With SSE2 or NEON interpolate_4_pixels is faster than
interpolate_4_pixels_16, and using it saves a branch of duplicated code.

Similar changes had already been done other places it was used, those
have been updated to follow a similar logic.

Change-Id: I040d96480f7f925f659602f66f931d28b59312a5
Reviewed-by: Lars Knoll <lars.knoll@qt.io>
Reviewed-by: Eirik Aavitsland <eirik.aavitsland@qt.io>
This commit is contained in:
Allan Sandfeld Jensen 2018-03-23 10:19:11 +01:00
parent a5e2ec5cdb
commit 4b6542c9ff
2 changed files with 52 additions and 53 deletions

View File

@ -2249,17 +2249,13 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper(uint *
#endif
while (b < boundedEnd) {
int x = (fx >> 16);
#if defined(__SSE2__) || defined(__ARM_NEON__)
int distx8 = (fx & 0x0000ffff) >> 8;
*b = interpolate_4_pixels(s1 + x, s2 + x, distx8, disty8);
#else
uint tl = s1[x];
uint tr = s1[x + 1];
uint bl = s2[x];
uint br = s2[x + 1];
int distx4 = ((fx & 0x0000ffff) + 0x0800) >> 12;
*b = interpolate_4_pixels_16(tl, tr, bl, br, distx4, disty4);
#endif
if (hasFastInterpolate4()) {
int distx8 = (fx & 0x0000ffff) >> 8;
*b = interpolate_4_pixels(s1 + x, s2 + x, distx8, disty8);
} else {
int distx4 = ((fx & 0x0000ffff) + 0x0800) >> 12;
*b = interpolate_4_pixels_16(s1[x], s1[x + 1], s2[x], s2[x + 1], distx4, disty4);
}
fx += fdx;
++b;
}
@ -2273,14 +2269,13 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper(uint *
uint tr = s1[x2];
uint bl = s2[x1];
uint br = s2[x2];
#if defined(__SSE2__) || defined(__ARM_NEON__)
// The optimized interpolate_4_pixels are faster than interpolate_4_pixels_16.
int distx8 = (fx & 0x0000ffff) >> 8;
*b = interpolate_4_pixels(tl, tr, bl, br, distx8, disty8);
#else
int distx4 = ((fx & 0x0000ffff) + 0x0800) >> 12;
*b = interpolate_4_pixels_16(tl, tr, bl, br, distx4, disty4);
#endif
if (hasFastInterpolate4()) {
int distx8 = (fx & 0x0000ffff) >> 8;
*b = interpolate_4_pixels(tl, tr, bl, br, distx8, disty8);
} else {
int distx4 = ((fx & 0x0000ffff) + 0x0800) >> 12;
*b = interpolate_4_pixels_16(tl, tr, bl, br, distx4, disty4);
}
fx += fdx;
++b;
}
@ -2345,15 +2340,15 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper(uint
uint tr = s1[x2];
uint bl = s2[x1];
uint br = s2[x2];
#if defined(__SSE2__) || defined(__ARM_NEON__)
int distx = (fx & 0x0000ffff) >> 8;
int disty = (fy & 0x0000ffff) >> 8;
*b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
#else
int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
*b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
#endif
if (hasFastInterpolate4()) {
int distx = (fx & 0x0000ffff) >> 8;
int disty = (fy & 0x0000ffff) >> 8;
*b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
} else {
int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
*b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
}
fx += fdx;
fy += fdy;
++b;
@ -2495,19 +2490,15 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper(uint
const uint *s1 = (const uint *)image.scanLine(y);
const uint *s2 = (const uint *)image.scanLine(y + 1);
#if defined(__SSE2__) || defined(__ARM_NEON__)
int distx = (fx & 0x0000ffff) >> 8;
int disty = (fy & 0x0000ffff) >> 8;
*b = interpolate_4_pixels(s1 + x, s2 + x, distx, disty);
#else
uint tl = s1[x];
uint tr = s1[x + 1];
uint bl = s2[x];
uint br = s2[x + 1];
int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
*b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
#endif
if (hasFastInterpolate4()) {
int distx = (fx & 0x0000ffff) >> 8;
int disty = (fy & 0x0000ffff) >> 8;
*b = interpolate_4_pixels(s1 + x, s2 + x, distx, disty);
} else {
int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
*b = interpolate_4_pixels_16(s1[x], s1[x + 1], s2[x], s2[x + 1], distx, disty);
}
fx += fdx;
fy += fdy;
@ -2532,16 +2523,15 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper(uint
uint bl = s2[x1];
uint br = s2[x2];
#if defined(__SSE2__) || defined(__ARM_NEON__)
// The optimized interpolate_4_pixels are faster than interpolate_4_pixels_16.
int distx = (fx & 0x0000ffff) >> 8;
int disty = (fy & 0x0000ffff) >> 8;
*b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
#else
int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
*b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
#endif
if (hasFastInterpolate4()) {
int distx = (fx & 0x0000ffff) >> 8;
int disty = (fy & 0x0000ffff) >> 8;
*b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
} else {
int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
*b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
}
fx += fdx;
fy += fdy;
@ -2939,7 +2929,7 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
layout->convertToARGB32PM(buf1, buf1, len * 2, clut, 0);
layout->convertToARGB32PM(buf2, buf2, len * 2, clut, 0);
if (qAbs(data->m22) < qreal(1./8.)) { // scale up more than 8x (on Y)
if (hasFastInterpolate4() || qAbs(data->m22) < qreal(1./8.)) { // scale up more than 8x (on Y)
int disty = (fy & 0x0000ffff) >> 8;
for (int i = 0; i < len; ++i) {
int distx = (fx & 0x0000ffff) >> 8;
@ -2974,7 +2964,7 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
layout->convertToARGB32PM(buf1, buf1, len * 2, clut, 0);
layout->convertToARGB32PM(buf2, buf2, len * 2, clut, 0);
if (qAbs(data->m11) < qreal(1./8.)|| qAbs(data->m22) < qreal(1./8.)) {
if (hasFastInterpolate4() || qAbs(data->m11) < qreal(1./8.) || qAbs(data->m22) < qreal(1./8.)) {
// If we are zooming more than 8 times, we use 8bit precision for the position.
for (int i = 0; i < len; ++i) {
int distx = (fx & 0x0000ffff) >> 8;

View File

@ -685,6 +685,9 @@ static inline uint interpolate_4_pixels(const uint t[], const uint b[], uint dis
__m128i vb = _mm_loadl_epi64((const __m128i*)b);
return interpolate_4_pixels_sse2(vt, vb, distx, disty);
}
static constexpr inline bool hasFastInterpolate4() { return true; }
#elif defined(__ARM_NEON__)
static Q_ALWAYS_INLINE uint interpolate_4_pixels_neon(uint32x2_t vt32, uint32x2_t vb32, uint distx, uint disty)
{
@ -717,6 +720,9 @@ static inline uint interpolate_4_pixels(const uint t[], const uint b[], uint dis
uint32x2_t vb32 = vld1_u32(b);
return interpolate_4_pixels_neon(vt32, vb32, distx, disty);
}
static constexpr inline bool hasFastInterpolate4() { return true; }
#else
static inline uint interpolate_4_pixels(uint tl, uint tr, uint bl, uint br, uint distx, uint disty)
{
@ -731,6 +737,9 @@ static inline uint interpolate_4_pixels(const uint t[], const uint b[], uint dis
{
return interpolate_4_pixels(t[0], t[1], b[0], b[1], distx, disty);
}
static constexpr inline bool hasFastInterpolate4() { return false; }
#endif
#if Q_BYTE_ORDER == Q_BIG_ENDIAN