Optimize blend_transformed_tiled_argb().

Profiling indicates that this function is one of two hot spots causing
noticeable latency when switching KDE virtual desktops.

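Instead of performing two modulo operations per pixel in the inner loop,
reduce the start coordinates and the per-pixel deltas modulo the image
size once, outside the loop. The inner loop then only adds the delta and
wraps the running sum back into range with a conditional add or subtract,
for a reasonable speedup.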

Change-Id: Ic4217b7686e031d7673b3e10aa977dae263096dc
Reviewed-by: Gunnar Sletta <gunnar.sletta@jollamobile.com>
This commit is contained in:
Sami Liedes 2013-07-29 13:40:08 +03:00 committed by The Qt Project
parent 18c04d0ab6
commit 0e65cec6b4

View File

@ -5116,13 +5116,13 @@ static void blend_transformed_tiled_argb(int count, const QSpan *spans, void *us
int l = qMin(length, buffer_size);
const uint *end = buffer + l;
uint *b = buffer;
int px16 = x % (image_width << 16);
int py16 = y % (image_height << 16);
int px_delta = fdx % (image_width << 16);
int py_delta = fdy % (image_height << 16);
while (b < end) {
int px = x >> 16;
int py = y >> 16;
px %= image_width;
py %= image_height;
if (px < 0) px += image_width;
if (py < 0) py += image_height;
int px = px16 >> 16;
int py = py16 >> 16;
int y_offset = py * scanline_offset;
Q_ASSERT(px >= 0 && px < image_width);
@ -5131,6 +5131,14 @@ static void blend_transformed_tiled_argb(int count, const QSpan *spans, void *us
*b = image_bits[y_offset + px];
x += fdx;
y += fdy;
px16 += px_delta;
if (px16 >= image_width << 16)
px16 -= image_width << 16;
py16 += py_delta;
if (py16 >= image_height << 16)
py16 -= image_height << 16;
if (px16 < 0) px16 += image_width << 16;
if (py16 < 0) py16 += image_height << 16;
++b;
}
func(target, buffer, l, coverage);