Short-cut SSE4 unpremultiply
Even with the SSE4-optimized unpremultiply, it is still significantly faster to skip the calculation for alpha values 0 and 255 and return the pixel unchanged. Change-Id: Iafe658fea8eacf35a857f292952b0c1ee056139c Reviewed-by: Gunnar Sletta <gunnar@sletta.org>
This commit is contained in:
parent
ce302a53df
commit
bc162382e5
@ -242,6 +242,8 @@ QT_FUNCTION_TARGET(SSE4_1)
|
|||||||
inline QRgb qUnpremultiply_sse4(QRgb p)
|
inline QRgb qUnpremultiply_sse4(QRgb p)
|
||||||
{
|
{
|
||||||
const uint alpha = qAlpha(p);
|
const uint alpha = qAlpha(p);
|
||||||
|
if (alpha == 255 || alpha == 0)
|
||||||
|
return p;
|
||||||
const uint invAlpha = qt_inv_premul_factor[alpha];
|
const uint invAlpha = qt_inv_premul_factor[alpha];
|
||||||
const __m128i via = _mm_set1_epi32(invAlpha);
|
const __m128i via = _mm_set1_epi32(invAlpha);
|
||||||
const __m128i vr = _mm_set1_epi32(0x8000);
|
const __m128i vr = _mm_set1_epi32(0x8000);
|
||||||
@ -250,8 +252,8 @@ inline QRgb qUnpremultiply_sse4(QRgb p)
|
|||||||
vl = _mm_add_epi32(vl, vr);
|
vl = _mm_add_epi32(vl, vr);
|
||||||
vl = _mm_srai_epi32(vl, 16);
|
vl = _mm_srai_epi32(vl, 16);
|
||||||
vl = _mm_insert_epi32(vl, alpha, 3);
|
vl = _mm_insert_epi32(vl, alpha, 3);
|
||||||
vl = _mm_packus_epi32(vl, _mm_setzero_si128());
|
vl = _mm_packus_epi32(vl, vl);
|
||||||
vl = _mm_packus_epi16(vl, _mm_setzero_si128());
|
vl = _mm_packus_epi16(vl, vl);
|
||||||
return _mm_cvtsi128_si32(vl);
|
return _mm_cvtsi128_si32(vl);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
Reference in New Issue
Block a user