Optimize gamma-table lookup on SSE2 and NEON
Speeds up gamma-corrected text rendering.

Change-Id: I38c12ff52f4601853c3f3524de2761a932111160
Reviewed-by: Erik Verbruggen <erik.verbruggen@qt.io>
This commit is contained in:
parent
bde6a04949
commit
9d27aec869
@ -55,6 +55,11 @@
|
||||
#include <QtGui/qrgb.h>
|
||||
#include <QtGui/qrgba64.h>
|
||||
|
||||
#if defined(__SSE2__)
|
||||
#include <emmintrin.h>
|
||||
#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
QT_BEGIN_NAMESPACE
|
||||
|
||||
class Q_GUI_EXPORT QColorProfile
|
||||
@ -67,82 +72,165 @@ public:
|
||||
|
||||
QRgba64 toLinear64(QRgb rgb32) const
|
||||
{
|
||||
ushort r = m_toLinear[qRed(rgb32) << 4];
|
||||
ushort g = m_toLinear[qGreen(rgb32) << 4];
|
||||
ushort b = m_toLinear[qBlue(rgb32) << 4];
|
||||
#if defined(__SSE2__)
|
||||
__m128i v = _mm_cvtsi32_si128(rgb32);
|
||||
v = _mm_unpacklo_epi8(v, _mm_setzero_si128());
|
||||
const __m128i vidx = _mm_slli_epi16(v, 4);
|
||||
const int ridx = _mm_extract_epi16(vidx, 2);
|
||||
const int gidx = _mm_extract_epi16(vidx, 1);
|
||||
const int bidx = _mm_extract_epi16(vidx, 0);
|
||||
v = _mm_slli_epi16(v, 8); // a * 256
|
||||
v = _mm_insert_epi16(v, m_toLinear[ridx], 0);
|
||||
v = _mm_insert_epi16(v, m_toLinear[gidx], 1);
|
||||
v = _mm_insert_epi16(v, m_toLinear[bidx], 2);
|
||||
v = _mm_add_epi16(v, _mm_srli_epi16(v, 8));
|
||||
QRgba64 rgba64;
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i *>(&rgba64), v);
|
||||
return rgba64;
|
||||
#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
|
||||
uint8x8_t v8 = vreinterpret_u8_u32(vmov_n_u32(rgb32));
|
||||
uint16x4_t v16 = vget_low_u16(vmovl_u8(v8));
|
||||
const uint16x4_t vidx = vshl_n_u16(v16, 4);
|
||||
const int ridx = vget_lane_u16(vidx, 2);
|
||||
const int gidx = vget_lane_u16(vidx, 1);
|
||||
const int bidx = vget_lane_u16(vidx, 0);
|
||||
v16 = vshl_n_u16(v16, 8); // a * 256
|
||||
v16 = vset_lane_u16(m_toLinear[ridx], v16, 0);
|
||||
v16 = vset_lane_u16(m_toLinear[gidx], v16, 1);
|
||||
v16 = vset_lane_u16(m_toLinear[bidx], v16, 2);
|
||||
v16 = vadd_u16(v16, vshr_n_u16(v16, 8));
|
||||
return QRgba64::fromRgba64(vget_lane_u64(vreinterpret_u64_u16(v16), 0));
|
||||
#else
|
||||
uint r = m_toLinear[qRed(rgb32) << 4];
|
||||
uint g = m_toLinear[qGreen(rgb32) << 4];
|
||||
uint b = m_toLinear[qBlue(rgb32) << 4];
|
||||
r = r + (r >> 8);
|
||||
g = g + (g >> 8);
|
||||
b = b + (b >> 8);
|
||||
return QRgba64::fromRgba64(r, g, b, qAlpha(rgb32) * 257);
|
||||
#endif
|
||||
}
|
||||
|
||||
QRgb toLinear(QRgb rgb32) const
|
||||
{
|
||||
uchar r = (m_toLinear[qRed(rgb32) << 4] + 0x80) >> 8;
|
||||
uchar g = (m_toLinear[qGreen(rgb32) << 4] + 0x80) >> 8;
|
||||
uchar b = (m_toLinear[qBlue(rgb32) << 4] + 0x80) >> 8;
|
||||
return qRgba(r, g, b, qAlpha(rgb32));
|
||||
return convertWithTable(rgb32, m_toLinear);
|
||||
}
|
||||
|
||||
QRgba64 toLinear(QRgba64 rgb64) const
|
||||
{
|
||||
ushort r = rgb64.red();
|
||||
ushort g = rgb64.green();
|
||||
ushort b = rgb64.blue();
|
||||
r = r - (r >> 8);
|
||||
g = g - (g >> 8);
|
||||
b = b - (b >> 8);
|
||||
r = m_toLinear[r >> 4];
|
||||
g = m_toLinear[g >> 4];
|
||||
b = m_toLinear[b >> 4];
|
||||
r = r + (r >> 8);
|
||||
g = g + (g >> 8);
|
||||
b = b + (b >> 8);
|
||||
return QRgba64::fromRgba64(r, g, b, rgb64.alpha());
|
||||
return convertWithTable(rgb64, m_toLinear);
|
||||
}
|
||||
|
||||
QRgb fromLinear64(QRgba64 rgb64) const
|
||||
{
|
||||
ushort r = rgb64.red();
|
||||
ushort g = rgb64.green();
|
||||
ushort b = rgb64.blue();
|
||||
#if defined(__SSE2__)
|
||||
__m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&rgb64));
|
||||
v = _mm_sub_epi16(v, _mm_srli_epi16(v, 8));
|
||||
const __m128i vidx = _mm_srli_epi16(v, 4);
|
||||
const int ridx = _mm_extract_epi16(vidx, 0);
|
||||
const int gidx = _mm_extract_epi16(vidx, 1);
|
||||
const int bidx = _mm_extract_epi16(vidx, 2);
|
||||
v = _mm_insert_epi16(v, m_fromLinear[ridx], 2);
|
||||
v = _mm_insert_epi16(v, m_fromLinear[gidx], 1);
|
||||
v = _mm_insert_epi16(v, m_fromLinear[bidx], 0);
|
||||
v = _mm_add_epi16(v, _mm_set1_epi16(0x80));
|
||||
v = _mm_srli_epi16(v, 8);
|
||||
v = _mm_packus_epi16(v, v);
|
||||
return _mm_cvtsi128_si32(v);
|
||||
#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
|
||||
uint16x4_t v = vreinterpret_u16_u64(vmov_n_u64(rgb64));
|
||||
v = vsub_u16(v, vshr_n_u16(v, 8));
|
||||
const uint16x4_t vidx = vshr_n_u16(v, 4);
|
||||
const int ridx = vget_lane_u16(vidx, 0);
|
||||
const int gidx = vget_lane_u16(vidx, 1);
|
||||
const int bidx = vget_lane_u16(vidx, 2);
|
||||
v = vset_lane_u16(m_fromLinear[ridx], v, 2);
|
||||
v = vset_lane_u16(m_fromLinear[gidx], v, 1);
|
||||
v = vset_lane_u16(m_fromLinear[bidx], v, 0);
|
||||
uint8x8_t v8 = vrshrn_n_u16(vcombine_u16(v, v), 8);
|
||||
return vget_lane_u32(vreinterpret_u32_u8(v8), 0);
|
||||
#else
|
||||
uint a = rgb64.alpha();
|
||||
uint r = rgb64.red();
|
||||
uint g = rgb64.green();
|
||||
uint b = rgb64.blue();
|
||||
a = a - (a >> 8);
|
||||
r = r - (r >> 8);
|
||||
g = g - (g >> 8);
|
||||
b = b - (b >> 8);
|
||||
a = (a + 0x80) >> 8;
|
||||
r = (m_fromLinear[r >> 4] + 0x80) >> 8;
|
||||
g = (m_fromLinear[g >> 4] + 0x80) >> 8;
|
||||
b = (m_fromLinear[b >> 4] + 0x80) >> 8;
|
||||
return qRgba(r, g, b, rgb64.alpha8());
|
||||
return (a << 24) | (r << 16) | (g << 8) | b;
|
||||
#endif
|
||||
}
|
||||
|
||||
QRgb fromLinear(QRgb rgb32) const
|
||||
{
|
||||
uchar r = (m_fromLinear[qRed(rgb32) << 4] + 0x80) >> 8;
|
||||
uchar g = (m_fromLinear[qGreen(rgb32) << 4] + 0x80) >> 8;
|
||||
uchar b = (m_fromLinear[qBlue(rgb32) << 4] + 0x80) >> 8;
|
||||
return qRgba(r, g, b, qAlpha(rgb32));
|
||||
return convertWithTable(rgb32, m_fromLinear);
|
||||
}
|
||||
|
||||
QRgba64 fromLinear(QRgba64 rgb64) const
|
||||
{
|
||||
return convertWithTable(rgb64, m_fromLinear);
|
||||
}
|
||||
|
||||
private:
|
||||
QColorProfile() { }
|
||||
|
||||
Q_ALWAYS_INLINE static QRgb convertWithTable(QRgb rgb32, const ushort *table)
|
||||
{
|
||||
const int r = (table[qRed(rgb32) << 4] + 0x80) >> 8;
|
||||
const int g = (table[qGreen(rgb32) << 4] + 0x80) >> 8;
|
||||
const int b = (table[qBlue(rgb32) << 4] + 0x80) >> 8;
|
||||
return (rgb32 & 0xff000000) | (r << 16) | (g << 8) | b;
|
||||
}
|
||||
Q_ALWAYS_INLINE static QRgba64 convertWithTable(QRgba64 rgb64, const ushort *table)
|
||||
{
|
||||
#if defined(__SSE2__)
|
||||
__m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&rgb64));
|
||||
v = _mm_sub_epi16(v, _mm_srli_epi16(v, 8));
|
||||
const __m128i vidx = _mm_srli_epi16(v, 4);
|
||||
const int ridx = _mm_extract_epi16(vidx, 2);
|
||||
const int gidx = _mm_extract_epi16(vidx, 1);
|
||||
const int bidx = _mm_extract_epi16(vidx, 0);
|
||||
v = _mm_insert_epi16(v, table[ridx], 2);
|
||||
v = _mm_insert_epi16(v, table[gidx], 1);
|
||||
v = _mm_insert_epi16(v, table[bidx], 0);
|
||||
v = _mm_add_epi16(v, _mm_srli_epi16(v, 8));
|
||||
QRgba64 rgba64;
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i *>(&rgba64), v);
|
||||
return rgba64;
|
||||
#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
|
||||
uint16x4_t v = vreinterpret_u16_u64(vmov_n_u64(rgb64));
|
||||
v = vsub_u16(v, vshr_n_u16(v, 8));
|
||||
const uint16x4_t vidx = vshr_n_u16(v, 4);
|
||||
const int ridx = vget_lane_u16(vidx, 2);
|
||||
const int gidx = vget_lane_u16(vidx, 1);
|
||||
const int bidx = vget_lane_u16(vidx, 0);
|
||||
v = vset_lane_u16(table[ridx], v, 2);
|
||||
v = vset_lane_u16(table[gidx], v, 1);
|
||||
v = vset_lane_u16(table[bidx], v, 0);
|
||||
v = vadd_u16(v, vshr_n_u16(v, 8));
|
||||
return QRgba64::fromRgba64(vget_lane_u64(vreinterpret_u64_u16(v), 0));
|
||||
#else
|
||||
ushort r = rgb64.red();
|
||||
ushort g = rgb64.green();
|
||||
ushort b = rgb64.blue();
|
||||
r = r - (r >> 8);
|
||||
g = g - (g >> 8);
|
||||
b = b - (b >> 8);
|
||||
r = m_fromLinear[r >> 4];
|
||||
g = m_fromLinear[g >> 4];
|
||||
b = m_fromLinear[b >> 4];
|
||||
r = table[r >> 4];
|
||||
g = table[g >> 4];
|
||||
b = table[b >> 4];
|
||||
r = r + (r >> 8);
|
||||
g = g + (g >> 8);
|
||||
b = b + (b >> 8);
|
||||
return QRgba64::fromRgba64(r, g, b, rgb64.alpha());
|
||||
#endif
|
||||
}
|
||||
|
||||
private:
|
||||
QColorProfile() { }
|
||||
|
||||
// We translate to 0-65280 (255*256) instead of 0-65535 to make simple
|
||||
// shifting an accurate conversion.
|
||||
// We translate from 0-4080 (255*16) for the same speed up, and to keep
|
||||
|
Loading…
Reference in New Issue
Block a user