Aarch64: vectorize ascii de-/encoding.
This works only on Aarch64, because the vaddv instruction is only available on 64bit ARM. Doing something equivalent on 32bit ARM has the high chance to run into micro-architecture differences: on an Cortex-a8, transferring a single vector element from NEON to the regular CPU registers takes 20 cycles(!). Change-Id: Iccbfe84da82abb9b10f3f3dc35c8b950df69e251 Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
This commit is contained in:
parent
7c264643f6
commit
923d869b0f
@ -52,7 +52,8 @@ enum { Endian = 0, Data = 1 };
|
||||
|
||||
static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };
|
||||
|
||||
#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)
|
||||
#if (defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)) \
|
||||
|| (defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64))
|
||||
static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) Q_DECL_NOTHROW
|
||||
{
|
||||
uint result = qCountLeadingZeroBits(v);
|
||||
@ -62,7 +63,9 @@ static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) Q_DECL_NOTHROW
|
||||
result ^= sizeof(unsigned) * 8 - 1;
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)
|
||||
static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
|
||||
{
|
||||
// do sixteen characters at a time
|
||||
@ -149,6 +152,74 @@ static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const
|
||||
}
|
||||
return src == end;
|
||||
}
|
||||
#elif defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64) // vaddv is only available on Aarch64
|
||||
static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
|
||||
{
|
||||
uint16x8_t maxAscii = vdupq_n_u16(0x7f);
|
||||
uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
|
||||
uint16x8_t mask2 = vshlq_n_u16(mask1, 1);
|
||||
|
||||
// do sixteen characters at a time
|
||||
for ( ; end - src >= 16; src += 16, dst += 16) {
|
||||
// load 2 lanes (or: "load interleaved")
|
||||
uint16x8x2_t in = vld2q_u16(src);
|
||||
|
||||
// check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc),
|
||||
// add those together into a scalar, and merge the scalars.
|
||||
uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1))
|
||||
| vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2));
|
||||
|
||||
// merge the two lanes by shifting the values of the second by 8 and inserting them
|
||||
uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8);
|
||||
|
||||
// store, even if there are non-ASCII characters here
|
||||
vst1q_u8(dst, vreinterpretq_u8_u16(out));
|
||||
|
||||
if (nonAscii) {
|
||||
// find the next probable ASCII character
|
||||
// we don't want to load 32 bytes again in this loop if we know there are non-ASCII
|
||||
// characters still coming
|
||||
nextAscii = src + qBitScanReverse(nonAscii) + 1;
|
||||
|
||||
nonAscii = qCountTrailingZeroBits(nonAscii);
|
||||
dst += nonAscii;
|
||||
src += nonAscii;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return src == end;
|
||||
}
|
||||
|
||||
static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
|
||||
{
|
||||
// do eight characters at a time
|
||||
uint8x8_t msb_mask = vdup_n_u8(0x80);
|
||||
uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
|
||||
for ( ; end - src >= 8; src += 8, dst += 8) {
|
||||
uint8x8_t c = vld1_u8(src);
|
||||
uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
|
||||
if (!n) {
|
||||
// store
|
||||
vst1q_u16(dst, vmovl_u8(c));
|
||||
continue;
|
||||
}
|
||||
|
||||
// copy the front part that is still ASCII
|
||||
while (!(n & 1)) {
|
||||
*dst++ = *src++;
|
||||
n >>= 1;
|
||||
}
|
||||
|
||||
// find the next probable ASCII character
|
||||
// we don't want to load 16 bytes again in this loop if we know there are non-ASCII
|
||||
// characters still coming
|
||||
n = qBitScanReverse(n);
|
||||
nextAscii = src + n + 1;
|
||||
return false;
|
||||
|
||||
}
|
||||
return src == end;
|
||||
}
|
||||
#else
|
||||
static inline bool simdEncodeAscii(uchar *, const ushort *, const ushort *, const ushort *)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user