[+] ARM-based AuBitsReverse optimization

This commit is contained in:
Reece Wilson 2024-09-22 16:56:42 +01:00
parent cc69e9821c
commit 21f35e0bea

View File

@ -54,6 +54,22 @@ namespace __audetail
#endif
}
// Backend selection for the AuBitReverse family.
//
// On ARM targets the 32-bit reversal is done with the RBIT instruction, either
// through GCC/Clang inline assembly (AU_RBITS_ARM_GCC) or through the CMSIS
// __RBIT intrinsic (AU_RBITS_ARM_SMIS). A build may predefine
// AU_RBITS_ARM_SMIS to force the intrinsic path even on GCC/Clang.
//
// AU_RBITS_CONSTEXPR is the specifier put on every AuBitReverse declaration:
//  * non-ARM: `constexpr`.
//  * ARM:     `inline`. The ARM bodies use inline asm / an intrinsic, so they
//             are not declared constexpr. `constexpr` also implied `inline`,
//             and an explicit function template specialization without it is
//             an ordinary external-linkage definition, which would be multiply
//             defined once this file is included from more than one
//             translation unit. Expanding to `inline` keeps the linkage
//             behaviour of the constexpr build.
#if defined(AURORA_ARCH_ARM) || defined(AURORA_ARCH_ARM32) || defined(AURORA_ARCH_ARM64)
    #define AU_RBITS_CONSTEXPR inline

    #if !defined(AU_RBITS_ARM_SMIS)
        #if defined(AURORA_COMPILER_CLANG) || defined(AURORA_COMPILER_GCC)
            // GCC-style extended asm is available: emit `rbit` directly.
            #define AU_RBITS_ARM_GCC
        #else
            // https://github.com/ARM-software/CMSIS_5/blob/develop/CMSIS/Core/Include/cmsis_gcc.h
            // Assume a CMSIS-style __RBIT intrinsic is available.
            #define AU_RBITS_ARM_SMIS
        #endif
    #endif

    #define AU_RBITS_ARM
#endif

// Every other target keeps the fully constexpr implementation.
#if !defined(AU_RBITS_CONSTEXPR)
    #define AU_RBITS_CONSTEXPR constexpr
#endif
template <class T>
static auline bool AuTestBit(T value, AuUInt8 idx)
{
@ -375,24 +391,43 @@ static AuUInt8 AuPopCnt(T in)
}
// Primary template for bit-order reversal. It is only declared here; the
// definitions visible in this file are explicit specializations for AuUInt8,
// AuUInt16, AuUInt32 and AuUInt64.
// NOTE(review): the two declarator lines below look like the before/after
// sides of a diff (constexpr -> AU_RBITS_CONSTEXPR); presumably only the
// AU_RBITS_CONSTEXPR form is meant to remain - confirm.
template<typename T>
constexpr T AuBitReverse(T uBits);
AU_RBITS_CONSTEXPR T AuBitReverse(T uBits);
template <>
constexpr AuUInt8 AuBitReverse(AuUInt8 uBits)
AU_RBITS_CONSTEXPR AuUInt8 AuBitReverse(AuUInt8 uBits)
{
#if !defined(_AU_DO_NOT_USE_BITFLIP_U8)
#if defined(AU_RBITS_ARM)
AuUInt32 uOutput { AuBitReverse<AuUInt32>(uBits) };
return AuUInt8((uOutput >> 24) & 0xffu);
#endif
uBits = ((uBits >> 1) & 0x55u) | ((uBits << 1) & 0xAAu);
uBits = ((uBits >> 2) & 0x33u) | ((uBits << 2) & 0xCCu);
uBits = ((uBits >> 4) & 0x0Fu) | ((uBits << 4) & 0xF0u);
return uBits;
#endif
return (__audetail::kFlipBitsU4Lookup[uBits & 0b1111] << 4) |
(__audetail::kFlipBitsU4Lookup[uBits >> 4] );
}
template <>
constexpr AuUInt16 AuBitReverse(AuUInt16 uBits)
AU_RBITS_CONSTEXPR AuUInt16 AuBitReverse(AuUInt16 uBits)
{
#if !defined(_AU_DO_NOT_USE_BITFLIP_U16)
uBits = ((uBits >> 1) & 0x5555u) | ((uBits & 0x5555u) << 1);
uBits = ((uBits >> 2) & 0x3333u) | ((uBits & 0x3333u) << 2);
uBits = ((uBits >> 4) & 0x0f0fu) | ((uBits & 0x0f0fu) << 4);
uBits = ((uBits >> 8) & 0x00ffu) | ((uBits & 0x00ffu) << 8);
#if defined(AU_RBITS_ARM)
AuUInt32 uOutput { AuBitReverse<AuUInt32>(uBits) };
return AuUInt16((uOutput >> 16) & 0xffffu);
#endif
uBits = ((uBits & 0xAAAAu) >> 1) | ((uBits & 0x5555u) << 1);
uBits = ((uBits & 0xCCCCu) >> 2) | ((uBits & 0x3333u) << 2);
uBits = ((uBits & 0xF0F0u) >> 4) | ((uBits & 0x0F0Fu) << 4);
uBits = ((uBits & 0xFF00u) >> 8) | ((uBits & 0x00FFu) << 8);
return uBits;
#endif
@ -407,13 +442,22 @@ constexpr AuUInt16 AuBitReverse(AuUInt16 uBits)
}
template <>
constexpr AuUInt32 AuBitReverse(AuUInt32 uBits)
AU_RBITS_CONSTEXPR AuUInt32 AuBitReverse(AuUInt32 uBits)
{
#if !defined(_AU_DO_NOT_USE_BITFLIP_U32)
uBits = ((uBits >> 1) & 0x55555555u) | ((uBits & 0x55555555u) << 1);
uBits = ((uBits >> 2) & 0x33333333u) | ((uBits & 0x33333333u) << 2);
uBits = ((uBits >> 4) & 0x0f0f0f0fu) | ((uBits & 0x0f0f0f0fu) << 4);
uBits = ((uBits >> 8) & 0x00ff00ffu) | ((uBits & 0x00ff00ffu) << 8);
#if defined(AU_RBITS_ARM_SMIS)
return __RBIT(uBits);
#elif defined(AU_RBITS_ARM_GCC)
AuUInt32 uOutput {};
asm("rbit %0,%1" : "=r"(uOutput) : "r"(uBits));
return uOutput;
#endif
uBits = ((uBits & 0xAAAAAAAAu) >> 1) | ((uBits & 0x55555555u) << 1);
uBits = ((uBits & 0xCCCCCCCCu) >> 2) | ((uBits & 0x33333333u) << 2);
uBits = ((uBits & 0xF0F0F0F0u) >> 4) | ((uBits & 0x0F0F0F0Fu) << 4);
uBits = ((uBits & 0xFF00FF00u) >> 8) | ((uBits & 0x00FF00FFu) << 8);
uBits = ((uBits >> 16) & 0xffffu) | ((uBits & 0xffffu) << 16);
return uBits;
#endif
@ -434,20 +478,20 @@ constexpr AuUInt32 AuBitReverse(AuUInt32 uBits)
}
// 64-bit bit reversal built from two 32-bit reversals: the reversed upper
// half becomes the lower 32 bits of the result and the reversed lower half is
// shifted into the upper 32 bits.
// NOTE(review): the two signature lines below look like the before/after
// sides of a diff (constexpr -> AU_RBITS_CONSTEXPR); presumably only the
// AU_RBITS_CONSTEXPR form is meant to remain - confirm.
template <>
constexpr AuUInt64 AuBitReverse(AuUInt64 uBits)
AU_RBITS_CONSTEXPR AuUInt64 AuBitReverse(AuUInt64 uBits)
{
// Upper half -> reversed -> low word; lower half -> reversed -> high word.
return AuUInt64(AuBitReverse(AuUInt32((uBits >> 32ull) & 0xFFFFFFFFul))) |
AuUInt64(AuUInt64(AuBitReverse(AuUInt32(uBits & 0xFFFFFFFFul))) << 32UL);
}
// Alternative spelling of AuBitReverse: forwards uBits to AuBitReverse<T> and
// returns its result unchanged.
// NOTE(review): the two signature lines below look like the before/after
// sides of a diff (constexpr -> AU_RBITS_CONSTEXPR); presumably only the
// AU_RBITS_CONSTEXPR form is meant to remain - confirm.
template<typename T>
constexpr T AuBitsReverse(T uBits)
AU_RBITS_CONSTEXPR T AuBitsReverse(T uBits)
{
return AuBitReverse<T>(uBits);
}
// Alternative spelling of AuBitReverse: forwards uBits to AuBitReverse<T> and
// returns its result unchanged.
// NOTE(review): the two signature lines below look like the before/after
// sides of a diff (constexpr -> AU_RBITS_CONSTEXPR); presumably only the
// AU_RBITS_CONSTEXPR form is meant to remain - confirm.
template<typename T>
constexpr T AuReverseBits(T uBits)
AU_RBITS_CONSTEXPR T AuReverseBits(T uBits)
{
return AuBitReverse<T>(uBits);
}