Optimized findMSB and findLSB

This commit is contained in:
Christophe Riccio 2014-11-22 20:14:48 +01:00
parent 20bdab33dd
commit 0bffce4f4b
6 changed files with 1127 additions and 494 deletions

View File

@ -87,6 +87,117 @@ namespace detail
return (v & Mask) + ((v >> Shift) & Mask);
}
};
template <typename genIUType, size_t Bits>
struct compute_findLSB
{
GLM_FUNC_QUALIFIER static int call(genIUType Value)
{
if(Value == 0)
return -1;
return glm::bitCount(~Value & (Value - static_cast<genIUType>(1)));
}
};
# if (GLM_ARCH != GLM_ARCH_PURE) && (GLM_COMPILER & (GLM_COMPILER_VC | GLM_COMPILER_APPLE_CLANG | GLM_COMPILER_LLVM))
template <typename genIUType>
struct compute_findLSB<genIUType, 32>
{
GLM_FUNC_QUALIFIER static int call(genIUType Value)
{
unsigned long Result(0);
unsigned char IsNotNull = _BitScanForward(&Result, *reinterpret_cast<unsigned long*>(&Value));
return IsNotNull ? int(Result) : -1;
}
};
template <typename genIUType>
struct compute_findLSB<genIUType, 64>
{
GLM_FUNC_QUALIFIER static int call(genIUType Value)
{
unsigned long Result(0);
unsigned char IsNotNull = _BitScanForward64(&Result, *reinterpret_cast<unsigned __int64*>(&Value));
return IsNotNull ? int(Result) : -1;
}
};
# endif
template <typename T, glm::precision P, template <class, glm::precision> class vecType, bool EXEC = true>
struct compute_findMSB_step_vec
{
GLM_FUNC_QUALIFIER static vecType<T, P> call(vecType<T, P> const & x, T Shift)
{
return x | (x >> Shift);
}
};
template <typename T, glm::precision P, template <class, glm::precision> class vecType>
struct compute_findMSB_step_vec<T, P, vecType, false>
{
GLM_FUNC_QUALIFIER static vecType<T, P> call(vecType<T, P> const & x, T)
{
return x;
}
};
template <typename T, glm::precision P, template <class, glm::precision> class vecType, std::size_t>
struct compute_findMSB_vec
{
GLM_FUNC_QUALIFIER static vecType<int, P> call(vecType<T, P> const & vec)
{
vecType<T, P> x(vec);
x = compute_findMSB_step_vec<T, P, vecType, sizeof(T) * 8 >= 8>::call(x, static_cast<T>( 1));
x = compute_findMSB_step_vec<T, P, vecType, sizeof(T) * 8 >= 8>::call(x, static_cast<T>( 2));
x = compute_findMSB_step_vec<T, P, vecType, sizeof(T) * 8 >= 8>::call(x, static_cast<T>( 4));
x = compute_findMSB_step_vec<T, P, vecType, sizeof(T) * 8 >= 16>::call(x, static_cast<T>( 8));
x = compute_findMSB_step_vec<T, P, vecType, sizeof(T) * 8 >= 32>::call(x, static_cast<T>(16));
x = compute_findMSB_step_vec<T, P, vecType, sizeof(T) * 8 >= 64>::call(x, static_cast<T>(32));
return vecType<int, P>(sizeof(T) * 8 - 1) - glm::bitCount(~x);
}
};
# if (GLM_ARCH != GLM_ARCH_PURE) && (GLM_COMPILER & (GLM_COMPILER_VC | GLM_COMPILER_APPLE_CLANG | GLM_COMPILER_LLVM))
template <typename genIUType>
GLM_FUNC_QUALIFIER int compute_findMSB_32(genIUType Value)
{
unsigned long Result(0);
unsigned char IsNotNull = _BitScanReverse(&Result, *reinterpret_cast<unsigned long*>(&Value));
return IsNotNull ? int(Result) : -1;
}
template <typename genIUType>
GLM_FUNC_QUALIFIER int compute_findMSB_64(genIUType Value)
{
unsigned long Result(0);
unsigned char IsNotNull = _BitScanReverse64(&Result, *reinterpret_cast<unsigned __int64*>(&Value));
return IsNotNull ? int(Result) : -1;
}
template <typename T, glm::precision P, template <class, glm::precision> class vecType>
struct compute_findMSB_vec<T, P, vecType, 32>
{
GLM_FUNC_QUALIFIER static int call(vecType<T, P> const & x)
{
return detail::functor1<int, T, P, vecType>::call(compute_findMSB_32, x);
}
};
template <typename T, glm::precision P, template <class, glm::precision> class vecType>
struct compute_findMSB_vec<T, P, vecType, 64>
{
GLM_FUNC_QUALIFIER static int call(vecType<T, P> const & x)
{
return detail::functor1<int, T, P, vecType>::call(compute_findMSB_64, x);
}
};
# endif
}//namespace detail
// uaddCarry
@ -248,12 +359,8 @@ namespace detail
GLM_FUNC_QUALIFIER int findLSB(genIUType Value)
{
GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findLSB' only accept integer values");
if(Value == 0)
return -1;
genIUType Bit;
for(Bit = genIUType(0); !(Value & (1 << Bit)); ++Bit){}
return Bit;
return detail::compute_findLSB<genIUType, sizeof(genIUType) * 8>::call(Value);
}
template <typename T, precision P, template <typename, precision> class vecType>
@ -265,89 +372,19 @@ namespace detail
}
// findMSB
#if (GLM_ARCH != GLM_ARCH_PURE) && (GLM_COMPILER & GLM_COMPILER_VC)
template <typename genIUType>
GLM_FUNC_QUALIFIER int findMSB(genIUType Value)
{
GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findMSB' only accept integer values");
if(Value == 0)
return -1;
unsigned long Result(0);
_BitScanReverse(&Result, Value);
return int(Result);
}
/*
// __builtin_clz seems to be buggy as it crasks for some values, from 0x00200000 to 80000000
#elif((GLM_ARCH != GLM_ARCH_PURE) && (GLM_COMPILER & GLM_COMPILER_GCC) && (GLM_COMPILER >= GLM_COMPILER_GCC40))
template <typename genIUType>
GLM_FUNC_QUALIFIER int findMSB
(
genIUType const & Value
)
{
GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findMSB' only accept integer values");
if(Value == 0)
return -1;
// clz returns the number or trailing 0-bits; see
// http://gcc.gnu.org/onlinedocs/gcc-4.7.1/gcc/Other-Builtins.html
//
// NoteBecause __builtin_clz only works for unsigned ints, this
// implementation will not work for 64-bit integers.
//
return 31 - __builtin_clzl(Value);
}
*/
#else
/* SSE implementation idea
__m128i const Zero = _mm_set_epi32( 0, 0, 0, 0);
__m128i const One = _mm_set_epi32( 1, 1, 1, 1);
__m128i Bit = _mm_set_epi32(-1, -1, -1, -1);
__m128i Tmp = _mm_set_epi32(Value, Value, Value, Value);
__m128i Mmi = Zero;
for(int i = 0; i < 32; ++i)
{
__m128i Shilt = _mm_and_si128(_mm_cmpgt_epi32(Tmp, One), One);
Tmp = _mm_srai_epi32(Tmp, One);
Bit = _mm_add_epi32(Bit, _mm_and_si128(Shilt, i));
Mmi = _mm_and_si128(Mmi, One);
}
return Bit;
*/
template <typename genIUType>
GLM_FUNC_QUALIFIER int findMSB(genIUType Value)
GLM_FUNC_QUALIFIER int findMSB(genIUType x)
{
GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findMSB' only accept integer values");
if(Value == genIUType(0) || Value == genIUType(-1))
return -1;
else if(Value > 0)
{
genIUType Bit = genIUType(-1);
for(genIUType tmp = Value; tmp > 0; tmp >>= 1, ++Bit){}
return Bit;
}
else //if(Value < 0)
{
int const BitCount(sizeof(genIUType) * 8);
int MostSignificantBit(-1);
for(int BitIndex(0); BitIndex < BitCount; ++BitIndex)
MostSignificantBit = (Value & (1 << BitIndex)) ? MostSignificantBit : BitIndex;
assert(MostSignificantBit >= 0);
return MostSignificantBit;
}
return findMSB(tvec1<genIUType>(x)).x;
}
#endif//(GLM_COMPILER)
template <typename T, precision P, template <typename, precision> class vecType>
GLM_FUNC_QUALIFIER vecType<int, P> findMSB(vecType<T, P> const & x)
{
return detail::functor1<int, T, P, vecType>::call(findMSB, x);
GLM_STATIC_ASSERT(std::numeric_limits<T>::is_integer, "'findMSB' only accept integer values");
return detail::compute_findMSB_vec<T, P, vecType, sizeof(T) * 8>::call(x);
}
}//namespace glm

View File

@ -67,16 +67,16 @@ Improvements:
- Undetected C++ compiler automatically compile with GLM_FORCE_CXX98 and
GLM_FORCE_PURE
- Added not function (from GLSL specification) on VC12
- Optimized bitfield operations
- Optimized bitfieldReverse and bitCount functions
- Optimized findLSB and findMSB functions.
- Optimized matrix-vector multiple performance with Cuda #257, #258
- Reduced integer type redifinitions #233
- Rewrited of GTX_fast_trigonometry #264 #265
- Made types trivially copyable #263
- Removed <iostream> in GLM tests
- Used std features within GLM without redeclaring
- Optimized glm::cot #272
- Optimized glm::sign #272
- Optimized cot function #272
- Optimized sign function #272
Fixes:
- Fixed std::nextafter not supported with C++11 on Android #217

View File

@ -22,6 +22,7 @@ glmCreateTestGTC(core_func_geometric)
glmCreateTestGTC(core_func_integer)
glmCreateTestGTC(core_func_integer_bit_count)
glmCreateTestGTC(core_func_integer_find_lsb)
glmCreateTestGTC(core_func_integer_find_msb)
glmCreateTestGTC(core_func_matrix)
glmCreateTestGTC(core_func_noise)
glmCreateTestGTC(core_func_packing)

View File

@ -8,6 +8,7 @@
///////////////////////////////////////////////////////////////////////////////////////////////////
#include <glm/integer.hpp>
#include <glm/vector_relational.hpp>
#include <glm/gtc/vec1.hpp>
#include <vector>
#include <ctime>
@ -555,6 +556,19 @@ namespace findMSB
genType Return;
};
template <typename genIUType>
GLM_FUNC_QUALIFIER int findMSB_intrinsic(genIUType Value)
{
GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findMSB' only accept integer values");
if(Value == 0)
return -1;
unsigned long Result(0);
_BitScanReverse(&Result, Value);
return int(Result);
}
template <typename genIUType>
GLM_FUNC_QUALIFIER int findMSB_095(genIUType Value)
{
@ -583,27 +597,17 @@ namespace findMSB
GLM_FUNC_QUALIFIER int findMSB_nlz1(genIUType x)
{
GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findMSB' only accept integer values");
/*
int Result = 0;
for(std::size_t i = 0, n = sizeof(genIUType) * 8; i < n; ++i)
Result = Value & static_cast<genIUType>(1 << i) ? static_cast<int>(i) : Result;
return Result;
*/
/*
genIUType Bit = genIUType(-1);
for(genIUType tmp = Value; tmp > 0; tmp >>= 1, ++Bit){}
return Bit;
*/
int n;
if (x == 0) return(32);
n = 0;
if (x == 0)
return -1;
int n = 0;
if (x <= 0x0000FFFF) {n = n +16; x = x <<16;}
if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;}
if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;}
if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;}
if (x <= 0x7FFFFFFF) {n = n + 1;}
return n;
return 31 - n;
}
int findMSB_nlz2(unsigned int x)
@ -617,69 +621,20 @@ namespace findMSB
y = x >> 4; if (y != 0) {n = n - 4; x = y;}
y = x >> 2; if (y != 0) {n = n - 2; x = y;}
y = x >> 1; if (y != 0) return n - 2;
return n - x;
return 32 - (n - x);
}
int perf_950()
int findMSB_pop(unsigned int x)
{
type<glm::uint> const Data[] =
{
//{0x00000000, -1},
{0x00000001, 0},
{0x00000002, 1},
{0x00000003, 1},
{0x00000004, 2},
{0x00000005, 2},
{0x00000007, 2},
{0x00000008, 3},
{0x00000010, 4},
{0x00000020, 5},
{0x00000040, 6},
{0x00000080, 7},
{0x00000100, 8},
{0x00000200, 9},
{0x00000400, 10},
{0x00000800, 11},
{0x00001000, 12},
{0x00002000, 13},
{0x00004000, 14},
{0x00008000, 15},
{0x00010000, 16},
{0x00020000, 17},
{0x00040000, 18},
{0x00080000, 19},
{0x00100000, 20},
{0x00200000, 21},
{0x00400000, 22},
{0x00800000, 23},
{0x01000000, 24},
{0x02000000, 25},
{0x04000000, 26},
{0x08000000, 27},
{0x10000000, 28},
{0x20000000, 29},
{0x40000000, 30}
};
int Error(0);
std::clock_t Timestamps1 = std::clock();
for(std::size_t k = 0; k < 1000000; ++k)
for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
{
int Result = findMSB_095(Data[i].Value);
Error += Data[i].Return == Result ? 0 : 1;
}
std::clock_t Timestamps2 = std::clock();
std::printf("findMSB - 0.9.5: %d clocks\n", static_cast<unsigned int>(Timestamps2 - Timestamps1));
return Error;
x = x | (x >> 1);
x = x | (x >> 2);
x = x | (x >> 4);
x = x | (x >> 8);
x = x | (x >>16);
return 31 - glm::bitCount(~x);
}
int perf_ops()
int perf_int()
{
type<int> const Data[] =
{
@ -721,10 +676,20 @@ namespace findMSB
};
int Error(0);
std::size_t const Count(1000000);
std::clock_t Timestamps0 = std::clock();
for(std::size_t k = 0; k < Count; ++k)
for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
{
int Result = glm::findMSB(Data[i].Value);
Error += Data[i].Return == Result ? 0 : 1;
}
std::clock_t Timestamps1 = std::clock();
for(std::size_t k = 0; k < 1000000; ++k)
for(std::size_t k = 0; k < Count; ++k)
for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
{
int Result = findMSB_nlz1(Data[i].Value);
@ -733,70 +698,109 @@ namespace findMSB
std::clock_t Timestamps2 = std::clock();
for(std::size_t k = 0; k < Count; ++k)
for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
{
int Result = findMSB_nlz2(Data[i].Value);
Error += Data[i].Return == Result ? 0 : 1;
}
std::clock_t Timestamps3 = std::clock();
for(std::size_t k = 0; k < Count; ++k)
for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
{
int Result = findMSB_095(Data[i].Value);
Error += Data[i].Return == Result ? 0 : 1;
}
std::clock_t Timestamps4 = std::clock();
for(std::size_t k = 0; k < Count; ++k)
for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
{
int Result = findMSB_intrinsic(Data[i].Value);
Error += Data[i].Return == Result ? 0 : 1;
}
std::clock_t Timestamps5 = std::clock();
for(std::size_t k = 0; k < Count; ++k)
for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
{
int Result = findMSB_pop(Data[i].Value);
Error += Data[i].Return == Result ? 0 : 1;
}
std::clock_t Timestamps6 = std::clock();
std::printf("glm::findMSB: %d clocks\n", static_cast<unsigned int>(Timestamps1 - Timestamps0));
std::printf("findMSB - nlz1: %d clocks\n", static_cast<unsigned int>(Timestamps2 - Timestamps1));
std::printf("findMSB - nlz2: %d clocks\n", static_cast<unsigned int>(Timestamps3 - Timestamps2));
std::printf("findMSB - 0.9.5: %d clocks\n", static_cast<unsigned int>(Timestamps4 - Timestamps3));
std::printf("findMSB - intrinsics: %d clocks\n", static_cast<unsigned int>(Timestamps5 - Timestamps4));
std::printf("findMSB - pop: %d clocks\n", static_cast<unsigned int>(Timestamps6 - Timestamps5));
return Error;
}
int test_findMSB()
int test_ivec4()
{
type<glm::uint> const Data[] =
type<glm::ivec4> const Data[] =
{
//{0x00000000, -1},
{0x00000001, 0},
{0x00000002, 1},
{0x00000003, 1},
{0x00000004, 2},
{0x00000005, 2},
{0x00000007, 2},
{0x00000008, 3},
{0x00000010, 4},
{0x00000020, 5},
{0x00000040, 6},
{0x00000080, 7},
{0x00000100, 8},
{0x00000200, 9},
{0x00000400, 10},
{0x00000800, 11},
{0x00001000, 12},
{0x00002000, 13},
{0x00004000, 14},
{0x00008000, 15},
{0x00010000, 16},
{0x00020000, 17},
{0x00040000, 18},
{0x00080000, 19},
{0x00100000, 20},
{0x00200000, 21},
{0x00400000, 22},
{0x00800000, 23},
{0x01000000, 24},
{0x02000000, 25},
{0x04000000, 26},
{0x08000000, 27},
{0x10000000, 28},
{0x20000000, 29},
{0x40000000, 30}
{glm::ivec4(0x00000000), glm::ivec4(-1)},
{glm::ivec4(0x00000001), glm::ivec4( 0)},
{glm::ivec4(0x00000002), glm::ivec4( 1)},
{glm::ivec4(0x00000003), glm::ivec4( 1)},
{glm::ivec4(0x00000004), glm::ivec4( 2)},
{glm::ivec4(0x00000005), glm::ivec4( 2)},
{glm::ivec4(0x00000007), glm::ivec4( 2)},
{glm::ivec4(0x00000008), glm::ivec4( 3)},
{glm::ivec4(0x00000010), glm::ivec4( 4)},
{glm::ivec4(0x00000020), glm::ivec4( 5)},
{glm::ivec4(0x00000040), glm::ivec4( 6)},
{glm::ivec4(0x00000080), glm::ivec4( 7)},
{glm::ivec4(0x00000100), glm::ivec4( 8)},
{glm::ivec4(0x00000200), glm::ivec4( 9)},
{glm::ivec4(0x00000400), glm::ivec4(10)},
{glm::ivec4(0x00000800), glm::ivec4(11)},
{glm::ivec4(0x00001000), glm::ivec4(12)},
{glm::ivec4(0x00002000), glm::ivec4(13)},
{glm::ivec4(0x00004000), glm::ivec4(14)},
{glm::ivec4(0x00008000), glm::ivec4(15)},
{glm::ivec4(0x00010000), glm::ivec4(16)},
{glm::ivec4(0x00020000), glm::ivec4(17)},
{glm::ivec4(0x00040000), glm::ivec4(18)},
{glm::ivec4(0x00080000), glm::ivec4(19)},
{glm::ivec4(0x00100000), glm::ivec4(20)},
{glm::ivec4(0x00200000), glm::ivec4(21)},
{glm::ivec4(0x00400000), glm::ivec4(22)},
{glm::ivec4(0x00800000), glm::ivec4(23)},
{glm::ivec4(0x01000000), glm::ivec4(24)},
{glm::ivec4(0x02000000), glm::ivec4(25)},
{glm::ivec4(0x04000000), glm::ivec4(26)},
{glm::ivec4(0x08000000), glm::ivec4(27)},
{glm::ivec4(0x10000000), glm::ivec4(28)},
{glm::ivec4(0x20000000), glm::ivec4(29)},
{glm::ivec4(0x40000000), glm::ivec4(30)}
};
int Error(0);
for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<glm::ivec4>); ++i)
{
int Result = glm::findMSB(Data[i].Value);
Error += Data[i].Return == Result ? 0 : 1;
assert(!Error);
glm::ivec4 Result0 = glm::findMSB(Data[i].Value);
Error += glm::all(glm::equal(Data[i].Return, Result0)) ? 0 : 1;
}
return Error;
}
int test_nlz1()
int test_int()
{
type<glm::uint> const Data[] =
{
//{0x00000000, -1},
{0x00000000, -1},
{0x00000001, 0},
{0x00000002, 1},
{0x00000003, 1},
@ -837,8 +841,38 @@ namespace findMSB
for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
{
int Result = findMSB_nlz2(Data[i].Value);
Error += Data[i].Return == Result ? 0 : 1;
int Result0 = glm::findMSB(Data[i].Value);
Error += Data[i].Return == Result0 ? 0 : 1;
}
for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
{
int Result0 = findMSB_nlz1(Data[i].Value);
Error += Data[i].Return == Result0 ? 0 : 1;
}
/*
for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
{
int Result0 = findMSB_nlz2(Data[i].Value);
Error += Data[i].Return == Result0 ? 0 : 1;
}
*/
for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
{
int Result0 = findMSB_095(Data[i].Value);
Error += Data[i].Return == Result0 ? 0 : 1;
}
for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
{
int Result0 = findMSB_intrinsic(Data[i].Value);
Error += Data[i].Return == Result0 ? 0 : 1;
}
for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<int>); ++i)
{
int Result0 = findMSB_pop(Data[i].Value);
Error += Data[i].Return == Result0 ? 0 : 1;
}
return Error;
@ -848,8 +882,8 @@ namespace findMSB
{
int Error(0);
Error += test_findMSB();
//Error += test_nlz1();
Error += test_ivec4();
Error += test_int();
return Error;
}
@ -858,8 +892,7 @@ namespace findMSB
{
int Error(0);
Error += perf_950();
//Error += perf_ops();
Error += perf_int();
return Error;
}
@ -878,10 +911,60 @@ namespace findLSB
{
{0x00000001, 0},
{0x00000003, 0},
{0x00000002, 1}
{0x00000002, 1},
{0x80000000, 31},
{0x00010000, 16},
{0xFFFF0000, 16},
{0xFF000000, 24},
{0xFF00FF00, 8},
{0x00000000, -1}
};
int test()
template <typename genIUType>
GLM_FUNC_QUALIFIER int findLSB_intrinsic(genIUType Value)
{
GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findLSB' only accept integer values");
if(Value == 0)
return -1;
unsigned long Result(0);
_BitScanForward(&Result, Value);
return int(Result);
}
template <typename genIUType>
GLM_FUNC_QUALIFIER int findLSB_095(genIUType Value)
{
GLM_STATIC_ASSERT(std::numeric_limits<genIUType>::is_integer, "'findLSB' only accept integer values");
if(Value == 0)
return -1;
genIUType Bit;
for(Bit = genIUType(0); !(Value & (1 << Bit)); ++Bit){}
return Bit;
}
template <typename genIUType>
GLM_FUNC_QUALIFIER int findLSB_ntz2(genIUType x)
{
if(x == 0)
return -1;
return glm::bitCount(~x & (x - static_cast<genIUType>(1)));
}
template <typename genIUType>
GLM_FUNC_QUALIFIER int findLSB_branchfree(genIUType x)
{
bool IsNull(x == 0);
int const Keep(!IsNull);
int const Discard(IsNull);
return static_cast<int>(glm::bitCount(~x & (x - static_cast<genIUType>(1)))) * Keep + Discard * -1;
}
int test_int()
{
int Error(0);
@ -889,9 +972,111 @@ namespace findLSB
{
int Result = glm::findLSB(DataI32[i].Value);
Error += DataI32[i].Return == Result ? 0 : 1;
assert(!Error);
}
for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
{
int Result = findLSB_095(DataI32[i].Value);
Error += DataI32[i].Return == Result ? 0 : 1;
}
for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
{
int Result = findLSB_intrinsic(DataI32[i].Value);
Error += DataI32[i].Return == Result ? 0 : 1;
}
for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
{
int Result = findLSB_ntz2(DataI32[i].Value);
Error += DataI32[i].Return == Result ? 0 : 1;
}
for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
{
int Result = findLSB_branchfree(DataI32[i].Value);
Error += DataI32[i].Return == Result ? 0 : 1;
}
return Error;
}
int test()
{
int Error(0);
Error += test_int();
return Error;
}
int perf_int()
{
int Error(0);
std::size_t const Count(10000000);
std::clock_t Timestamps0 = std::clock();
for(std::size_t k = 0; k < Count; ++k)
for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
{
int Result = glm::findLSB(DataI32[i].Value);
Error += DataI32[i].Return == Result ? 0 : 1;
}
std::clock_t Timestamps1 = std::clock();
for(std::size_t k = 0; k < Count; ++k)
for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
{
int Result = findLSB_095(DataI32[i].Value);
Error += DataI32[i].Return == Result ? 0 : 1;
}
std::clock_t Timestamps2 = std::clock();
for(std::size_t k = 0; k < Count; ++k)
for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
{
int Result = findLSB_intrinsic(DataI32[i].Value);
Error += DataI32[i].Return == Result ? 0 : 1;
}
std::clock_t Timestamps3 = std::clock();
for(std::size_t k = 0; k < Count; ++k)
for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
{
int Result = findLSB_ntz2(DataI32[i].Value);
Error += DataI32[i].Return == Result ? 0 : 1;
}
std::clock_t Timestamps4 = std::clock();
for(std::size_t k = 0; k < Count; ++k)
for(std::size_t i = 0; i < sizeof(DataI32) / sizeof(type<int>); ++i)
{
int Result = findLSB_branchfree(DataI32[i].Value);
Error += DataI32[i].Return == Result ? 0 : 1;
}
std::clock_t Timestamps5 = std::clock();
std::printf("glm::findLSB: %d clocks\n", static_cast<unsigned int>(Timestamps1 - Timestamps0));
std::printf("findLSB - 0.9.5: %d clocks\n", static_cast<unsigned int>(Timestamps2 - Timestamps1));
std::printf("findLSB - intrinsics: %d clocks\n", static_cast<unsigned int>(Timestamps3 - Timestamps2));
std::printf("findLSB - ntz2: %d clocks\n", static_cast<unsigned int>(Timestamps4 - Timestamps3));
std::printf("findLSB - branchfree: %d clocks\n", static_cast<unsigned int>(Timestamps5 - Timestamps4));
return Error;
}
int perf()
{
int Error(0);
Error += perf_int();
return Error;
}
}//findLSB
@ -1324,6 +1509,7 @@ int main()
Error += ::bitCount::perf();
Error += ::bitfieldReverse::perf();
Error += ::findMSB::perf();
Error += ::findLSB::perf();
# endif
return Error;

View File

@ -7,123 +7,14 @@
// File : test/core/func_integer_find_lsb.cpp
///////////////////////////////////////////////////////////////////////////////////////////////////
// This has the programs for computing the number of leading zeros
// This has the programs for computing the number of trailing zeros
// in a word.
// Max line length is 57, to fit in hacker.book.
// Compile with g++, not gcc.
#include <cstdio>
#include <cstdlib> // To define "exit", req'd by XLC.
#include <cstdlib> //To define "exit", req'd by XLC.
#include <ctime>
#define LE 1 // 1 for little-endian, 0 for big-endian.
int pop(unsigned x) {
x = x - ((x >> 1) & 0x55555555);
x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
x = (x + (x >> 4)) & 0x0F0F0F0F;
x = x + (x << 8);
x = x + (x << 16);
return x >> 24;
}
int nlz1(unsigned x) {
int n;
if (x == 0) return(32);
n = 0;
if (x <= 0x0000FFFF) {n = n +16; x = x <<16;}
if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;}
if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;}
if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;}
if (x <= 0x7FFFFFFF) {n = n + 1;}
return n;
}
int nlz1a(unsigned x) {
int n;
/* if (x == 0) return(32); */
if ((int)x <= 0) return (~x >> 26) & 32;
n = 1;
if ((x >> 16) == 0) {n = n +16; x = x <<16;}
if ((x >> 24) == 0) {n = n + 8; x = x << 8;}
if ((x >> 28) == 0) {n = n + 4; x = x << 4;}
if ((x >> 30) == 0) {n = n + 2; x = x << 2;}
n = n - (x >> 31);
return n;
}
// On basic Risc, 12 to 20 instructions.
int nlz2(unsigned x) {
unsigned y;
int n;
n = 32;
y = x >>16; if (y != 0) {n = n -16; x = y;}
y = x >> 8; if (y != 0) {n = n - 8; x = y;}
y = x >> 4; if (y != 0) {n = n - 4; x = y;}
y = x >> 2; if (y != 0) {n = n - 2; x = y;}
y = x >> 1; if (y != 0) return n - 2;
return n - x;
}
// As above but coded as a loop for compactness:
// 23 to 33 basic Risc instructions.
int nlz2a(unsigned x) {
unsigned y;
int n, c;
n = 32;
c = 16;
do {
y = x >> c; if (y != 0) {n = n - c; x = y;}
c = c >> 1;
} while (c != 0);
return n - x;
}
int nlz3(int x) {
int y, n;
n = 0;
y = x;
L: if (x < 0) return n;
if (y == 0) return 32 - n;
n = n + 1;
x = x << 1;
y = y >> 1;
goto L;
}
int nlz4(unsigned x) {
int y, m, n;
y = -(x >> 16); // If left half of x is 0,
m = (y >> 16) & 16; // set n = 16. If left half
n = 16 - m; // is nonzero, set n = 0 and
x = x >> m; // shift x right 16.
// Now x is of the form 0000xxxx.
y = x - 0x100; // If positions 8-15 are 0,
m = (y >> 16) & 8; // add 8 to n and shift x left 8.
n = n + m;
x = x << m;
y = x - 0x1000; // If positions 12-15 are 0,
m = (y >> 16) & 4; // add 4 to n and shift x left 4.
n = n + m;
x = x << m;
y = x - 0x4000; // If positions 14-15 are 0,
m = (y >> 16) & 2; // add 2 to n and shift x left 2.
n = n + m;
x = x << m;
y = x >> 14; // Set y = 0, 1, 2, or 3.
m = y & ~(y >> 1); // Set m = 0, 1, 2, or 2 resp.
return n + 2 - m;
}
int nlz5(unsigned x) {
int nlz(unsigned x) {
int pop(unsigned x);
x = x | (x >> 1);
@ -134,172 +25,239 @@ int nlz5(unsigned x) {
return pop(~x);
}
/* The four programs below are not valid ANSI C programs. This is
because they refer to the same storage locations as two different types.
However, they work with xlc/AIX, gcc/AIX, and gcc/NT. If you try to
code them more compactly by declaring a variable xx to be "double," and
then using
int pop(unsigned x) {
x = x - ((x >> 1) & 0x55555555);
x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
x = (x + (x >> 4)) & 0x0F0F0F0F;
x = x + (x << 8);
x = x + (x << 16);
return x >> 24;
}
n = 1054 - (*((unsigned *)&xx + LE) >> 20);
int ntz1(unsigned x) {
return 32 - nlz(~x & (x-1));
}
then you are violating not only the rule above, but also the ANSI C
rule that pointer arithmetic can be performed only on pointers to
array elements.
When coded with the above statement, the program fails with xlc,
gcc/AIX, and gcc/NT, at some optimization levels.
BTW, these programs use the "anonymous union" feature of C++, not
available in C. */
int ntz2(unsigned x) {
return pop(~x & (x - 1));
}
int nlz6(unsigned k) {
union {
unsigned asInt[2];
double asDouble;
};
int ntz3(unsigned x) {
int n;
asDouble = (double)k + 0.5;
n = 1054 - (asInt[LE] >> 20);
if (x == 0) return(32);
n = 1;
if ((x & 0x0000FFFF) == 0) {n = n +16; x = x >>16;}
if ((x & 0x000000FF) == 0) {n = n + 8; x = x >> 8;}
if ((x & 0x0000000F) == 0) {n = n + 4; x = x >> 4;}
if ((x & 0x00000003) == 0) {n = n + 2; x = x >> 2;}
return n - (x & 1);
}
int ntz4(unsigned x) {
unsigned y;
int n;
if (x == 0) return 32;
n = 31;
y = x <<16; if (y != 0) {n = n -16; x = y;}
y = x << 8; if (y != 0) {n = n - 8; x = y;}
y = x << 4; if (y != 0) {n = n - 4; x = y;}
y = x << 2; if (y != 0) {n = n - 2; x = y;}
y = x << 1; if (y != 0) {n = n - 1;}
return n;
}
int nlz7(unsigned k) {
union {
unsigned asInt[2];
double asDouble;
};
int ntz4a(unsigned x) {
unsigned y;
int n;
asDouble = (double)k;
n = 1054 - (asInt[LE] >> 20);
n = (n & 31) + (n >> 9);
if (x == 0) return 32;
n = 31;
y = x <<16; if (y != 0) {n = n -16; x = y;}
y = x << 8; if (y != 0) {n = n - 8; x = y;}
y = x << 4; if (y != 0) {n = n - 4; x = y;}
y = x << 2; if (y != 0) {n = n - 2; x = y;}
n = n - ((x << 1) >> 31);
return n;
}
/* In single precision, round-to-nearest mode, the basic method fails for:
k = 0, k = 01FFFFFF, 03FFFFFE <= k <= 03FFFFFF,
07FFFFFC <= k <= 07FFFFFF,
0FFFFFF8 <= k <= 0FFFFFFF,
...
7FFFFFC0 <= k <= 7FFFFFFF.
FFFFFF80 <= k <= FFFFFFFF.
For k = 0 it gives 158, and for the other values it is too low by 1. */
int nlz8(unsigned k) {
union {
unsigned asInt;
float asFloat;
};
int n;
k = k & ~(k >> 1); /* Fix problem with rounding. */
asFloat = (float)k + 0.5f;
n = 158 - (asInt >> 23);
return n;
int ntz5(char x)
{
if (x & 15) {
if (x & 3) {
if (x & 1) return 0;
else return 1;
}
else if (x & 4) return 2;
else return 3;
}
else if (x & 0x30) {
if (x & 0x10) return 4;
else return 5;
}
else if (x & 0x40) return 6;
else if (x) return 7;
else return 8;
}
/* The example below shows how to make a macro for nlz. It uses an
extension to the C and C++ languages that is provided by the GNU C/C++
compiler, namely, that of allowing statements and declarations in
expressions (see "Using and Porting GNU CC", by Richard M. Stallman
(1998). The underscores are necessary to protect against the
possibility that the macro argument will conflict with one of its local
variables, e.g., NLZ(k). */
int nlz9(unsigned k) {
union {
unsigned asInt;
float asFloat;
};
int ntz6(unsigned x) {
int n;
k = k & ~(k >> 1); /* Fix problem with rounding. */
asFloat = (float)k;
n = 158 - (asInt >> 23);
n = (n & 31) + (n >> 6); /* Fix problem with k = 0. */
return n;
x = ~x & (x - 1);
n = 0; // n = 32;
while(x != 0) { // while (x != 0) {
n = n + 1; // n = n - 1;
x = x >> 1; // x = x + x;
} // }
return n; // return n;
}
/* Below are three nearly equivalent programs for computing the number
of leading zeros in a word. This material is not in HD, but may be in a
future edition.
Immediately below is Robert Harley's algorithm, found at the
comp.arch newsgroup entry dated 7/12/96, pointed out to me by Norbert
Juffa.
Table entries marked "u" are unused. 14 ops including a multiply,
plus an indexed load.
The smallest multiplier that works is 0x045BCED1 = 17*65*129*513 (all
of form 2**k + 1). There are no multipliers of three terms of the form
2**k +- 1 that work, with a table size of 64 or 128. There are some,
with a table size of 64, if you precede the multiplication with x = x -
(x >> 1), but that seems less elegant. There are also some if you use a
table size of 256, the smallest is 0x01033CBF = 65*255*1025 (this would
save two instructions in the form of this algorithm with the
multiplication expanded into shifts and adds, but the table size is
getting a bit large). */
int ntz6a(unsigned x)
{
int n = 32;
while (x != 0) {
n = n - 1;
x = x + x;
}
return n;
}
/* Dean Gaudet's algorithm. To be most useful there must be a good way
to evaluate the C "conditional expression" (a?b:c construction) without
branching. The result of a?b:c is b if a is true (nonzero), and c if a
is false (0).
For example, a compare to zero op that sets a target GPR to 1 if the
operand is 0, and to 0 if the operand is nonzero, will do it. With this
instruction, the algorithm is entirely branch-free. But the most
interesting thing about it is the high degree of parallelism. All six
lines with conditional expressions can be executed in parallel (on a
machine with sufficient computational units).
Although the instruction count is 30 measured statically, it could
execute in only 10 cycles on a machine with sufficient parallelism.
The first two uses of y can instead be x, which would increase the
useful parallelism on most machines (the assignments to y, bz, and b4
could then all run in parallel). */
int ntz7(unsigned x)
{
unsigned y, bz, b4, b3, b2, b1, b0;
y = x & -x; // Isolate rightmost 1-bit.
bz = y ? 0 : 1; // 1 if y = 0.
b4 = (y & 0x0000FFFF) ? 0 : 16;
b3 = (y & 0x00FF00FF) ? 0 : 8;
b2 = (y & 0x0F0F0F0F) ? 0 : 4;
b1 = (y & 0x33333333) ? 0 : 2;
b0 = (y & 0x55555555) ? 0 : 1;
return bz + b4 + b3 + b2 + b1 + b0;
}
int ntz7_christophe(unsigned x)
{
unsigned y, bz, b4, b3, b2, b1, b0;
y = x & -x; // Isolate rightmost 1-bit.
bz = unsigned(!bool(y)); // 1 if y = 0.
b4 = unsigned(!bool(y & 0x0000FFFF)) * 16;
b3 = unsigned(!bool(y & 0x00FF00FF)) * 8;
b2 = unsigned(!bool(y & 0x0F0F0F0F)) * 4;
b1 = unsigned(!bool(y & 0x33333333)) * 2;
b0 = unsigned(!bool(y & 0x55555555)) * 1;
return bz + b4 + b3 + b2 + b1 + b0;
}
/* Below is David Seal's algorithm, found at
http://www.ciphersbyritter.com/NEWS4/BITCT.HTM Table
entries marked "u" are unused. 6 ops including a
multiply, plus an indexed load. */
#define u 99
int nlz10(unsigned x) {
int ntz8(unsigned x)
{
static char table[64] =
{32, 0, 1,12, 2, 6, u,13, 3, u, 7, u, u, u, u,14,
10, 4, u, u, 8, u, u,25, u, u, u, u, u,21,27,15,
31,11, 5, u, u, u, u, u, 9, u, u,24, u, u,20,26,
30, u, u, u, u,23, u,19, 29, u,22,18,28,17,16, u};
static char table[64] =
{32,31, u,16, u,30, 3, u, 15, u, u, u,29,10, 2, u,
u, u,12,14,21, u,19, u, u,28, u,25, u, 9, 1, u,
17, u, 4, u, u, u,11, u, 13,22,20, u,26, u, u,18,
5, u, u,23, u,27, u, 6, u,24, 7, u, 8, u, 0, u};
x = x | (x >> 1); // Propagate leftmost
x = x | (x >> 2); // 1-bit to the right.
x = x | (x >> 4);
x = x | (x >> 8);
x = x | (x >>16);
x = x*0x06EB14F9; // Multiplier is 7*255**3.
return table[x >> 26];
x = (x & -x)*0x0450FBAF;
return table[x >> 26];
}
/* Harley's algorithm with multiply expanded.
19 elementary ops plus an indexed load. */
/* Seal's algorithm with multiply expanded.
9 elementary ops plus an indexed load. */
int nlz10a(unsigned x) {
int ntz8a(unsigned x)
{
static char table[64] =
{32, 0, 1,12, 2, 6, u,13, 3, u, 7, u, u, u, u,14,
10, 4, u, u, 8, u, u,25, u, u, u, u, u,21,27,15,
31,11, 5, u, u, u, u, u, 9, u, u,24, u, u,20,26,
30, u, u, u, u,23, u,19, 29, u,22,18,28,17,16, u};
static char table[64] =
{32,31, u,16, u,30, 3, u, 15, u, u, u,29,10, 2, u,
u, u,12,14,21, u,19, u, u,28, u,25, u, 9, 1, u,
17, u, 4, u, u, u,11, u, 13,22,20, u,26, u, u,18,
5, u, u,23, u,27, u, 6, u,24, 7, u, 8, u, 0, u};
x = x | (x >> 1); // Propagate leftmost
x = x | (x >> 2); // 1-bit to the right.
x = x | (x >> 4);
x = x | (x >> 8);
x = x | (x >> 16);
x = (x << 3) - x; // Multiply by 7.
x = (x << 8) - x; // Multiply by 255.
x = (x << 8) - x; // Again.
x = (x << 8) - x; // Again.
return table[x >> 26];
x = (x & -x);
x = (x << 4) + x; // x = x*17.
x = (x << 6) + x; // x = x*65.
x = (x << 16) - x; // x = x*65535.
return table[x >> 26];
}
/* Julius Goryavsky's version of Harley's algorithm.
17 elementary ops plus an indexed load, if the machine
has "and not." */
/* Reiser's algorithm. Three ops including a "remainder,"
plus an indexed load. */
int nlz10b(unsigned x) {
int ntz9(unsigned x) {
static char table[64] =
{32,20,19, u, u,18, u, 7, 10,17, u, u,14, u, 6, u,
u, 9, u,16, u, u, 1,26, u,13, u, u,24, 5, u, u,
u,21, u, 8,11, u,15, u, u, u, u, 2,27, 0,25, u,
22, u,12, u, u, 3,28, u, 23, u, 4,29, u, u,30,31};
static char table[37] = {32, 0, 1, 26, 2, 23, 27,
u, 3, 16, 24, 30, 28, 11, u, 13, 4,
7, 17, u, 25, 22, 31, 15, 29, 10, 12,
6, u, 21, 14, 9, 5, 20, 8, 19, 18};
x = x | (x >> 1); // Propagate leftmost
x = x | (x >> 2); // 1-bit to the right.
x = x | (x >> 4);
x = x | (x >> 8);
x = x & ~(x >> 16);
x = x*0xFD7049FF; // Activate this line or the following 3.
// x = (x << 9) - x; // Multiply by 511.
// x = (x << 11) - x; // Multiply by 2047.
// x = (x << 14) - x; // Multiply by 16383.
return table[x >> 26];
x = (x & -x)%37;
return table[x];
}
/* Using a de Bruijn sequence. This is a table lookup with a 32-entry
table. The de Bruijn sequence used here is
0000 0100 1101 0111 0110 0101 0001 1111,
obtained from Danny Dube's October 3, 1997, posting in
comp.compression.research. Thanks to Norbert Juffa for this reference. */
int ntz10(unsigned x) {
static char table[32] =
{ 0, 1, 2,24, 3,19, 6,25, 22, 4,20,10,16, 7,12,26,
31,23,18, 5,21, 9,15,11, 30,17, 8,14,29,13,28,27};
if (x == 0) return 32;
x = (x & -x)*0x04D7651F;
return table[x >> 27];
}
/* Norbert Juffa's code, answer to exercise 1 of Chapter 5 (2nd ed). */
#define SLOW_MUL
int ntz11 (unsigned int n) {
static unsigned char tab[32] =
{ 0, 1, 2, 24, 3, 19, 6, 25,
22, 4, 20, 10, 16, 7, 12, 26,
31, 23, 18, 5, 21, 9, 15, 11,
30, 17, 8, 14, 29, 13, 28, 27
};
unsigned int k;
n = n & (-n); /* isolate lsb */
printf("n = %d\n", n);
#if defined(SLOW_MUL)
k = (n << 11) - n;
k = (k << 2) + k;
k = (k << 8) + n;
k = (k << 5) - k;
#else
k = n * 0x4d7651f;
#endif
return n ? tab[k>>27] : 32;
}
int errors;
@ -308,19 +266,22 @@ void error(int x, int y) {
printf("Error for x = %08x, got %d\n", x, y);
}
/* ------------------------------ main ------------------------------ */
int main()
{
# ifdef GLM_TEST_ENABLE_PERF
int i, n;
static unsigned test[] = {0,32, 1,31, 2,30, 3,30, 4,29, 5,29, 6,29,
7,29, 8,28, 9,28, 16,27, 32,26, 64,25, 128,24, 255,24, 256,23,
512,22, 1024,21, 2048,20, 4096,19, 8192,18, 16384,17, 32768,16,
65536,15, 0x20000,14, 0x40000,13, 0x80000,12, 0x100000,11,
0x200000,10, 0x400000,9, 0x800000,8, 0x1000000,7, 0x2000000,6,
0x4000000,5, 0x8000000,4, 0x0FFFFFFF,4, 0x10000000,3,
0x3000FFFF,2, 0x50003333,1, 0x7FFFFFFF,1, 0x80000000,0,
0xFFFFFFFF,0};
int i, m, n;
static unsigned test[] = {0,32, 1,0, 2,1, 3,0, 4,2, 5,0, 6,1, 7,0,
8,3, 9,0, 16,4, 32,5, 64,6, 128,7, 255,0, 256,8, 512,9, 1024,10,
2048,11, 4096,12, 8192,13, 16384,14, 32768,15, 65536,16,
0x20000,17, 0x40000,18, 0x80000,19, 0x100000,20, 0x200000,21,
0x400000,22, 0x800000,23, 0x1000000,24, 0x2000000,25,
0x4000000,26, 0x8000000,27, 0x10000000,28, 0x20000000,29,
0x40000000,30, 0x80000000,31, 0xFFFFFFF0,4, 0x3000FF00,8,
0xC0000000,30, 0x60000000,29, 0x00011000, 12};
std::size_t const Count = 10000000;
n = sizeof(test)/4;
@ -331,114 +292,115 @@ int main()
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz1(test[i]) != test[i+1]) error(test[i], nlz1(test[i]));}
if (ntz1(test[i]) != test[i+1]) error(test[i], ntz1(test[i]));}
TimestampEnd = std::clock();
printf("nlz1: %d clocks\n", TimestampEnd - TimestampBeg);
printf("ntz1: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz1a(test[i]) != test[i+1]) error(test[i], nlz1a(test[i]));}
if (ntz2(test[i]) != test[i+1]) error(test[i], ntz2(test[i]));}
TimestampEnd = std::clock();
printf("nlz1a: %d clocks\n", TimestampEnd - TimestampBeg);
printf("ntz2: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz2(test[i]) != test[i+1]) error(test[i], nlz2(test[i]));}
if (ntz3(test[i]) != test[i+1]) error(test[i], ntz3(test[i]));}
TimestampEnd = std::clock();
printf("nlz2: %d clocks\n", TimestampEnd - TimestampBeg);
printf("ntz3: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz2a(test[i]) != test[i+1]) error(test[i], nlz2a(test[i]));}
if (ntz4(test[i]) != test[i+1]) error(test[i], ntz4(test[i]));}
TimestampEnd = std::clock();
printf("nlz2a: %d clocks\n", TimestampEnd - TimestampBeg);
printf("ntz4: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz3(test[i]) != test[i+1]) error(test[i], nlz3(test[i]));}
if (ntz4a(test[i]) != test[i+1]) error(test[i], ntz4a(test[i]));}
TimestampEnd = std::clock();
printf("nlz3: %d clocks\n", TimestampEnd - TimestampBeg);
printf("ntz4a: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz4(test[i]) != test[i+1]) error(test[i], nlz4(test[i]));}
m = test[i+1]; if (m > 8) m = 8;
if (ntz5(test[i]) != m) error(test[i], ntz5(test[i]));}
TimestampEnd = std::clock();
printf("nlz4: %d clocks\n", TimestampEnd - TimestampBeg);
printf("ntz5: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz5(test[i]) != test[i+1]) error(test[i], nlz5(test[i]));}
if (ntz6(test[i]) != test[i+1]) error(test[i], ntz6(test[i]));}
TimestampEnd = std::clock();
printf("nlz5: %d clocks\n", TimestampEnd - TimestampBeg);
printf("ntz6: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz6(test[i]) != test[i+1]) error(test[i], nlz6(test[i]));}
if (ntz6a(test[i]) != test[i+1]) error(test[i], ntz6a(test[i]));}
TimestampEnd = std::clock();
printf("nlz6: %d clocks\n", TimestampEnd - TimestampBeg);
printf("ntz6a: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz7(test[i]) != test[i+1]) error(test[i], nlz7(test[i]));}
if (ntz7(test[i]) != test[i+1]) error(test[i], ntz7(test[i]));}
TimestampEnd = std::clock();
printf("nlz7: %d clocks\n", TimestampEnd - TimestampBeg);
printf("ntz7: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz8(test[i]) != test[i+1]) error(test[i], nlz8(test[i]));}
if (ntz7_christophe(test[i]) != test[i+1]) error(test[i], ntz7(test[i]));}
TimestampEnd = std::clock();
printf("nlz8: %d clocks\n", TimestampEnd - TimestampBeg);
printf("ntz7_christophe: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz9(test[i]) != test[i+1]) error(test[i], nlz9(test[i]));}
if (ntz8(test[i]) != test[i+1]) error(test[i], ntz8(test[i]));}
TimestampEnd = std::clock();
printf("nlz9: %d clocks\n", TimestampEnd - TimestampBeg);
printf("ntz8: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz10(test[i]) != test[i+1]) error(test[i], nlz10(test[i]));}
if (ntz8a(test[i]) != test[i+1]) error(test[i], ntz8a(test[i]));}
TimestampEnd = std::clock();
printf("nlz10: %d clocks\n", TimestampEnd - TimestampBeg);
printf("ntz8a: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz10a(test[i]) != test[i+1]) error(test[i], nlz10a(test[i]));}
if (ntz9(test[i]) != test[i+1]) error(test[i], ntz9(test[i]));}
TimestampEnd = std::clock();
printf("nlz10a: %d clocks\n", TimestampEnd - TimestampBeg);
printf("ntz9: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz10b(test[i]) != test[i+1]) error(test[i], nlz10b(test[i]));}
if (ntz10(test[i]) != test[i+1]) error(test[i], ntz10(test[i]));}
TimestampEnd = std::clock();
printf("nlz10b: %d clocks\n", TimestampEnd - TimestampBeg);
printf("ntz10: %d clocks\n", TimestampEnd - TimestampBeg);
if (errors == 0)
printf("Passed all %d cases.\n", sizeof(test)/8);

View File

@ -0,0 +1,447 @@
///////////////////////////////////////////////////////////////////////////////////////////////////
// OpenGL Mathematics Copyright (c) 2005 - 2014 G-Truc Creation (www.g-truc.net)
///////////////////////////////////////////////////////////////////////////////////////////////////
// Created : 2014-10-27
// Updated : 2014-10-27
// Licence : This source is under MIT licence
// File : test/core/func_integer_find_lsb.cpp
///////////////////////////////////////////////////////////////////////////////////////////////////
// This has the programs for computing the number of leading zeros
// in a word.
// Max line length is 57, to fit in hacker.book.
// Compile with g++, not gcc.
#include <cstdio>
#include <cstdlib> // To define "exit", req'd by XLC.
#include <ctime>
#define LE 1 // 1 for little-endian, 0 for big-endian.
int pop(unsigned x) {
x = x - ((x >> 1) & 0x55555555);
x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
x = (x + (x >> 4)) & 0x0F0F0F0F;
x = x + (x << 8);
x = x + (x << 16);
return x >> 24;
}
int nlz1(unsigned x) {
int n;
if (x == 0) return(32);
n = 0;
if (x <= 0x0000FFFF) {n = n +16; x = x <<16;}
if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;}
if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;}
if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;}
if (x <= 0x7FFFFFFF) {n = n + 1;}
return n;
}
int nlz1a(unsigned x) {
int n;
/* if (x == 0) return(32); */
if ((int)x <= 0) return (~x >> 26) & 32;
n = 1;
if ((x >> 16) == 0) {n = n +16; x = x <<16;}
if ((x >> 24) == 0) {n = n + 8; x = x << 8;}
if ((x >> 28) == 0) {n = n + 4; x = x << 4;}
if ((x >> 30) == 0) {n = n + 2; x = x << 2;}
n = n - (x >> 31);
return n;
}
// On basic Risc, 12 to 20 instructions.
int nlz2(unsigned x) {
unsigned y;
int n;
n = 32;
y = x >>16; if (y != 0) {n = n -16; x = y;}
y = x >> 8; if (y != 0) {n = n - 8; x = y;}
y = x >> 4; if (y != 0) {n = n - 4; x = y;}
y = x >> 2; if (y != 0) {n = n - 2; x = y;}
y = x >> 1; if (y != 0) return n - 2;
return n - x;
}
// As above but coded as a loop for compactness:
// 23 to 33 basic Risc instructions.
int nlz2a(unsigned x) {
unsigned y;
int n, c;
n = 32;
c = 16;
do {
y = x >> c; if (y != 0) {n = n - c; x = y;}
c = c >> 1;
} while (c != 0);
return n - x;
}
int nlz3(int x) {
int y, n;
n = 0;
y = x;
L: if (x < 0) return n;
if (y == 0) return 32 - n;
n = n + 1;
x = x << 1;
y = y >> 1;
goto L;
}
int nlz4(unsigned x) {
int y, m, n;
y = -(x >> 16); // If left half of x is 0,
m = (y >> 16) & 16; // set n = 16. If left half
n = 16 - m; // is nonzero, set n = 0 and
x = x >> m; // shift x right 16.
// Now x is of the form 0000xxxx.
y = x - 0x100; // If positions 8-15 are 0,
m = (y >> 16) & 8; // add 8 to n and shift x left 8.
n = n + m;
x = x << m;
y = x - 0x1000; // If positions 12-15 are 0,
m = (y >> 16) & 4; // add 4 to n and shift x left 4.
n = n + m;
x = x << m;
y = x - 0x4000; // If positions 14-15 are 0,
m = (y >> 16) & 2; // add 2 to n and shift x left 2.
n = n + m;
x = x << m;
y = x >> 14; // Set y = 0, 1, 2, or 3.
m = y & ~(y >> 1); // Set m = 0, 1, 2, or 2 resp.
return n + 2 - m;
}
int nlz5(unsigned x) {
int pop(unsigned x);
x = x | (x >> 1);
x = x | (x >> 2);
x = x | (x >> 4);
x = x | (x >> 8);
x = x | (x >>16);
return pop(~x);
}
/* The four programs below are not valid ANSI C programs. This is
because they refer to the same storage locations as two different types.
However, they work with xlc/AIX, gcc/AIX, and gcc/NT. If you try to
code them more compactly by declaring a variable xx to be "double," and
then using
n = 1054 - (*((unsigned *)&xx + LE) >> 20);
then you are violating not only the rule above, but also the ANSI C
rule that pointer arithmetic can be performed only on pointers to
array elements.
When coded with the above statement, the program fails with xlc,
gcc/AIX, and gcc/NT, at some optimization levels.
BTW, these programs use the "anonymous union" feature of C++, not
available in C. */
int nlz6(unsigned k) {
union {
unsigned asInt[2];
double asDouble;
};
int n;
asDouble = (double)k + 0.5;
n = 1054 - (asInt[LE] >> 20);
return n;
}
int nlz7(unsigned k) {
union {
unsigned asInt[2];
double asDouble;
};
int n;
asDouble = (double)k;
n = 1054 - (asInt[LE] >> 20);
n = (n & 31) + (n >> 9);
return n;
}
/* In single precision, round-to-nearest mode, the basic method fails for:
k = 0, k = 01FFFFFF, 03FFFFFE <= k <= 03FFFFFF,
07FFFFFC <= k <= 07FFFFFF,
0FFFFFF8 <= k <= 0FFFFFFF,
...
7FFFFFC0 <= k <= 7FFFFFFF.
FFFFFF80 <= k <= FFFFFFFF.
For k = 0 it gives 158, and for the other values it is too low by 1. */
int nlz8(unsigned k) {
union {
unsigned asInt;
float asFloat;
};
int n;
k = k & ~(k >> 1); /* Fix problem with rounding. */
asFloat = (float)k + 0.5f;
n = 158 - (asInt >> 23);
return n;
}
/* The example below shows how to make a macro for nlz. It uses an
extension to the C and C++ languages that is provided by the GNU C/C++
compiler, namely, that of allowing statements and declarations in
expressions (see "Using and Porting GNU CC", by Richard M. Stallman
(1998). The underscores are necessary to protect against the
possibility that the macro argument will conflict with one of its local
variables, e.g., NLZ(k). */
int nlz9(unsigned k) {
union {
unsigned asInt;
float asFloat;
};
int n;
k = k & ~(k >> 1); /* Fix problem with rounding. */
asFloat = (float)k;
n = 158 - (asInt >> 23);
n = (n & 31) + (n >> 6); /* Fix problem with k = 0. */
return n;
}
/* Below are three nearly equivalent programs for computing the number
of leading zeros in a word. This material is not in HD, but may be in a
future edition.
Immediately below is Robert Harley's algorithm, found at the
comp.arch newsgroup entry dated 7/12/96, pointed out to me by Norbert
Juffa.
Table entries marked "u" are unused. 14 ops including a multiply,
plus an indexed load.
The smallest multiplier that works is 0x045BCED1 = 17*65*129*513 (all
of form 2**k + 1). There are no multipliers of three terms of the form
2**k +- 1 that work, with a table size of 64 or 128. There are some,
with a table size of 64, if you precede the multiplication with x = x -
(x >> 1), but that seems less elegant. There are also some if you use a
table size of 256, the smallest is 0x01033CBF = 65*255*1025 (this would
save two instructions in the form of this algorithm with the
multiplication expanded into shifts and adds, but the table size is
getting a bit large). */
#define u 99
int nlz10(unsigned x) {
static char table[64] =
{32,31, u,16, u,30, 3, u, 15, u, u, u,29,10, 2, u,
u, u,12,14,21, u,19, u, u,28, u,25, u, 9, 1, u,
17, u, 4, u, u, u,11, u, 13,22,20, u,26, u, u,18,
5, u, u,23, u,27, u, 6, u,24, 7, u, 8, u, 0, u};
x = x | (x >> 1); // Propagate leftmost
x = x | (x >> 2); // 1-bit to the right.
x = x | (x >> 4);
x = x | (x >> 8);
x = x | (x >>16);
x = x*0x06EB14F9; // Multiplier is 7*255**3.
return table[x >> 26];
}
/* Harley's algorithm with multiply expanded.
19 elementary ops plus an indexed load. */
int nlz10a(unsigned x) {
static char table[64] =
{32,31, u,16, u,30, 3, u, 15, u, u, u,29,10, 2, u,
u, u,12,14,21, u,19, u, u,28, u,25, u, 9, 1, u,
17, u, 4, u, u, u,11, u, 13,22,20, u,26, u, u,18,
5, u, u,23, u,27, u, 6, u,24, 7, u, 8, u, 0, u};
x = x | (x >> 1); // Propagate leftmost
x = x | (x >> 2); // 1-bit to the right.
x = x | (x >> 4);
x = x | (x >> 8);
x = x | (x >> 16);
x = (x << 3) - x; // Multiply by 7.
x = (x << 8) - x; // Multiply by 255.
x = (x << 8) - x; // Again.
x = (x << 8) - x; // Again.
return table[x >> 26];
}
/* Julius Goryavsky's version of Harley's algorithm.
17 elementary ops plus an indexed load, if the machine
has "and not." */
int nlz10b(unsigned x) {
static char table[64] =
{32,20,19, u, u,18, u, 7, 10,17, u, u,14, u, 6, u,
u, 9, u,16, u, u, 1,26, u,13, u, u,24, 5, u, u,
u,21, u, 8,11, u,15, u, u, u, u, 2,27, 0,25, u,
22, u,12, u, u, 3,28, u, 23, u, 4,29, u, u,30,31};
x = x | (x >> 1); // Propagate leftmost
x = x | (x >> 2); // 1-bit to the right.
x = x | (x >> 4);
x = x | (x >> 8);
x = x & ~(x >> 16);
x = x*0xFD7049FF; // Activate this line or the following 3.
// x = (x << 9) - x; // Multiply by 511.
// x = (x << 11) - x; // Multiply by 2047.
// x = (x << 14) - x; // Multiply by 16383.
return table[x >> 26];
}
int errors;
void error(int x, int y) {
errors = errors + 1;
printf("Error for x = %08x, got %d\n", x, y);
}
int main()
{
# ifdef GLM_TEST_ENABLE_PERF
int i, n;
static unsigned test[] = {0,32, 1,31, 2,30, 3,30, 4,29, 5,29, 6,29,
7,29, 8,28, 9,28, 16,27, 32,26, 64,25, 128,24, 255,24, 256,23,
512,22, 1024,21, 2048,20, 4096,19, 8192,18, 16384,17, 32768,16,
65536,15, 0x20000,14, 0x40000,13, 0x80000,12, 0x100000,11,
0x200000,10, 0x400000,9, 0x800000,8, 0x1000000,7, 0x2000000,6,
0x4000000,5, 0x8000000,4, 0x0FFFFFFF,4, 0x10000000,3,
0x3000FFFF,2, 0x50003333,1, 0x7FFFFFFF,1, 0x80000000,0,
0xFFFFFFFF,0};
std::size_t const Count = 10000000;
n = sizeof(test)/4;
std::clock_t TimestampBeg = 0;
std::clock_t TimestampEnd = 0;
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz1(test[i]) != test[i+1]) error(test[i], nlz1(test[i]));}
TimestampEnd = std::clock();
printf("nlz1: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz1a(test[i]) != test[i+1]) error(test[i], nlz1a(test[i]));}
TimestampEnd = std::clock();
printf("nlz1a: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz2(test[i]) != test[i+1]) error(test[i], nlz2(test[i]));}
TimestampEnd = std::clock();
printf("nlz2: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz2a(test[i]) != test[i+1]) error(test[i], nlz2a(test[i]));}
TimestampEnd = std::clock();
printf("nlz2a: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz3(test[i]) != test[i+1]) error(test[i], nlz3(test[i]));}
TimestampEnd = std::clock();
printf("nlz3: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz4(test[i]) != test[i+1]) error(test[i], nlz4(test[i]));}
TimestampEnd = std::clock();
printf("nlz4: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz5(test[i]) != test[i+1]) error(test[i], nlz5(test[i]));}
TimestampEnd = std::clock();
printf("nlz5: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz6(test[i]) != test[i+1]) error(test[i], nlz6(test[i]));}
TimestampEnd = std::clock();
printf("nlz6: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz7(test[i]) != test[i+1]) error(test[i], nlz7(test[i]));}
TimestampEnd = std::clock();
printf("nlz7: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz8(test[i]) != test[i+1]) error(test[i], nlz8(test[i]));}
TimestampEnd = std::clock();
printf("nlz8: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz9(test[i]) != test[i+1]) error(test[i], nlz9(test[i]));}
TimestampEnd = std::clock();
printf("nlz9: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz10(test[i]) != test[i+1]) error(test[i], nlz10(test[i]));}
TimestampEnd = std::clock();
printf("nlz10: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz10a(test[i]) != test[i+1]) error(test[i], nlz10a(test[i]));}
TimestampEnd = std::clock();
printf("nlz10a: %d clocks\n", TimestampEnd - TimestampBeg);
TimestampBeg = std::clock();
for (std::size_t k = 0; k < Count; ++k)
for (i = 0; i < n; i += 2) {
if (nlz10b(test[i]) != test[i+1]) error(test[i], nlz10b(test[i]));}
TimestampEnd = std::clock();
printf("nlz10b: %d clocks\n", TimestampEnd - TimestampBeg);
if (errors == 0)
printf("Passed all %d cases.\n", sizeof(test)/8);
# endif//GLM_TEST_ENABLE_PERF
}