Added SIMD integer operations optimizations

2016-05-30 15:38:47 +02:00 · 2016-05-30 15:38:47 +02:00 · 84caa1092f
commit 84caa1092f
parent 1bbc2935a2
2 changed files with 55 additions and 22 deletions
--- a/glm/detail/func_integer.inl
+++ b/glm/detail/func_integer.inl
@ -21,40 +21,36 @@ namespace detail
 		return Bits >= sizeof(T) * 8 ? ~static_cast<T>(0) : (static_cast<T>(1) << Bits) - static_cast<T>(1);
 	}

-	template <bool EXEC = false>
+	template <typename T, glm::precision P, template <typename, glm::precision> class vecType, bool EXEC = false> // T, P and vecType now belong to the struct instead of to call()
 	struct compute_bitfieldReverseStep // primary template: pass-through step, used when EXEC is false
 	{
-		template <typename T, glm::precision P, template <class, glm::precision> class vecType>
 		GLM_FUNC_QUALIFIER static vecType<T, P> call(vecType<T, P> const & v, T, T)
 		{
 			return v; // the two unnamed T arguments (mask and shift in the EXEC == true variant) are ignored
 		}
 	};

-	template <>
-	struct compute_bitfieldReverseStep<true>
+	template <typename T, glm::precision P, template <typename, glm::precision> class vecType>
+	struct compute_bitfieldReverseStep<T, P, vecType, true> // partial specialization selected when EXEC is true
 	{
-		template <typename T, glm::precision P, template <class, glm::precision> class vecType>
 		GLM_FUNC_QUALIFIER static vecType<T, P> call(vecType<T, P> const & v, T Mask, T Shift)
 		{
 			return (v & Mask) << Shift | (v & (~Mask)) >> Shift; // bits selected by Mask move up by Shift, all other bits move down by Shift
 		}
 	};

-	template <bool EXEC = false>
+	template <typename T, glm::precision P, template <typename, glm::precision> class vecType, bool EXEC = false> // T, P and vecType now belong to the struct instead of to call()
 	struct compute_bitfieldBitCountStep // primary template: pass-through step, used when EXEC is false
 	{
-		template <typename T, glm::precision P, template <class, glm::precision> class vecType>
 		GLM_FUNC_QUALIFIER static vecType<T, P> call(vecType<T, P> const & v, T, T)
 		{
 			return v; // the two unnamed T arguments (mask and shift in the EXEC == true variant) are ignored
 		}
 	};

-	template <>
-	struct compute_bitfieldBitCountStep<true>
+	template <typename T, glm::precision P, template <typename, glm::precision> class vecType>
+	struct compute_bitfieldBitCountStep<T, P, vecType, true>
 	{
-		template <typename T, glm::precision P, template <class, glm::precision> class vecType>
 		GLM_FUNC_QUALIFIER static vecType<T, P> call(vecType<T, P> const & v, T Mask, T Shift)
 		{
 			return (v & Mask) + ((v >> Shift) & Mask);
@ -293,12 +289,12 @@ namespace detail
 	GLM_FUNC_QUALIFIER vecType<T, P> bitfieldReverse(vecType<T, P> const & v)
 	{
 		vecType<T, P> x(v);
-		x = detail::compute_bitfieldReverseStep<sizeof(T) * 8 >=  2>::call(x, T(0x5555555555555555ull), static_cast<T>( 1));
-		x = detail::compute_bitfieldReverseStep<sizeof(T) * 8 >=  4>::call(x, T(0x3333333333333333ull), static_cast<T>( 2));
-		x = detail::compute_bitfieldReverseStep<sizeof(T) * 8 >=  8>::call(x, T(0x0F0F0F0F0F0F0F0Full), static_cast<T>( 4));
-		x = detail::compute_bitfieldReverseStep<sizeof(T) * 8 >= 16>::call(x, T(0x00FF00FF00FF00FFull), static_cast<T>( 8));
-		x = detail::compute_bitfieldReverseStep<sizeof(T) * 8 >= 32>::call(x, T(0x0000FFFF0000FFFFull), static_cast<T>(16));
-		x = detail::compute_bitfieldReverseStep<sizeof(T) * 8 >= 64>::call(x, T(0x00000000FFFFFFFFull), static_cast<T>(32));
+		x = detail::compute_bitfieldReverseStep<T, P, vecType, sizeof(T) * 8 >=  2>::call(x, T(0x5555555555555555ull), static_cast<T>( 1));
+		x = detail::compute_bitfieldReverseStep<T, P, vecType, sizeof(T) * 8 >=  4>::call(x, T(0x3333333333333333ull), static_cast<T>( 2));
+		x = detail::compute_bitfieldReverseStep<T, P, vecType, sizeof(T) * 8 >=  8>::call(x, T(0x0F0F0F0F0F0F0F0Full), static_cast<T>( 4));
+		x = detail::compute_bitfieldReverseStep<T, P, vecType, sizeof(T) * 8 >= 16>::call(x, T(0x00FF00FF00FF00FFull), static_cast<T>( 8));
+		x = detail::compute_bitfieldReverseStep<T, P, vecType, sizeof(T) * 8 >= 32>::call(x, T(0x0000FFFF0000FFFFull), static_cast<T>(16));
+		x = detail::compute_bitfieldReverseStep<T, P, vecType, sizeof(T) * 8 >= 64>::call(x, T(0x00000000FFFFFFFFull), static_cast<T>(32));
 		return x;
 	}

@ -313,12 +309,12 @@ namespace detail
 	GLM_FUNC_QUALIFIER vecType<int, P> bitCount(vecType<T, P> const & v)
 	{
 		vecType<typename detail::make_unsigned<T>::type, P> x(*reinterpret_cast<vecType<typename detail::make_unsigned<T>::type, P> const *>(&v));
-		x = detail::compute_bitfieldBitCountStep<sizeof(T) * 8 >=  2>::call(x, typename detail::make_unsigned<T>::type(0x5555555555555555ull), typename detail::make_unsigned<T>::type( 1));
-		x = detail::compute_bitfieldBitCountStep<sizeof(T) * 8 >=  4>::call(x, typename detail::make_unsigned<T>::type(0x3333333333333333ull), typename detail::make_unsigned<T>::type( 2));
-		x = detail::compute_bitfieldBitCountStep<sizeof(T) * 8 >=  8>::call(x, typename detail::make_unsigned<T>::type(0x0F0F0F0F0F0F0F0Full), typename detail::make_unsigned<T>::type( 4));
-		x = detail::compute_bitfieldBitCountStep<sizeof(T) * 8 >= 16>::call(x, typename detail::make_unsigned<T>::type(0x00FF00FF00FF00FFull), typename detail::make_unsigned<T>::type( 8));
-		x = detail::compute_bitfieldBitCountStep<sizeof(T) * 8 >= 32>::call(x, typename detail::make_unsigned<T>::type(0x0000FFFF0000FFFFull), typename detail::make_unsigned<T>::type(16));
-		x = detail::compute_bitfieldBitCountStep<sizeof(T) * 8 >= 64>::call(x, typename detail::make_unsigned<T>::type(0x00000000FFFFFFFFull), typename detail::make_unsigned<T>::type(32));
+		x = detail::compute_bitfieldBitCountStep<T, P, vecType, sizeof(T) * 8 >=  2>::call(x, typename detail::make_unsigned<T>::type(0x5555555555555555ull), typename detail::make_unsigned<T>::type( 1));
+		x = detail::compute_bitfieldBitCountStep<T, P, vecType, sizeof(T) * 8 >=  4>::call(x, typename detail::make_unsigned<T>::type(0x3333333333333333ull), typename detail::make_unsigned<T>::type( 2));
+		x = detail::compute_bitfieldBitCountStep<T, P, vecType, sizeof(T) * 8 >=  8>::call(x, typename detail::make_unsigned<T>::type(0x0F0F0F0F0F0F0F0Full), typename detail::make_unsigned<T>::type( 4));
+		x = detail::compute_bitfieldBitCountStep<T, P, vecType, sizeof(T) * 8 >= 16>::call(x, typename detail::make_unsigned<T>::type(0x00FF00FF00FF00FFull), typename detail::make_unsigned<T>::type( 8));
+		x = detail::compute_bitfieldBitCountStep<T, P, vecType, sizeof(T) * 8 >= 32>::call(x, typename detail::make_unsigned<T>::type(0x0000FFFF0000FFFFull), typename detail::make_unsigned<T>::type(16));
+		x = detail::compute_bitfieldBitCountStep<T, P, vecType, sizeof(T) * 8 >= 64>::call(x, typename detail::make_unsigned<T>::type(0x00000000FFFFFFFFull), typename detail::make_unsigned<T>::type(32));
 		return vecType<int, P>(x);
 	}

--- a/glm/detail/func_integer_simd.inl
+++ b/glm/detail/func_integer_simd.inl
@ -8,7 +8,44 @@
 namespace glm{
 namespace detail
 {
+	/// SSE2 specialization of one bitfieldReverse step for tvec4<uint32, P>.
+	/// Per 32-bit lane it computes ((v & Mask) << Shift) | ((v & ~Mask) >> Shift),
+	/// i.e. the same expression as the generic EXEC == true step.
+	/// @param v     vector whose `data` member is read as a __m128i
+	/// @param Mask  bit mask broadcast to all four lanes
+	/// @param Shift shift count applied to every lane
+	template <glm::precision P>
+	struct compute_bitfieldReverseStep<uint32, P, tvec4, true>
+	{
+		GLM_FUNC_QUALIFIER static tvec4<uint32, P> call(tvec4<uint32, P> const & v, uint32 Mask, uint32 Shift)
+		{
+			__m128i const set0 = v.data;

+			// (v & Mask) << Shift
+			__m128i const set1 = _mm_set1_epi32(static_cast<int>(Mask));
+			__m128i const and1 = _mm_and_si128(set0, set1);
+			__m128i const sft1 = _mm_slli_epi32(and1, static_cast<int>(Shift));
+
+			// (v & ~Mask) >> Shift
+			// _mm_andnot_si128(a, b) evaluates (~a) & b, so the mask goes first.
+			__m128i const and2 = _mm_andnot_si128(set1, set0);
+			// Lanes are unsigned: use the zero-filling shift, not the arithmetic one.
+			__m128i const sft2 = _mm_srli_epi32(and2, static_cast<int>(Shift));
+
+			__m128i const or0 = _mm_or_si128(sft1, sft2);
+
+			return or0;
+		}
+	};
+
+	/// SSE2 specialization of one bitCount step for tvec4<uint32, P>.
+	/// Per 32-bit lane it computes (v & Mask) + ((v >> Shift) & Mask),
+	/// i.e. the same expression as the generic EXEC == true step.
+	/// @param v     vector whose `data` member is read as a __m128i
+	/// @param Mask  bit mask broadcast to all four lanes
+	/// @param Shift shift count applied to every lane
+	template <glm::precision P>
+	struct compute_bitfieldBitCountStep<uint32, P, tvec4, true>
+	{
+		// call() is deliberately not a template: redeclaring P here would
+		// shadow the enclosing template parameter, which is ill-formed.
+		GLM_FUNC_QUALIFIER static tvec4<uint32, P> call(tvec4<uint32, P> const & v, uint32 Mask, uint32 Shift)
+		{
+			__m128i const set0 = v.data;
+
+			__m128i const set1 = _mm_set1_epi32(static_cast<int>(Mask));
+			__m128i const and0 = _mm_and_si128(set0, set1);
+			// Right (zero-filling) shift brings the upper field of each pair down.
+			__m128i const sft0 = _mm_srli_epi32(set0, static_cast<int>(Shift));
+			__m128i const and1 = _mm_and_si128(sft0, set1);
+			__m128i const add0 = _mm_add_epi32(and0, and1);
+
+			return add0;
+		}
+	};
 }//namespace detail

 #	if GLM_ARCH & GLM_ARCH_AVX_BIT