From c57a0007912bf18423d69eda62b9c5a5eec51d3f Mon Sep 17 00:00:00 2001 From: Christophe Riccio Date: Fri, 1 Feb 2013 23:55:25 +0100 Subject: [PATCH] Added bitfieldInterleave implementation --- glm/core/intrinsic_integer.hpp | 50 +++ glm/core/intrinsic_integer.inl | 140 +++++++++ glm/gtx/bit.inl | 117 ++++++- readme.txt | 1 + test/gtx/gtx_bit.cpp | 555 +++++++++++++-------------------- 5 files changed, 524 insertions(+), 339 deletions(-) create mode 100644 glm/core/intrinsic_integer.hpp create mode 100644 glm/core/intrinsic_integer.inl diff --git a/glm/core/intrinsic_integer.hpp b/glm/core/intrinsic_integer.hpp new file mode 100644 index 00000000..7bc04016 --- /dev/null +++ b/glm/core/intrinsic_integer.hpp @@ -0,0 +1,50 @@ +/////////////////////////////////////////////////////////////////////////////////// +/// OpenGL Mathematics (glm.g-truc.net) +/// +/// Copyright (c) 2005 - 2012 G-Truc Creation (www.g-truc.net) +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in +/// all copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +/// THE SOFTWARE. +/// +/// @ref core +/// @file glm/core/intrinsic_integer.hpp +/// @date 2009-05-11 / 2011-06-15 +/// @author Christophe Riccio +/////////////////////////////////////////////////////////////////////////////////// + +#ifndef glm_detail_intrinsic_integer +#define glm_detail_intrinsic_integer + +#include "setup.hpp" + +#if(!(GLM_ARCH & GLM_ARCH_SSE2)) +# error "SSE2 instructions not supported or enabled" +#else + +namespace glm{ +namespace detail +{ + __m128i _mm_bit_interleave_si128(__m128i x); // single register: one operand per 64-bit lane of x + __m128i _mm_bit_interleave_si128(__m128i x, __m128i y); // operands taken from the low 64 bits of x and y + +}//namespace detail +}//namespace glm + +#include "intrinsic_integer.inl" + +#endif//GLM_ARCH +#endif//glm_detail_intrinsic_integer diff --git a/glm/core/intrinsic_integer.inl b/glm/core/intrinsic_integer.inl new file mode 100644 index 00000000..047fe9af --- /dev/null +++ b/glm/core/intrinsic_integer.inl @@ -0,0 +1,140 @@ +/////////////////////////////////////////////////////////////////////////////////// +/// OpenGL Mathematics (glm.g-truc.net) +/// +/// Copyright (c) 2005 - 2012 G-Truc Creation (www.g-truc.net) +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in +/// all copies or substantial portions of the Software.
+/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +/// THE SOFTWARE. +/// +/// @ref core +/// @file glm/core/intrinsic_integer.inl +/// @date 2009-05-08 / 2011-06-15 +/// @author Christophe Riccio +/////////////////////////////////////////////////////////////////////////////////// + +namespace glm{ +namespace detail +{ + inline __m128i _mm_bit_interleave_si128(__m128i x) + { + __m128i const Mask4 = _mm_set1_epi32(0x0000FFFF); + __m128i const Mask3 = _mm_set1_epi32(0x00FF00FF); + __m128i const Mask2 = _mm_set1_epi32(0x0F0F0F0F); + __m128i const Mask1 = _mm_set1_epi32(0x33333333); + __m128i const Mask0 = _mm_set1_epi32(0x55555555); + + __m128i Reg1; + __m128i Reg2; + + // REG1 = x; + // REG2 = y; + //Reg1 = _mm_unpacklo_epi64(x, y); + Reg1 = x; + + //REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF); + //REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF); + Reg2 = _mm_slli_si128(Reg1, 2); + Reg1 = _mm_or_si128(Reg2, Reg1); + Reg1 = _mm_and_si128(Reg1, Mask4); + + //REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF); + //REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF); + Reg2 = _mm_slli_si128(Reg1, 1); + Reg1 = _mm_or_si128(Reg2, Reg1); + Reg1 = _mm_and_si128(Reg1, Mask3); + + //REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F); + //REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F); + Reg2 = _mm_slli_epi32(Reg1, 4); + Reg1 = _mm_or_si128(Reg2, Reg1); + Reg1 = _mm_and_si128(Reg1, Mask2); + + //REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333); + //REG2 = ((REG2 
<< 2) | REG2) & glm::uint64(0x3333333333333333); + Reg2 = _mm_slli_epi32(Reg1, 2); + Reg1 = _mm_or_si128(Reg2, Reg1); + Reg1 = _mm_and_si128(Reg1, Mask1); + + //REG1 = ((REG1 << 1) | REG1) & glm::uint64(0x5555555555555555); + //REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555); + Reg2 = _mm_slli_epi32(Reg1, 1); + Reg1 = _mm_or_si128(Reg2, Reg1); + Reg1 = _mm_and_si128(Reg1, Mask0); + + //return REG1 | (REG2 << 1); + Reg2 = _mm_slli_epi32(Reg1, 1); + Reg2 = _mm_srli_si128(Reg2, 8); + Reg1 = _mm_or_si128(Reg1, Reg2); + + return Reg1; + } + + inline __m128i _mm_bit_interleave_si128(__m128i x, __m128i y) + { + __m128i const Mask4 = _mm_set1_epi32(0x0000FFFF); + __m128i const Mask3 = _mm_set1_epi32(0x00FF00FF); + __m128i const Mask2 = _mm_set1_epi32(0x0F0F0F0F); + __m128i const Mask1 = _mm_set1_epi32(0x33333333); + __m128i const Mask0 = _mm_set1_epi32(0x55555555); + + __m128i Reg1; + __m128i Reg2; + + // REG1 = x; + // REG2 = y; + Reg1 = _mm_unpacklo_epi64(x, y); + + //REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF); + //REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF); + Reg2 = _mm_slli_si128(Reg1, 2); + Reg1 = _mm_or_si128(Reg2, Reg1); + Reg1 = _mm_and_si128(Reg1, Mask4); + + //REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF); + //REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF); + Reg2 = _mm_slli_si128(Reg1, 1); + Reg1 = _mm_or_si128(Reg2, Reg1); + Reg1 = _mm_and_si128(Reg1, Mask3); + + //REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F); + //REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F); + Reg2 = _mm_slli_epi32(Reg1, 4); + Reg1 = _mm_or_si128(Reg2, Reg1); + Reg1 = _mm_and_si128(Reg1, Mask2); + + //REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333); + //REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333); + Reg2 = _mm_slli_epi32(Reg1, 2); + Reg1 = _mm_or_si128(Reg2, Reg1); + Reg1 = _mm_and_si128(Reg1, Mask1); + + //REG1 = ((REG1 << 1) | REG1) & 
glm::uint64(0x5555555555555555); + //REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555); + Reg2 = _mm_slli_epi32(Reg1, 1); + Reg1 = _mm_or_si128(Reg2, Reg1); + Reg1 = _mm_and_si128(Reg1, Mask0); + + //return REG1 | (REG2 << 1); + Reg2 = _mm_slli_epi32(Reg1, 1); + Reg2 = _mm_srli_si128(Reg2, 8); + Reg1 = _mm_or_si128(Reg1, Reg2); + + return Reg1; + } + +}//namespace detail +}//namespace glm diff --git a/glm/gtx/bit.inl b/glm/gtx/bit.inl index 7e4890ca..825a20a8 100644 --- a/glm/gtx/bit.inl +++ b/glm/gtx/bit.inl @@ -608,11 +608,90 @@ namespace glm Result |= (x & 1U << i) << i | (y & 1U << i) << (i + 1); return Result; } + + template <> + inline glm::uint16 bitfieldInterleave(glm::uint8 x, glm::uint8 y) + { + glm::uint16 REG1(x); + glm::uint16 REG2(y); + + REG1 = ((REG1 << 4) | REG1) & glm::uint16(0x0F0F); + REG2 = ((REG2 << 4) | REG2) & glm::uint16(0x0F0F); + + REG1 = ((REG1 << 2) | REG1) & glm::uint16(0x3333); + REG2 = ((REG2 << 2) | REG2) & glm::uint16(0x3333); + + REG1 = ((REG1 << 1) | REG1) & glm::uint16(0x5555); + REG2 = ((REG2 << 1) | REG2) & glm::uint16(0x5555); + + return REG1 | (REG2 << 1); + } + + template <> + inline glm::uint32 bitfieldInterleave(glm::uint16 x, glm::uint16 y) + { + glm::uint32 REG1(x); + glm::uint32 REG2(y); + + REG1 = ((REG1 << 8) | REG1) & glm::uint32(0x00FF00FF); + REG2 = ((REG2 << 8) | REG2) & glm::uint32(0x00FF00FF); + + REG1 = ((REG1 << 4) | REG1) & glm::uint32(0x0F0F0F0F); + REG2 = ((REG2 << 4) | REG2) & glm::uint32(0x0F0F0F0F); + + REG1 = ((REG1 << 2) | REG1) & glm::uint32(0x33333333); + REG2 = ((REG2 << 2) | REG2) & glm::uint32(0x33333333); + + REG1 = ((REG1 << 1) | REG1) & glm::uint32(0x55555555); + REG2 = ((REG2 << 1) | REG2) & glm::uint32(0x55555555); + + return REG1 | (REG2 << 1); + } + + template <> + inline glm::uint64 bitfieldInterleave(glm::uint32 x, glm::uint32 y) + { + glm::uint64 REG1(x); + glm::uint64 REG2(y); + + REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF); + REG2 = ((REG2 << 16) |
REG2) & glm::uint64(0x0000FFFF0000FFFF); + + REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF); + REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF); + + REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F); + REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F); + + REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333); + REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333); + + REG1 = ((REG1 << 1) | REG1) & glm::uint64(0x5555555555555555); + REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555); + + return REG1 | (REG2 << 1); + } }//namespace detail inline int16 bitfieldInterleave(int8 x, int8 y) { - return detail::bitfieldInterleave(x, y); + union sign8 + { + int8 i; + uint8 u; + } sign_x, sign_y; + + union sign16 + { + int16 i; + uint16 u; + } result; + + sign_x.i = x; + sign_y.i = y; + result.u = detail::bitfieldInterleave(sign_x.u, sign_y.u); + + return result.i; } inline uint16 bitfieldInterleave(uint8 x, uint8 y) @@ -622,7 +701,23 @@ namespace glm inline int32 bitfieldInterleave(int16 x, int16 y) { - return detail::bitfieldInterleave(x, y); + union sign16 + { + int16 i; + uint16 u; + } sign_x, sign_y; + + union sign32 + { + int32 i; + uint32 u; + } result; + + sign_x.i = x; + sign_y.i = y; + result.u = detail::bitfieldInterleave(sign_x.u, sign_y.u); + + return result.i; } inline uint32 bitfieldInterleave(uint16 x, uint16 y) @@ -632,7 +727,23 @@ namespace glm inline int64 bitfieldInterleave(int32 x, int32 y) { - return detail::bitfieldInterleave(x, y); + union sign32 + { + int32 i; + uint32 u; + } sign_x, sign_y; + + union sign64 + { + int64 i; + uint64 u; + } result; + + sign_x.i = x; + sign_y.i = y; + result.u = detail::bitfieldInterleave(sign_x.u, sign_y.u); + + return result.i; } inline uint64 bitfieldInterleave(uint32 x, uint32 y) diff --git a/readme.txt b/readme.txt index e0f90134..bee9e00d 100644 --- a/readme.txt +++ b/readme.txt @@ -40,6 +40,7 @@ http://glm.g-truc.net/glm.pdf GLM 
0.9.5.X: 2013-XX-XX -------------------------------------------------------------------------------- - Improved Intel Compiler detection +- Added bitfieldInterleave and _mm_bit_interleave_si128 functions ================================================================================ GLM 0.9.4.2: 2013-01-XX diff --git a/test/gtx/gtx_bit.cpp b/test/gtx/gtx_bit.cpp index 35e7f6da..205ec704 100644 --- a/test/gtx/gtx_bit.cpp +++ b/test/gtx/gtx_bit.cpp @@ -10,6 +10,11 @@ #include #include #include + +#if(GLM_ARCH != GLM_ARCH_PURE) +# include +#endif + #include #include #include @@ -166,333 +171,194 @@ namespace bitRevert } }//bitRevert -inline glm::uint64 fastBitfieldInterleave(glm::uint32 x, glm::uint32 y) -{ - glm::uint64 REG1; - glm::uint64 REG2; - - REG1 = x; - REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF); - REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF); - REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F); - REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333); - REG1 = ((REG1 << 1) | REG1) & glm::uint64(0x5555555555555555); - - REG2 = y; - REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF); - REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF); - REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F); - REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333); - REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555); - - return REG1 | (REG2 << 1); -} - -inline glm::uint64 interleaveBitfieldInterleave(glm::uint32 x, glm::uint32 y) -{ - glm::uint64 REG1; - glm::uint64 REG2; - - REG1 = x; - REG2 = y; - - REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF); - REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF); - - REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF); - REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF); - - REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F); - REG2 = ((REG2 << 4) | REG2) & 
glm::uint64(0x0F0F0F0F0F0F0F0F); - - REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333); - REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333); - - REG1 = ((REG1 << 1) | REG1) & glm::uint64(0x5555555555555555); - REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555); - - return REG1 | (REG2 << 1); -} - -inline glm::uint64 loopBitfieldInterleave(glm::uint32 x, glm::uint32 y) -{ - static glm::uint64 const Mask[5] = - { - 0x5555555555555555, - 0x3333333333333333, - 0x0F0F0F0F0F0F0F0F, - 0x00FF00FF00FF00FF, - 0x0000FFFF0000FFFF - }; - - glm::uint64 REG1 = x; - glm::uint64 REG2 = y; - for(int i = 4; i >= 0; --i) - { - REG1 = ((REG1 << (1 << i)) | REG1) & Mask[i]; - REG2 = ((REG2 << (1 << i)) | REG2) & Mask[i]; - } - - return REG1 | (REG2 << 1); -} - -/* -const int N = 1024; - -int32_t b1[N]; // 2 x arrays of input bit sets -int32_t b2[N]; -int32_t b3[N]; // 1 x array of output bit sets - -for (int i = 0; i < N; i += 4) -{ - __m128i v1 = _mm_loadu_si128(&b1[i]); // load input bits sets - __m128i v2 = _mm_loadu_si128(&b2[i]); - __m128i v3 = _mm_and_si128(v1, v2); // do the bitwise AND - _mm_storeu_si128(&b3[i], v3); // store the result -} -If you just want to AND an array in-place with a fixed mask then it would simplify to this: - -const int N = 1024; - -int32_t b1[N]; // input/output array of bit sets - -const __m128i v2 = _mm_set1_epi32(0x12345678); // mask - -for (int i = 0; i < N; i += 4) -{ - __m128i v1 = _mm_loadu_si128(&b1[i]); // load input bits sets - __m128i v3 = _mm_and_si128(v1, v2); // do the bitwise AND - _mm_storeu_si128(&b1[i], v3); // store the result -} -Note: for better performance make sure your input/output arrays are 16 byte aligned and then use _mm_load_si128/_mm_store_si128 rather than their unaligned counterparts as above. 
-*/ - -inline glm::uint64 sseBitfieldInterleave(glm::uint32 x, glm::uint32 y) -{ - GLM_ALIGN(16) glm::uint32 const Array[4] = {x, 0, y, 0}; - - __m128i const Mask4 = _mm_set1_epi32(0x0000FFFF); - __m128i const Mask3 = _mm_set1_epi32(0x00FF00FF); - __m128i const Mask2 = _mm_set1_epi32(0x0F0F0F0F); - __m128i const Mask1 = _mm_set1_epi32(0x33333333); - __m128i const Mask0 = _mm_set1_epi32(0x55555555); - - __m128i Reg1; - __m128i Reg2; - - // REG1 = x; - // REG2 = y; - Reg1 = _mm_load_si128((__m128i*)Array); - - //REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF); - //REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF); - Reg2 = _mm_slli_si128(Reg1, 2); - Reg1 = _mm_or_si128(Reg2, Reg1); - Reg1 = _mm_and_si128(Reg1, Mask4); - - //REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF); - //REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF); - Reg2 = _mm_slli_si128(Reg1, 1); - Reg1 = _mm_or_si128(Reg2, Reg1); - Reg1 = _mm_and_si128(Reg1, Mask3); - - //REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F); - //REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F); - Reg2 = _mm_slli_epi32(Reg1, 4); - Reg1 = _mm_or_si128(Reg2, Reg1); - Reg1 = _mm_and_si128(Reg1, Mask2); - - //REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333); - //REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333); - Reg2 = _mm_slli_epi32(Reg1, 2); - Reg1 = _mm_or_si128(Reg2, Reg1); - Reg1 = _mm_and_si128(Reg1, Mask1); - - //REG1 = ((REG1 << 1) | REG1) & glm::uint64(0x5555555555555555); - //REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555); - Reg2 = _mm_slli_epi32(Reg1, 1); - Reg1 = _mm_or_si128(Reg2, Reg1); - Reg1 = _mm_and_si128(Reg1, Mask0); - - //return REG1 | (REG2 << 1); - Reg2 = _mm_slli_epi32(Reg1, 1); - Reg2 = _mm_srli_si128(Reg2, 8); - Reg1 = _mm_or_si128(Reg1, Reg2); - - GLM_ALIGN(16) glm::uint64 Result[2]; - _mm_store_si128((__m128i*)Result, Reg1); - - return Result[0]; -} - -inline glm::uint64 
sseUnalignedBitfieldInterleave(glm::uint32 x, glm::uint32 y) -{ - glm::uint32 const Array[4] = {x, 0, y, 0}; - - __m128i const Mask4 = _mm_set1_epi32(0x0000FFFF); - __m128i const Mask3 = _mm_set1_epi32(0x00FF00FF); - __m128i const Mask2 = _mm_set1_epi32(0x0F0F0F0F); - __m128i const Mask1 = _mm_set1_epi32(0x33333333); - __m128i const Mask0 = _mm_set1_epi32(0x55555555); - - __m128i Reg1; - __m128i Reg2; - - // REG1 = x; - // REG2 = y; - Reg1 = _mm_loadu_si128((__m128i*)Array); - - //REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF); - //REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF); - Reg2 = _mm_slli_si128(Reg1, 2); - Reg1 = _mm_or_si128(Reg2, Reg1); - Reg1 = _mm_and_si128(Reg1, Mask4); - - //REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF); - //REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF); - Reg2 = _mm_slli_si128(Reg1, 1); - Reg1 = _mm_or_si128(Reg2, Reg1); - Reg1 = _mm_and_si128(Reg1, Mask3); - - //REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F); - //REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F); - Reg2 = _mm_slli_epi32(Reg1, 4); - Reg1 = _mm_or_si128(Reg2, Reg1); - Reg1 = _mm_and_si128(Reg1, Mask2); - - //REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333); - //REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333); - Reg2 = _mm_slli_epi32(Reg1, 2); - Reg1 = _mm_or_si128(Reg2, Reg1); - Reg1 = _mm_and_si128(Reg1, Mask1); - - //REG1 = ((REG1 << 1) | REG1) & glm::uint64(0x5555555555555555); - //REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555); - Reg2 = _mm_slli_epi32(Reg1, 1); - Reg1 = _mm_or_si128(Reg2, Reg1); - Reg1 = _mm_and_si128(Reg1, Mask0); - - //return REG1 | (REG2 << 1); - Reg2 = _mm_slli_epi32(Reg1, 1); - Reg2 = _mm_srli_si128(Reg2, 8); - Reg1 = _mm_or_si128(Reg1, Reg2); - - glm::uint64 Result[2]; - _mm_storeu_si128((__m128i*)Result, Reg1); - - return Result[0]; -} - -inline __m128i _mm_bit_interleave_si128(__m128i x, __m128i y) -{ - 
__m128i const Mask4 = _mm_set1_epi32(0x0000FFFF); - __m128i const Mask3 = _mm_set1_epi32(0x00FF00FF); - __m128i const Mask2 = _mm_set1_epi32(0x0F0F0F0F); - __m128i const Mask1 = _mm_set1_epi32(0x33333333); - __m128i const Mask0 = _mm_set1_epi32(0x55555555); - - __m128i Reg1; - __m128i Reg2; - - // REG1 = x; - // REG2 = y; - Reg1 = _mm_unpacklo_epi64(x, y); - - //REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF); - //REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF); - Reg2 = _mm_slli_si128(Reg1, 2); - Reg1 = _mm_or_si128(Reg2, Reg1); - Reg1 = _mm_and_si128(Reg1, Mask4); - - //REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF); - //REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF); - Reg2 = _mm_slli_si128(Reg1, 1); - Reg1 = _mm_or_si128(Reg2, Reg1); - Reg1 = _mm_and_si128(Reg1, Mask3); - - //REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F); - //REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F); - Reg2 = _mm_slli_epi32(Reg1, 4); - Reg1 = _mm_or_si128(Reg2, Reg1); - Reg1 = _mm_and_si128(Reg1, Mask2); - - //REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333); - //REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333); - Reg2 = _mm_slli_epi32(Reg1, 2); - Reg1 = _mm_or_si128(Reg2, Reg1); - Reg1 = _mm_and_si128(Reg1, Mask1); - - //REG1 = ((REG1 << 1) | REG1) & glm::uint64(0x5555555555555555); - //REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555); - Reg2 = _mm_slli_epi32(Reg1, 1); - Reg1 = _mm_or_si128(Reg2, Reg1); - Reg1 = _mm_and_si128(Reg1, Mask0); - - //return REG1 | (REG2 << 1); - Reg2 = _mm_slli_epi32(Reg1, 1); - Reg2 = _mm_srli_si128(Reg2, 8); - Reg1 = _mm_or_si128(Reg1, Reg2); - - return Reg1; -} - - -inline __m128i _mm_bit_interleave_si128(__m128i x) -{ - __m128i const Mask4 = _mm_set1_epi32(0x0000FFFF); - __m128i const Mask3 = _mm_set1_epi32(0x00FF00FF); - __m128i const Mask2 = _mm_set1_epi32(0x0F0F0F0F); - __m128i const Mask1 = _mm_set1_epi32(0x33333333); - 
__m128i const Mask0 = _mm_set1_epi32(0x55555555); - - __m128i Reg1; - __m128i Reg2; - - // REG1 = x; - // REG2 = y; - //Reg1 = _mm_unpacklo_epi64(x, y); - Reg1 = x; - - //REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF); - //REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF); - Reg2 = _mm_slli_si128(Reg1, 2); - Reg1 = _mm_or_si128(Reg2, Reg1); - Reg1 = _mm_and_si128(Reg1, Mask4); - - //REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF); - //REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF); - Reg2 = _mm_slli_si128(Reg1, 1); - Reg1 = _mm_or_si128(Reg2, Reg1); - Reg1 = _mm_and_si128(Reg1, Mask3); - - //REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F); - //REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F); - Reg2 = _mm_slli_epi32(Reg1, 4); - Reg1 = _mm_or_si128(Reg2, Reg1); - Reg1 = _mm_and_si128(Reg1, Mask2); - - //REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333); - //REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333); - Reg2 = _mm_slli_epi32(Reg1, 2); - Reg1 = _mm_or_si128(Reg2, Reg1); - Reg1 = _mm_and_si128(Reg1, Mask1); - - //REG1 = ((REG1 << 1) | REG1) & glm::uint64(0x5555555555555555); - //REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555); - Reg2 = _mm_slli_epi32(Reg1, 1); - Reg1 = _mm_or_si128(Reg2, Reg1); - Reg1 = _mm_and_si128(Reg1, Mask0); - - //return REG1 | (REG2 << 1); - Reg2 = _mm_slli_epi32(Reg1, 1); - Reg2 = _mm_srli_si128(Reg2, 8); - Reg1 = _mm_or_si128(Reg1, Reg2); - - return Reg1; -} - namespace bitfieldInterleave { + inline glm::uint64 fastBitfieldInterleave(glm::uint32 x, glm::uint32 y) + { + glm::uint64 REG1; + glm::uint64 REG2; + + REG1 = x; + REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF); + REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF); + REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F); + REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333); + REG1 = ((REG1 << 1) | REG1) & 
glm::uint64(0x5555555555555555); + + REG2 = y; + REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF); + REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF); + REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F); + REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333); + REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555); + + return REG1 | (REG2 << 1); + } + + inline glm::uint64 interleaveBitfieldInterleave(glm::uint32 x, glm::uint32 y) + { + glm::uint64 REG1; + glm::uint64 REG2; + + REG1 = x; + REG2 = y; + + REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF); + REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF); + + REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF); + REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF); + + REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F); + REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F); + + REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333); + REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333); + + REG1 = ((REG1 << 1) | REG1) & glm::uint64(0x5555555555555555); + REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555); + + return REG1 | (REG2 << 1); + } + + inline glm::uint64 loopBitfieldInterleave(glm::uint32 x, glm::uint32 y) + { + static glm::uint64 const Mask[5] = + { + 0x5555555555555555, + 0x3333333333333333, + 0x0F0F0F0F0F0F0F0F, + 0x00FF00FF00FF00FF, + 0x0000FFFF0000FFFF + }; + + glm::uint64 REG1 = x; + glm::uint64 REG2 = y; + for(int i = 4; i >= 0; --i) + { + REG1 = ((REG1 << (1 << i)) | REG1) & Mask[i]; + REG2 = ((REG2 << (1 << i)) | REG2) & Mask[i]; + } + + return REG1 | (REG2 << 1); + } + + inline glm::uint64 sseBitfieldInterleave(glm::uint32 x, glm::uint32 y) + { + GLM_ALIGN(16) glm::uint32 const Array[4] = {x, 0, y, 0}; + + __m128i const Mask4 = _mm_set1_epi32(0x0000FFFF); + __m128i const Mask3 = _mm_set1_epi32(0x00FF00FF); + __m128i const Mask2 = 
_mm_set1_epi32(0x0F0F0F0F); + __m128i const Mask1 = _mm_set1_epi32(0x33333333); + __m128i const Mask0 = _mm_set1_epi32(0x55555555); + + __m128i Reg1; + __m128i Reg2; + + // REG1 = x; + // REG2 = y; + Reg1 = _mm_load_si128((__m128i*)Array); + + //REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF); + //REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF); + Reg2 = _mm_slli_si128(Reg1, 2); + Reg1 = _mm_or_si128(Reg2, Reg1); + Reg1 = _mm_and_si128(Reg1, Mask4); + + //REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF); + //REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF); + Reg2 = _mm_slli_si128(Reg1, 1); + Reg1 = _mm_or_si128(Reg2, Reg1); + Reg1 = _mm_and_si128(Reg1, Mask3); + + //REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F); + //REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F); + Reg2 = _mm_slli_epi32(Reg1, 4); + Reg1 = _mm_or_si128(Reg2, Reg1); + Reg1 = _mm_and_si128(Reg1, Mask2); + + //REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333); + //REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333); + Reg2 = _mm_slli_epi32(Reg1, 2); + Reg1 = _mm_or_si128(Reg2, Reg1); + Reg1 = _mm_and_si128(Reg1, Mask1); + + //REG1 = ((REG1 << 1) | REG1) & glm::uint64(0x5555555555555555); + //REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555); + Reg2 = _mm_slli_epi32(Reg1, 1); + Reg1 = _mm_or_si128(Reg2, Reg1); + Reg1 = _mm_and_si128(Reg1, Mask0); + + //return REG1 | (REG2 << 1); + Reg2 = _mm_slli_epi32(Reg1, 1); + Reg2 = _mm_srli_si128(Reg2, 8); + Reg1 = _mm_or_si128(Reg1, Reg2); + + GLM_ALIGN(16) glm::uint64 Result[2]; + _mm_store_si128((__m128i*)Result, Reg1); + + return Result[0]; + } + + inline glm::uint64 sseUnalignedBitfieldInterleave(glm::uint32 x, glm::uint32 y) + { + glm::uint32 const Array[4] = {x, 0, y, 0}; + + __m128i const Mask4 = _mm_set1_epi32(0x0000FFFF); + __m128i const Mask3 = _mm_set1_epi32(0x00FF00FF); + __m128i const Mask2 = _mm_set1_epi32(0x0F0F0F0F); + __m128i 
const Mask1 = _mm_set1_epi32(0x33333333); + __m128i const Mask0 = _mm_set1_epi32(0x55555555); + + __m128i Reg1; + __m128i Reg2; + + // REG1 = x; + // REG2 = y; + Reg1 = _mm_loadu_si128((__m128i*)Array); + + //REG1 = ((REG1 << 16) | REG1) & glm::uint64(0x0000FFFF0000FFFF); + //REG2 = ((REG2 << 16) | REG2) & glm::uint64(0x0000FFFF0000FFFF); + Reg2 = _mm_slli_si128(Reg1, 2); + Reg1 = _mm_or_si128(Reg2, Reg1); + Reg1 = _mm_and_si128(Reg1, Mask4); + + //REG1 = ((REG1 << 8) | REG1) & glm::uint64(0x00FF00FF00FF00FF); + //REG2 = ((REG2 << 8) | REG2) & glm::uint64(0x00FF00FF00FF00FF); + Reg2 = _mm_slli_si128(Reg1, 1); + Reg1 = _mm_or_si128(Reg2, Reg1); + Reg1 = _mm_and_si128(Reg1, Mask3); + + //REG1 = ((REG1 << 4) | REG1) & glm::uint64(0x0F0F0F0F0F0F0F0F); + //REG2 = ((REG2 << 4) | REG2) & glm::uint64(0x0F0F0F0F0F0F0F0F); + Reg2 = _mm_slli_epi32(Reg1, 4); + Reg1 = _mm_or_si128(Reg2, Reg1); + Reg1 = _mm_and_si128(Reg1, Mask2); + + //REG1 = ((REG1 << 2) | REG1) & glm::uint64(0x3333333333333333); + //REG2 = ((REG2 << 2) | REG2) & glm::uint64(0x3333333333333333); + Reg2 = _mm_slli_epi32(Reg1, 2); + Reg1 = _mm_or_si128(Reg2, Reg1); + Reg1 = _mm_and_si128(Reg1, Mask1); + + //REG1 = ((REG1 << 1) | REG1) & glm::uint64(0x5555555555555555); + //REG2 = ((REG2 << 1) | REG2) & glm::uint64(0x5555555555555555); + Reg2 = _mm_slli_epi32(Reg1, 1); + Reg1 = _mm_or_si128(Reg2, Reg1); + Reg1 = _mm_and_si128(Reg1, Mask0); + + //return REG1 | (REG2 << 1); + Reg2 = _mm_slli_epi32(Reg1, 1); + Reg2 = _mm_srli_si128(Reg2, 8); + Reg1 = _mm_or_si128(Reg1, Reg2); + + glm::uint64 Result[2]; + _mm_storeu_si128((__m128i*)Result, Reg1); + + return Result[0]; + } + int test() { glm::uint32 x_max = 1 << 13; @@ -514,11 +380,36 @@ namespace bitfieldInterleave glm::uint64 D = interleaveBitfieldInterleave(x, y); glm::uint64 E = sseBitfieldInterleave(x, y); glm::uint64 F = sseUnalignedBitfieldInterleave(x, y); + assert(A == B); assert(A == C); assert(A == D); assert(A == E); assert(A == F); + +# if(GLM_ARCH != 
GLM_ARCH_PURE) + __m128i G = glm::detail::_mm_bit_interleave_si128(_mm_set_epi32(0, y, 0, x)); // x -> low 64-bit lane, y -> high 64-bit lane + glm::uint64 Result[2]; + _mm_storeu_si128((__m128i*)Result, G); + assert(A == Result[0]); +# endif//(GLM_ARCH != GLM_ARCH_PURE) + } + } + + { + for(glm::uint8 y = 0; y < 127; ++y) + for(glm::uint8 x = 0; x < 127; ++x) + { + glm::uint64 A(glm::bitfieldInterleave(glm::uint8(x), glm::uint8(y))); + glm::uint64 B(glm::bitfieldInterleave(glm::uint16(x), glm::uint16(y))); + glm::uint64 C(glm::bitfieldInterleave(glm::uint32(x), glm::uint32(y))); + + glm::int64 D(glm::bitfieldInterleave(glm::int8(x), glm::int8(y))); + glm::int64 E(glm::bitfieldInterleave(glm::int16(x), glm::int16(y))); + glm::int64 F(glm::bitfieldInterleave(glm::int32(x), glm::int32(y))); + + assert(D == E); + assert(D == F); } } @@ -588,6 +479,7 @@ namespace bitfieldInterleave std::cout << "sseUnalignedBitfieldInterleave Time " << Time << " clocks" << std::endl; } +# if(GLM_ARCH != GLM_ARCH_PURE) { // SIMD glm::int32 simd_x_max = 1 << 13; @@ -601,14 +493,13 @@ namespace bitfieldInterleave std::clock_t LastTime = std::clock(); for(std::size_t i = 0; i < Data.size(); ++i) - SimdData[i] = _mm_bit_interleave_si128(SimdParam[i]); + SimdData[i] = glm::detail::_mm_bit_interleave_si128(SimdParam[i]); std::clock_t Time = std::clock() - LastTime; std::cout << "_mm_bit_interleave_si128 Time " << Time << " clocks" << std::endl; } - - +# endif//(GLM_ARCH != GLM_ARCH_PURE) return 0; } @@ -616,18 +507,10 @@ namespace bitfieldInterleave int main() { - //__m64 REG3 = _mm_set1_pi32(static_cast(0x80000000)); - //__m64 REG1 = _mm_set1_pi32(0xFFFFFFFF); - //__m64 REG2 = _mm_set1_pi32(0x55555555); - //__m128i REG = _mm_set_epi64(REG1, REG2); - - int Error = 0; Error += ::bitfieldInterleave::test(); Error += ::extractField::test(); Error += ::bitRevert::test(); - while(true); - return Error; }