diff --git a/Extensions/DirectXMathAVX.h b/Extensions/DirectXMathAVX.h
index ee891d8..d4ae467 100644
--- a/Extensions/DirectXMathAVX.h
+++ b/Extensions/DirectXMathAVX.h
@@ -1,289 +1,289 @@
-//-------------------------------------------------------------------------------------
-// DirectXMathAVX.h -- AVX (version 1) extensions for SIMD C++ Math library
-//
-// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
-// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
-// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
-// PARTICULAR PURPOSE.
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#ifdef _MSC_VER
-#pragma once
-#endif
-
-#ifdef _M_ARM
-#error AVX not supported on ARM platform
-#endif
-
-#if defined(_MSC_VER) && (_MSC_VER < 1600)
-#error AVX intrinsics requires Visual C++ 2010 Service Pack 1 or later.
-#endif
-
-#pragma warning(push)
-#pragma warning(disable : 4987)
-#include <intrin.h>
-#pragma warning(pop)
-
-#include <immintrin.h>
-
-#include <DirectXMath.h>
-
-namespace DirectX
-{
-#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
-#define XM_CALLCONV __fastcall
-typedef const DirectX::XMVECTOR& HXMVECTOR;
-typedef const DirectX::XMMATRIX& FXMMATRIX;
-#endif
-
-namespace AVX
-{
-
-inline bool XMVerifyAVXSupport()
-{
-    // Should return true for AMD Bulldozer, Intel "Sandy Bridge", and Intel "Ivy Bridge" or later processors
-    // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012)
-
-    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
-    int CPUInfo[4] = {-1};
-    __cpuid( CPUInfo, 0 );
-
-    if ( CPUInfo[0] < 1 )
-        return false;
-
-    __cpuid(CPUInfo, 1 );
-
-    // We check for AVX, OSXSAVE, SSE4.1, and SSE3
-    return ( (CPUInfo[2] & 0x18080001) == 0x18080001 );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr( _In_ const float *pValue )
-{
-    return _mm_broadcast_ss( pValue );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSplatX( FXMVECTOR V )
-{
-    return _mm_permute_ps( V, _MM_SHUFFLE(0, 0, 0, 0) );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSplatY( FXMVECTOR V )
-{
-    return _mm_permute_ps( V, _MM_SHUFFLE(1, 1, 1, 1) );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSplatZ( FXMVECTOR V )
-{
-    return _mm_permute_ps( V, _MM_SHUFFLE(2, 2, 2, 2) );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSplatW( FXMVECTOR V )
-{
-    return _mm_permute_ps( V, _MM_SHUFFLE(3, 3, 3, 3) );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle( FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3 )
-{
-    assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
-    _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
-
-    unsigned int elem[4] = { E0, E1, E2, E3 };
-    __m128i vControl = _mm_loadu_si128( reinterpret_cast<const __m128i*>(&elem[0]) );
-    return _mm_permutevar_ps( V, vControl );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorPermute( FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW )
-{
-    assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
-    _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
-
-    static const XMVECTORU32 three = { 3, 3, 3, 3 };
-
-    __declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
-    __m128i vControl = _mm_load_si128( reinterpret_cast<const __m128i*>(&elem[0]) );
-
-    __m128i vSelect = _mm_cmpgt_epi32( vControl, three );
-    vControl = _mm_castps_si128( _mm_and_ps( _mm_castsi128_ps( vControl ), three ) );
-
-    __m128 shuffled1 = _mm_permutevar_ps( V1, vControl );
-    __m128 shuffled2 = _mm_permutevar_ps( V2, vControl );
-
-    __m128 masked1 = _mm_andnot_ps( _mm_castsi128_ps( vSelect ), shuffled1 );
-    __m128 masked2 = _mm_and_ps( _mm_castsi128_ps( vSelect ), shuffled2 );
-
-    return _mm_or_ps( masked1, masked2 );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements)
-{
-    assert( Elements < 4 );
-    _Analysis_assume_( Elements < 4 );
-    return AVX::XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3));
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements)
-{
-    assert( Elements < 4 );
-    _Analysis_assume_( Elements < 4 );
-    return AVX::XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements)
-{
-    assert( Elements < 4 );
-    _Analysis_assume_( Elements < 4 );
-    return AVX::XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Permute Templates
-//-------------------------------------------------------------------------------------
-
-namespace Internal
-{
-    // Slow path fallback for permutes that do not map to a single SSE opcode.
-    template<uint32_t Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW> struct PermuteHelper
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2)
-        {
-            static const XMVECTORU32 selectMask =
-            {
-                WhichX ? 0xFFFFFFFF : 0,
-                WhichY ? 0xFFFFFFFF : 0,
-                WhichZ ? 0xFFFFFFFF : 0,
-                WhichW ? 0xFFFFFFFF : 0,
-            };
-
-            XMVECTOR shuffled1 = _mm_permute_ps(v1, Shuffle);
-            XMVECTOR shuffled2 = _mm_permute_ps(v2, Shuffle);
-
-            XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1);
-            XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2);
-
-            return _mm_or_ps(masked1, masked2);
-        }
-    };
-
-    // Fast path for permutes that only read from the first vector.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, false, false>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return _mm_permute_ps(v1, Shuffle); }
-    };
-
-    // Fast path for permutes that only read from the second vector.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, true, true>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v1); return _mm_permute_ps(v2, Shuffle); }
-    };
-
-    // Fast path for permutes that read XY from the first vector, ZW from the second.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, true, true>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v1, v2, Shuffle); }
-    };
-
-    // Fast path for permutes that read XY from the second vector, ZW from the first.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, false, false>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v2, v1, Shuffle); }
-    };
-};
-
-// General permute template
-template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW>
-    inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2)
-{
-    static_assert(PermuteX <= 7, "PermuteX template parameter out of range");
-    static_assert(PermuteY <= 7, "PermuteY template parameter out of range");
-    static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range");
-    static_assert(PermuteW <= 7, "PermuteW template parameter out of range");
-
-    const uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3);
-
-    const bool WhichX = PermuteX > 3;
-    const bool WhichY = PermuteY > 3;
-    const bool WhichZ = PermuteZ > 3;
-    const bool WhichW = PermuteW > 3;
-
-    return AVX::Internal::PermuteHelper<Shuffle, WhichX, WhichY, WhichZ, WhichW>::Permute(V1, V2);
-}
-
-// Special-case permute templates
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); }
-
-
-//-------------------------------------------------------------------------------------
-// Swizzle Templates
-//-------------------------------------------------------------------------------------
-
-// General swizzle template
-template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW>
-    inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V)
-{
-    static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
-    static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
-    static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
-    static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
-
-    return _mm_permute_ps( V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) );
-}
-
-// Specialized swizzles
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); }
-
-
-//-------------------------------------------------------------------------------------
-// Other Templates
-//-------------------------------------------------------------------------------------
-
-template<uint32_t Elements>
-    inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2)
-{
-    static_assert( Elements < 4, "Elements template parameter out of range" );
-    return AVX::XMVectorPermute<Elements, (Elements + 1), (Elements + 2), (Elements + 3)>(V1, V2);
-}
-
-template<uint32_t Elements>
-    inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V)
-{
-    static_assert( Elements < 4, "Elements template parameter out of range" );
-    return AVX::XMVectorSwizzle<Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3>(V);
-}
-
-template<uint32_t Elements>
-    inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V)
-{
-    static_assert( Elements < 4, "Elements template parameter out of range" );
-    return AVX::XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V);
-}
-
-}; // namespace AVX
-
-}; // namespace DirectX;
+//-------------------------------------------------------------------------------------
+// DirectXMathAVX.h -- AVX (version 1) extensions for SIMD C++ Math library
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#ifdef _MSC_VER
+#pragma once
+#endif
+
+#ifdef _M_ARM
+#error AVX not supported on ARM platform
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER < 1600)
+#error AVX intrinsics requires Visual C++ 2010 Service Pack 1 or later.
+#endif
+
+#pragma warning(push)
+#pragma warning(disable : 4987)
+#include <intrin.h>
+#pragma warning(pop)
+
+#include <immintrin.h>
+
+#include <DirectXMath.h>
+
+namespace DirectX
+{
+#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
+#define XM_CALLCONV __fastcall
+typedef const DirectX::XMVECTOR& HXMVECTOR;
+typedef const DirectX::XMMATRIX& FXMMATRIX;
+#endif
+
+namespace AVX
+{
+
+inline bool XMVerifyAVXSupport()
+{
+    // Should return true for AMD Bulldozer, Intel "Sandy Bridge", and Intel "Ivy Bridge" or later processors
+    // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012)
+
+    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
+    int CPUInfo[4] = {-1};
+    __cpuid( CPUInfo, 0 );
+
+    if ( CPUInfo[0] < 1 )
+        return false;
+
+    __cpuid(CPUInfo, 1 );
+
+    // We check for AVX, OSXSAVE, SSE4.1, and SSE3
+    return ( (CPUInfo[2] & 0x18080001) == 0x18080001 );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr( _In_ const float *pValue )
+{
+    return _mm_broadcast_ss( pValue );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSplatX( FXMVECTOR V )
+{
+    return _mm_permute_ps( V, _MM_SHUFFLE(0, 0, 0, 0) );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSplatY( FXMVECTOR V )
+{
+    return _mm_permute_ps( V, _MM_SHUFFLE(1, 1, 1, 1) );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSplatZ( FXMVECTOR V )
+{
+    return _mm_permute_ps( V, _MM_SHUFFLE(2, 2, 2, 2) );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSplatW( FXMVECTOR V )
+{
+    return _mm_permute_ps( V, _MM_SHUFFLE(3, 3, 3, 3) );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSwizzle( FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3 )
+{
+    assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
+    _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
+
+    unsigned int elem[4] = { E0, E1, E2, E3 };
+    __m128i vControl = _mm_loadu_si128( reinterpret_cast<const __m128i*>(&elem[0]) );
+    return _mm_permutevar_ps( V, vControl );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorPermute( FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW )
+{
+    assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
+    _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
+
+    static const XMVECTORU32 three = { 3, 3, 3, 3 };
+
+    __declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
+    __m128i vControl = _mm_load_si128( reinterpret_cast<const __m128i*>(&elem[0]) );
+
+    __m128i vSelect = _mm_cmpgt_epi32( vControl, three );
+    vControl = _mm_castps_si128( _mm_and_ps( _mm_castsi128_ps( vControl ), three ) );
+
+    __m128 shuffled1 = _mm_permutevar_ps( V1, vControl );
+    __m128 shuffled2 = _mm_permutevar_ps( V2, vControl );
+
+    __m128 masked1 = _mm_andnot_ps( _mm_castsi128_ps( vSelect ), shuffled1 );
+    __m128 masked2 = _mm_and_ps( _mm_castsi128_ps( vSelect ), shuffled2 );
+
+    return _mm_or_ps( masked1, masked2 );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements)
+{
+    assert( Elements < 4 );
+    _Analysis_assume_( Elements < 4 );
+    return AVX::XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3));
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements)
+{
+    assert( Elements < 4 );
+    _Analysis_assume_( Elements < 4 );
+    return AVX::XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements)
+{
+    assert( Elements < 4 );
+    _Analysis_assume_( Elements < 4 );
+    return AVX::XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Permute Templates
+//-------------------------------------------------------------------------------------
+
+namespace Internal
+{
+    // Slow path fallback for permutes that do not map to a single SSE opcode.
+    template<uint32_t Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW> struct PermuteHelper
+    {
+        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2)
+        {
+            static const XMVECTORU32 selectMask =
+            {
+                WhichX ? 0xFFFFFFFF : 0,
+                WhichY ? 0xFFFFFFFF : 0,
+                WhichZ ? 0xFFFFFFFF : 0,
+                WhichW ? 0xFFFFFFFF : 0,
+            };
+
+            XMVECTOR shuffled1 = _mm_permute_ps(v1, Shuffle);
+            XMVECTOR shuffled2 = _mm_permute_ps(v2, Shuffle);
+
+            XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1);
+            XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2);
+
+            return _mm_or_ps(masked1, masked2);
+        }
+    };
+
+    // Fast path for permutes that only read from the first vector.
+    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, false, false>
+    {
+        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return _mm_permute_ps(v1, Shuffle); }
+    };
+
+    // Fast path for permutes that only read from the second vector.
+    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, true, true>
+    {
+        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v1); return _mm_permute_ps(v2, Shuffle); }
+    };
+
+    // Fast path for permutes that read XY from the first vector, ZW from the second.
+    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, true, true>
+    {
+        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v1, v2, Shuffle); }
+    };
+
+    // Fast path for permutes that read XY from the second vector, ZW from the first.
+    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, false, false>
+    {
+        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v2, v1, Shuffle); }
+    };
+};
+
+// General permute template
+template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW>
+    inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2)
+{
+    static_assert(PermuteX <= 7, "PermuteX template parameter out of range");
+    static_assert(PermuteY <= 7, "PermuteY template parameter out of range");
+    static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range");
+    static_assert(PermuteW <= 7, "PermuteW template parameter out of range");
+
+    const uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3);
+
+    const bool WhichX = PermuteX > 3;
+    const bool WhichY = PermuteY > 3;
+    const bool WhichZ = PermuteZ > 3;
+    const bool WhichW = PermuteW > 3;
+
+    return AVX::Internal::PermuteHelper<Shuffle, WhichX, WhichY, WhichZ, WhichW>::Permute(V1, V2);
+}
+
+// Special-case permute templates
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); }
+
+
+//-------------------------------------------------------------------------------------
+// Swizzle Templates
+//-------------------------------------------------------------------------------------
+
+// General swizzle template
+template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW>
+    inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V)
+{
+    static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
+    static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
+    static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
+    static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
+
+    return _mm_permute_ps( V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) );
+}
+
+// Specialized swizzles
+template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; }
+template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); }
+
+
+//-------------------------------------------------------------------------------------
+// Other Templates
+//-------------------------------------------------------------------------------------
+
+template<uint32_t Elements>
+    inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2)
+{
+    static_assert( Elements < 4, "Elements template parameter out of range" );
+    return AVX::XMVectorPermute<Elements, (Elements + 1), (Elements + 2), (Elements + 3)>(V1, V2);
+}
+
+template<uint32_t Elements>
+    inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V)
+{
+    static_assert( Elements < 4, "Elements template parameter out of range" );
+    return AVX::XMVectorSwizzle<Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3>(V);
+}
+
+template<uint32_t Elements>
+    inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V)
+{
+    static_assert( Elements < 4, "Elements template parameter out of range" );
+    return AVX::XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V);
+}
+
+}; // namespace AVX
+
+}; // namespace DirectX;
diff --git a/Extensions/DirectXMathAVX2.h b/Extensions/DirectXMathAVX2.h
index f968b8b..c3cdae2 100644
--- a/Extensions/DirectXMathAVX2.h
+++ b/Extensions/DirectXMathAVX2.h
@@ -1,972 +1,972 @@
-//-------------------------------------------------------------------------------------
-// DirectXMathAVX2.h -- AVX2 extensions for SIMD C++ Math library
-//
-// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
-// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
-// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
-// PARTICULAR PURPOSE.
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#ifdef _MSC_VER
-#pragma once
-#endif
-
-#ifdef _M_ARM
-#error AVX2 not supported on ARM platform
-#endif
-
-#if defined(_MSC_VER) && (_MSC_VER < 1700)
-#error AVX2 intrinsics requires Visual C++ 2012 or later.
-#endif
-
-#pragma warning(push)
-#pragma warning(disable : 4987)
-#include <intrin.h>
-#pragma warning(pop)
-
-#include <immintrin.h>
-
-#include <DirectXMath.h>
-#include <DirectXPackedVector.h>
-
-namespace DirectX
-{
-#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
-#define XM_CALLCONV __fastcall
-typedef const DirectX::XMVECTOR& HXMVECTOR;
-typedef const DirectX::XMMATRIX& FXMMATRIX;
-#endif
-
-namespace AVX2
-{
-
-inline bool XMVerifyAVX2Support()
-{
-    // Should return true for AMD "Excavator", Intel "Haswell" or later processors
-    // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012)
-
-    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
-    int CPUInfo[4] = {-1};
-    __cpuid( CPUInfo, 0 );
-
-    if ( CPUInfo[0] < 7 )
-        return false;
-
-    __cpuid(CPUInfo, 1 );
-
-    // We check for F16C, FMA3, AVX, OSXSAVE, SSE4.1, and SSE3
-    if ( (CPUInfo[2] & 0x38081001) != 0x38081001 )
-        return false;
-
-    __cpuidex(CPUInfo, 7, 0);
-
-    return ( (CPUInfo[1] & 0x20 ) == 0x20 );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr( _In_ const float *pValue )
-{
-    return _mm_broadcast_ss( pValue );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSplatX( FXMVECTOR V )
-{
-    return _mm_broadcastss_ps( V );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSplatY( FXMVECTOR V )
-{
-    return _mm_permute_ps( V, _MM_SHUFFLE(1, 1, 1, 1) );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSplatZ( FXMVECTOR V )
-{
-    return _mm_permute_ps( V, _MM_SHUFFLE(2, 2, 2, 2) );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSplatW( FXMVECTOR V )
-{
-    return _mm_permute_ps( V, _MM_SHUFFLE(3, 3, 3, 3) );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2,
-    FXMVECTOR V3
-)
-{
-    return _mm_fmadd_ps( V1, V2, V3 );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2,
-    FXMVECTOR V3
-)
-{
-    return _mm_fnmadd_ps( V1, V2, V3 );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle( FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3 )
-{
-    assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
-    _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
-
-    unsigned int elem[4] = { E0, E1, E2, E3 };
-    __m128i vControl = _mm_loadu_si128( reinterpret_cast<const __m128i*>(&elem[0]) );
-    return _mm_permutevar_ps( V, vControl );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorPermute( FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW )
-{
-    assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
-    _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
-
-    static const XMVECTORU32 three = { 3, 3, 3, 3 };
-
-    __declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
-    __m128i vControl = _mm_load_si128( reinterpret_cast<const __m128i*>(&elem[0]) );
-
-    __m128i vSelect = _mm_cmpgt_epi32( vControl, three );
-    vControl = _mm_castps_si128( _mm_and_ps( _mm_castsi128_ps( vControl ), three ) );
-
-    __m128 shuffled1 = _mm_permutevar_ps( V1, vControl );
-    __m128 shuffled2 = _mm_permutevar_ps( V2, vControl );
-
-    __m128 masked1 = _mm_andnot_ps( _mm_castsi128_ps( vSelect ), shuffled1 );
-    __m128 masked2 = _mm_and_ps( _mm_castsi128_ps( vSelect ), shuffled2 );
-
-    return _mm_or_ps( masked1, masked2 );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements)
-{
-    assert( Elements < 4 );
-    _Analysis_assume_( Elements < 4 );
-    return AVX2::XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3));
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements)
-{
-    assert( Elements < 4 );
-    _Analysis_assume_( Elements < 4 );
-    return AVX2::XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements)
-{
-    assert( Elements < 4 );
-    _Analysis_assume_( Elements < 4 );
-    return AVX2::XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector2
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2Transform
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] );
-    XMVECTOR vTemp = _mm_broadcastss_ps(V); // X
-    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVector2TransformCoord
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] );
-    XMVECTOR vTemp = _mm_broadcastss_ps(V); // X
-    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
-    XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3));
-    vResult = _mm_div_ps( vResult, W );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVector2TransformNormal
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_mul_ps( vResult, M.r[1] );
-    XMVECTOR vTemp = _mm_broadcastss_ps(V); // X
-    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
-    return vResult;
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector3
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3Transform
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
-    vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult );
-    vTemp = _mm_broadcastss_ps(V); // X
-    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVector3TransformCoord
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
-    vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult );
-    vTemp = _mm_broadcastss_ps(V); // X
-    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
-    XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3));
-    vResult = _mm_div_ps( vResult, W );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVector3TransformNormal
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
-    vResult = _mm_mul_ps( vResult, M.r[2] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult );
-    vTemp = _mm_broadcastss_ps(V); // X
-    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
-    return vResult;
-}
-
-XMMATRIX XM_CALLCONV XMMatrixMultiply(CXMMATRIX M1, CXMMATRIX M2);
-
-inline XMVECTOR XM_CALLCONV XMVector3Project
-(
-    FXMVECTOR V,
-    float ViewportX,
-    float ViewportY,
-    float ViewportWidth,
-    float ViewportHeight,
-    float ViewportMinZ,
-    float ViewportMaxZ,
-    CXMMATRIX Projection,
-    CXMMATRIX View,
-    CXMMATRIX World
-)
-{
-    const float HalfViewportWidth = ViewportWidth * 0.5f;
-    const float HalfViewportHeight = ViewportHeight * 0.5f;
-
-    XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f);
-    XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f);
-
-    XMMATRIX Transform = AVX2::XMMatrixMultiply(World, View);
-    Transform = AVX2::XMMatrixMultiply(Transform, Projection);
-
-    XMVECTOR Result = AVX2::XMVector3TransformCoord(V, Transform);
-
-    Result = AVX2::XMVectorMultiplyAdd(Result, Scale, Offset);
-
-    return Result;
-}
-
-inline XMVECTOR XM_CALLCONV XMVector3Unproject
-(
-    FXMVECTOR V,
-    float ViewportX,
-    float ViewportY,
-    float ViewportWidth,
-    float ViewportHeight,
-    float ViewportMinZ,
-    float ViewportMaxZ,
-    CXMMATRIX Projection,
-    CXMMATRIX View,
-    CXMMATRIX World
-)
-{
-    static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f };
-
-    XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f);
-    Scale = XMVectorReciprocal(Scale);
-
-    XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f);
-    Offset = AVX2::XMVectorMultiplyAdd(Scale, Offset, D.v);
-
-    XMMATRIX Transform = AVX2::XMMatrixMultiply(World, View);
-    Transform = AVX2::XMMatrixMultiply(Transform, Projection);
-    Transform = XMMatrixInverse(nullptr, Transform);
-
-    XMVECTOR Result = AVX2::XMVectorMultiplyAdd(V, Scale, Offset);
-
-    return AVX2::XMVector3TransformCoord(Result, Transform);
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector4
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector4Transform
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(3,3,3,3)); // W
-    vResult = _mm_mul_ps( vResult, M.r[3] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
-    vResult = _mm_fmadd_ps( vTemp, M.r[2], vResult );
-    vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult );
-    vTemp = _mm_broadcastss_ps(V); // X
-    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
-    return vResult;
-}
-
-
-//-------------------------------------------------------------------------------------
-// Matrix
-//-------------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixMultiply
-(
-    CXMMATRIX M1,
-    CXMMATRIX M2
-)
-{
-    XMMATRIX mResult;
-    // Use vW to hold the original row
-    XMVECTOR vW = M1.r[0];
-    // Splat the component X,Y,Z then W
-    XMVECTOR vX = _mm_broadcastss_ps(vW);
-    XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2));
-    vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3));
-    // Perform the operation on the first row
-    vX = _mm_mul_ps(vX,M2.r[0]);
-    vX = _mm_fmadd_ps(vY,M2.r[1],vX);
-    vX = _mm_fmadd_ps(vZ,M2.r[2],vX);
-    vX = _mm_fmadd_ps(vW,M2.r[3],vX);
-    mResult.r[0] = vX;
-    // Repeat for the other 3 rows
-    vW = M1.r[1];
-    vX = _mm_broadcastss_ps(vW);
-    vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1));
-    vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2));
-    vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3));
-    vX = _mm_mul_ps(vX,M2.r[0]);
-    vX = _mm_fmadd_ps(vY,M2.r[1],vX);
-    vX = _mm_fmadd_ps(vZ,M2.r[2],vX);
-    vX = _mm_fmadd_ps(vW,M2.r[3],vX);
-    mResult.r[1] = vX;
-    vW = M1.r[2];
-    vX = _mm_broadcastss_ps(vW);
-    vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1));
-    vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2));
-    vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3));
-    vX = _mm_mul_ps(vX,M2.r[0]);
-    vX = _mm_fmadd_ps(vY,M2.r[1],vX);
-    vX = _mm_fmadd_ps(vZ,M2.r[2],vX);
-    vX = _mm_fmadd_ps(vW,M2.r[3],vX);
-    mResult.r[2] = vX;
-    vW = M1.r[3];
-    vX = _mm_broadcastss_ps(vW);
-    vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1));
-    vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2));
-    vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3));
-    vX = _mm_mul_ps(vX,M2.r[0]);
-    vX = _mm_fmadd_ps(vY,M2.r[1],vX);
-    vX = _mm_fmadd_ps(vZ,M2.r[2],vX);
-    vX = _mm_fmadd_ps(vW,M2.r[3],vX);
-    mResult.r[3] = vX;
-    return mResult;
-}
-
-inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose
-(
-    FXMMATRIX M1,
-    CXMMATRIX M2
-)
-{
-    // Use vW to hold the original row
-    XMVECTOR vW = M1.r[0];
-    // Splat the component X,Y,Z then W
-    XMVECTOR vX = _mm_broadcastss_ps(vW);
-    XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2));
-    vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3));
-    // Perform the operation on the first row
-    vX = _mm_mul_ps(vX,M2.r[0]);
-    vX = _mm_fmadd_ps(vY,M2.r[1],vX);
-    vX = _mm_fmadd_ps(vZ,M2.r[2],vX);
-    vX = _mm_fmadd_ps(vW,M2.r[3],vX);
-    __m128 r0 = vX;
-    // Repeat for the other 3 rows
-    vW = M1.r[1];
-    vX = _mm_broadcastss_ps(vW);
-    vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1));
-    vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2));
-    vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3));
-    vX = _mm_mul_ps(vX,M2.r[0]);
-    vX = _mm_fmadd_ps(vY,M2.r[1],vX);
-    vX = _mm_fmadd_ps(vZ,M2.r[2],vX);
-    vX = _mm_fmadd_ps(vW,M2.r[3],vX);
-    __m128 r1 = vX;
-    vW = M1.r[2];
-    vX = _mm_broadcastss_ps(vW);
-    vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1));
-    vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2));
-    vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3));
-    vX = _mm_mul_ps(vX,M2.r[0]);
-    vX = _mm_fmadd_ps(vY,M2.r[1],vX);
-    vX = _mm_fmadd_ps(vZ,M2.r[2],vX);
-    vX = _mm_fmadd_ps(vW,M2.r[3],vX);
-    __m128 r2 = vX;
-    vW = M1.r[3];
-    vX = _mm_broadcastss_ps(vW);
-    vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1));
-    vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2));
-    vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3));
-    vX = _mm_mul_ps(vX,M2.r[0]);
-    vX = _mm_fmadd_ps(vY,M2.r[1],vX);
-    vX = _mm_fmadd_ps(vZ,M2.r[2],vX);
-    vX = _mm_fmadd_ps(vW,M2.r[3],vX);
-    __m128 r3 = vX;
-
-    // x.x,x.y,y.x,y.y
-    XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0));
-    // x.z,x.w,y.z,y.w
-    XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2));
-    // z.x,z.y,w.x,w.y
-    XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0));
-    // z.z,z.w,w.z,w.w
-    XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2));
-
-    XMMATRIX mResult;
-    // x.x,y.x,z.x,w.x
-    mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0));
-    // x.y,y.y,z.y,w.y
-    mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1));
-    // x.z,y.z,z.z,w.z
-    mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0));
-    // x.w,y.w,z.w,w.w
-    mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1));
-    return mResult;
-}
-
-
-//-------------------------------------------------------------------------------------
-// Permute Templates
-//-------------------------------------------------------------------------------------
-
-namespace Internal
-{
-    // Slow path fallback for permutes that do not map to a single SSE opcode.
-    template<uint32_t Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW> struct PermuteHelper
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2)
-        {
-            static const XMVECTORU32 selectMask =
-            {
-                WhichX ? 0xFFFFFFFF : 0,
-                WhichY ? 0xFFFFFFFF : 0,
-                WhichZ ? 0xFFFFFFFF : 0,
-                WhichW ? 0xFFFFFFFF : 0,
-            };
-
-            XMVECTOR shuffled1 = _mm_permute_ps(v1, Shuffle);
-            XMVECTOR shuffled2 = _mm_permute_ps(v2, Shuffle);
-
-            XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1);
-            XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2);
-
-            return _mm_or_ps(masked1, masked2);
-        }
-    };
-
-    // Fast path for permutes that only read from the first vector.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, false, false>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return _mm_permute_ps(v1, Shuffle); }
-    };
-
-    // Fast path for permutes that only read from the second vector.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, true, true>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v1); return _mm_permute_ps(v2, Shuffle); }
-    };
-
-    // Fast path for permutes that read XY from the first vector, ZW from the second.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, true, true>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v1, v2, Shuffle); }
-    };
-
-    // Fast path for permutes that read XY from the second vector, ZW from the first.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, false, false>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v2, v1, Shuffle); }
-    };
-};
-
-// General permute template
-template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW>
-    inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2)
-{
-    static_assert(PermuteX <= 7, "PermuteX template parameter out of range");
-    static_assert(PermuteY <= 7, "PermuteY template parameter out of range");
-    static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range");
-    static_assert(PermuteW <= 7, "PermuteW template parameter out of range");
-
-    const uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3);
-
-    const bool WhichX = PermuteX > 3;
-    const bool WhichY = PermuteY > 3;
-    const bool WhichZ = PermuteZ > 3;
-    const bool WhichW = PermuteW > 3;
-
-    return AVX2::Internal::PermuteHelper<Shuffle, WhichX, WhichY, WhichZ, WhichW>::Permute(V1, V2);
-}
-
-// Special-case permute templates
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); }
-
-
-//-------------------------------------------------------------------------------------
-// Swizzle Templates
-//-------------------------------------------------------------------------------------
-
-// General swizzle template
-template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW>
-    inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V)
-{
-    static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
-    static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
-    static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
-    static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
-
-    return _mm_permute_ps( V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) );
-}
-
-// Specialized swizzles
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return _mm_broadcastss_ps(V); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); }
-
-
-//-------------------------------------------------------------------------------------
-// Other Templates
-//-------------------------------------------------------------------------------------
-
-template<uint32_t Elements>
-    inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2)
-{
-    static_assert( Elements < 4, "Elements template parameter out of range" );
-    return AVX2::XMVectorPermute<Elements, (Elements + 1), (Elements + 2), (Elements + 3)>(V1, V2);
-}
-
-template<uint32_t Elements>
-    inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V)
-{
-    static_assert( Elements < 4, "Elements template parameter out of range" );
-    return AVX2::XMVectorSwizzle<Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3>(V);
-}
-
-template<uint32_t Elements>
-    inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V)
-{
-    static_assert( Elements < 4, "Elements template parameter out of range" );
-    return AVX2::XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V);
-}
-
-//-------------------------------------------------------------------------------------
-// Data conversion
-//-------------------------------------------------------------------------------------
-
-inline float XMConvertHalfToFloat( PackedVector::HALF Value )
-{
-    __m128i V1 = _mm_cvtsi32_si128( static_cast<int>(Value) );
-    __m128 V2 = _mm_cvtph_ps( V1 );
-    return _mm_cvtss_f32( V2 );
-}
-
-inline PackedVector::HALF XMConvertFloatToHalf( float Value )
-{
-    __m128 V1 = _mm_set_ss( Value );
-    __m128i V2 = _mm_cvtps_ph( V1, 0 );
-    return static_cast<PackedVector::HALF>( _mm_cvtsi128_si32(V2) );
-}
-
-inline float* XMConvertHalfToFloatStream
-(
-    _Out_writes_bytes_(sizeof(float)+OutputStride*(HalfCount-1)) float* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(2+InputStride*(HalfCount-1)) const PackedVector::HALF* pInputStream,
-    _In_ size_t InputStride,
-    _In_ size_t HalfCount
-)
-{
-    using namespace PackedVector;
-
-    assert(pOutputStream);
-    assert(pInputStream);
-    const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
-    uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    size_t i = 0;
-    size_t four = HalfCount >> 2;
-    if ( four > 0 )
-    {
-        if (InputStride == sizeof(HALF))
-        {
-            if (OutputStride == sizeof(float))
-            {
-                if ( ((uintptr_t)pFloat & 0xF) == 0)
-                {
-                    // Packed input, aligned & packed output
-                    for (size_t j = 0; j < four; ++j)
-                    {
-                        __m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
-                        pHalf += InputStride*4;
-
-                        __m128 FV = _mm_cvtph_ps( HV );
-
-                        _mm_stream_ps( reinterpret_cast<float*>(pFloat), FV );
-                        pFloat += OutputStride*4;
-                        i += 4;
-                    }
-                }
-                else
-                {
-                    // Packed input, packed output
-                    for (size_t j = 0; j < four; ++j)
-                    {
-                        __m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
-                        pHalf += InputStride*4;
-
-                        __m128 FV = _mm_cvtph_ps( HV );
-
-                        _mm_storeu_ps( reinterpret_cast<float*>(pFloat), FV );
-                        pFloat += OutputStride*4;
-                        i += 4;
-                    }
-                }
-            }
-            else
-            {
-                // Packed input, scattered output
-                for (size_t j = 0; j < four; ++j)
-                {
-                    __m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
-                    pHalf += InputStride*4;
-
-                    __m128 FV = _mm_cvtph_ps( HV );
-
-                    _mm_store_ss( reinterpret_cast<float*>(pFloat), FV );
-                    pFloat += OutputStride;
-                    *reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 1 );
-                    pFloat += OutputStride;
-                    *reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 2 );
-                    pFloat += OutputStride;
-                    *reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 3 );
-                    pFloat += OutputStride;
-                    i += 4;
-                }
-            }
-        }
-        else if (OutputStride == sizeof(float))
-        {
-            if ( ((uintptr_t)pFloat & 0xF) == 0)
-            {
-                // Scattered input, aligned & packed output
-                for (size_t j = 0; j < four; ++j)
-                {
-                    uint16_t H1 = *reinterpret_cast<const uint16_t*>(pHalf);
-                    pHalf += InputStride;
-                    uint16_t H2 = *reinterpret_cast<const uint16_t*>(pHalf);
-                    pHalf += InputStride;
-                    uint16_t H3 = *reinterpret_cast<const uint16_t*>(pHalf);
-                    pHalf += InputStride;
-                    uint16_t H4 = *reinterpret_cast<const uint16_t*>(pHalf);
-                    pHalf += InputStride;
-
-                    __m128i HV = _mm_setzero_si128();
-                    HV = _mm_insert_epi16( HV, H1, 0 );
-                    HV = _mm_insert_epi16( HV, H2, 1 );
-                    HV = _mm_insert_epi16( HV, H3, 2 );
-                    HV = _mm_insert_epi16( HV, H4, 3 );
-                    __m128 FV = _mm_cvtph_ps( HV );
-
-                    _mm_stream_ps( reinterpret_cast<float*>(pFloat), FV );
-                    pFloat += OutputStride*4;
-                    i += 4;
-                }
-            }
-            else
-            {
-                // Scattered input, packed output
-                for (size_t j = 0; j < four; ++j)
-                {
-                    uint16_t H1 = *reinterpret_cast<const uint16_t*>(pHalf);
-                    pHalf += InputStride;
-                    uint16_t H2 = *reinterpret_cast<const uint16_t*>(pHalf);
-                    pHalf += InputStride;
-                    uint16_t H3 = *reinterpret_cast<const uint16_t*>(pHalf);
-                    pHalf += InputStride;
-                    uint16_t H4 = *reinterpret_cast<const uint16_t*>(pHalf);
-                    pHalf += InputStride;
-
-                    __m128i HV = _mm_setzero_si128();
-                    HV = _mm_insert_epi16( HV, H1, 0 );
-                    HV = _mm_insert_epi16( HV, H2, 1 );
-                    HV = _mm_insert_epi16( HV, H3, 2 );
-                    HV = _mm_insert_epi16( HV, H4, 3 );
-                    __m128 FV = _mm_cvtph_ps( HV );
-
-                    _mm_storeu_ps( reinterpret_cast<float*>(pFloat), FV );
-                    pFloat += OutputStride*4;
-                    i += 4;
-                }
-            }
-        }
-    }
-
-    for (; i < HalfCount; ++i)
-    {
-        *reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
-        pHalf += InputStride;
-        pFloat += OutputStride;
-    }
-
-    return pOutputStream;
-}
-
-
-inline PackedVector::HALF* XMConvertFloatToHalfStream
-(
-    _Out_writes_bytes_(2+OutputStride*(FloatCount-1)) PackedVector::HALF* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(float)+InputStride*(FloatCount-1)) const float* pInputStream,
-    _In_ size_t InputStride,
-    _In_ size_t FloatCount
-)
-{
-    using namespace PackedVector;
-
-    assert(pOutputStream);
-    assert(pInputStream);
-    const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
-    uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    size_t i = 0;
-    size_t four = FloatCount >> 2;
-    if (four > 0)
-    {
-        if (InputStride == sizeof(float))
-        {
-            if (OutputStride == sizeof(HALF))
-            {
-                if ( ((uintptr_t)pFloat & 0xF) == 0)
-                {
-                    // Aligned and packed input, packed output
-                    for (size_t j = 0; j < four; ++j)
-                    {
-                        __m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
-                        pFloat += InputStride*4;
-
-                        __m128i HV = _mm_cvtps_ph( FV, 0 );
-
-                        _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
-                        pHalf += OutputStride*4;
-                        i += 4;
-                    }
-                }
-                else
-                {
-                    // Packed input, packed output
-                    for (size_t j = 0; j < four; ++j)
-                    {
-                        __m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
-                        pFloat += InputStride*4;
-
-                        __m128i HV = _mm_cvtps_ph( FV, 0 );
-
-                        _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
-                        pHalf += OutputStride*4;
-                        i += 4;
-                    }
-                }
-            }
-            else
-            {
-                if ( ((uintptr_t)pFloat & 0xF) == 0)
-                {
-                    // Aligned & packed input, scattered output
-                    for (size_t j = 0; j < four; ++j)
-                    {
-                        __m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
-                        pFloat += InputStride*4;
-
-                        __m128i HV = _mm_cvtps_ph( FV, 0 );
-
-                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
-                        pHalf += OutputStride;
-                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
-                        pHalf += OutputStride;
-                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
-                        pHalf += OutputStride;
-                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
-                        pHalf += OutputStride;
-                        i += 4;
-                    }
-                }
-                else
-                {
-                    // Packed input, scattered output
-                    for (size_t j = 0; j < four; ++j)
-                    {
-                        __m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
-                        pFloat += InputStride*4;
-
-                        __m128i HV = _mm_cvtps_ph( FV, 0 );
-
-                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
-                        pHalf += OutputStride;
-                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
-                        pHalf += OutputStride;
-                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
-                        pHalf += OutputStride;
-                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
-                        pHalf += OutputStride;
-                        i += 4;
-                    }
-                }
-            }
-        }
-        else if (OutputStride == sizeof(HALF))
-        {
-            // Scattered input, packed output
-            for (size_t j = 0; j < four; ++j)
-            {
-                __m128 FV1 = _mm_load_ss( reinterpret_cast<const float*>(pFloat) );
-                pFloat += InputStride;
-
-                __m128 FV2 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
-                pFloat += InputStride;
-
-                __m128 FV3 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
-                pFloat += InputStride;
-
-                __m128 FV4 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
-                pFloat += InputStride;
-
-                __m128 FV = _mm_blend_ps( FV1, FV2, 0x2 );
-                __m128 FT = _mm_blend_ps( FV3, FV4, 0x8 );
-                FV = _mm_blend_ps( FV, FT, 0xC );
-
-                __m128i HV = _mm_cvtps_ph( FV, 0 );
-
-                _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
-                pHalf += OutputStride*4;
-                i += 4;
-            }
-        }
-    }
-
-    for (; i < FloatCount; ++i)
-    {
-        *reinterpret_cast<HALF*>(pHalf) = XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]);
-        pFloat += InputStride;
-        pHalf += OutputStride;
-    }
-
-    return pOutputStream;
-}
-
-
-//-------------------------------------------------------------------------------------
-// Half2
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMLoadHalf2( _In_ const PackedVector::XMHALF2* pSource )
-{
-    assert(pSource);
-    __m128 V = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
-    return _mm_cvtph_ps( _mm_castps_si128( V ) );
-}
-
-inline void XM_CALLCONV XMStoreHalf2( _Out_ PackedVector::XMHALF2* pDestination, _In_ FXMVECTOR V )
-{
-    assert(pDestination);
-    __m128i V1 = _mm_cvtps_ph( V, 0 );
-    _mm_store_ss( reinterpret_cast<float*>(pDestination), _mm_castsi128_ps(V1) );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Half4
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMLoadHalf4( _In_ const PackedVector::XMHALF4* pSource )
-{
-    assert(pSource);
-    __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
-    return _mm_cvtph_ps( V );
-}
-
-inline void XM_CALLCONV XMStoreHalf4( _Out_ PackedVector::XMHALF4* pDestination, _In_ FXMVECTOR V )
-{
-    assert(pDestination);
-    __m128i V1 = _mm_cvtps_ph( V, 0 );
-    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), V1 );
-}
-
-}; // namespace AVX2
-
-}; // namespace DirectX;
+//-------------------------------------------------------------------------------------
+// DirectXMathAVX2.h -- AVX2 extensions for SIMD C++ Math library
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#ifdef _MSC_VER
+#pragma once
+#endif
+
+#ifdef _M_ARM
+#error AVX2 not supported on ARM platform
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER < 1700)
+#error AVX2 intrinsics requires Visual C++ 2012 or later.
+#endif
+
+#pragma warning(push)
+#pragma warning(disable : 4987)
+#include <intrin.h>
+#pragma warning(pop)
+
+#include <immintrin.h>
+
+#include <DirectXMath.h>
+#include <DirectXPackedVector.h>
+
+namespace DirectX
+{
+#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
+#define XM_CALLCONV __fastcall
+typedef const DirectX::XMVECTOR& HXMVECTOR;
+typedef const DirectX::XMMATRIX& FXMMATRIX;
+#endif
+
+namespace AVX2
+{
+
+inline bool XMVerifyAVX2Support()
+{
+    // Should return true for AMD "Excavator", Intel "Haswell" or later processors
+    // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012)
+
+    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
+    int CPUInfo[4] = {-1};
+    __cpuid( CPUInfo, 0 );
+
+    if ( CPUInfo[0] < 7 )
+        return false;
+
+    __cpuid(CPUInfo, 1 );
+
+    // We check for F16C, FMA3, AVX, OSXSAVE, SSE4.1, and SSE3
+    if ( (CPUInfo[2] & 0x38081001) != 0x38081001 )
+        return false;
+
+    __cpuidex(CPUInfo, 7, 0);
+
+    return ( (CPUInfo[1] & 0x20 ) == 0x20 );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr( _In_ const float *pValue )
+{
+    return _mm_broadcast_ss( pValue );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSplatX( FXMVECTOR V )
+{
+    return _mm_broadcastss_ps( V );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSplatY( FXMVECTOR V )
+{
+    return _mm_permute_ps( V, _MM_SHUFFLE(1, 1, 1, 1) );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSplatZ( FXMVECTOR V )
+{
+    return _mm_permute_ps( V, _MM_SHUFFLE(2, 2, 2, 2) );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSplatW( FXMVECTOR V )
+{
+    return _mm_permute_ps( V, _MM_SHUFFLE(3, 3, 3, 3) );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR V3
+)
+{
+    return _mm_fmadd_ps( V1, V2, V3 );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR V3
+)
+{
+    return _mm_fnmadd_ps( V1, V2, V3 );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSwizzle( FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3 )
+{
+    assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
+    _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
+
+    unsigned int elem[4] = { E0, E1, E2, E3 };
+    __m128i vControl = _mm_loadu_si128( reinterpret_cast<const __m128i*>(&elem[0]) );
+    return _mm_permutevar_ps( V, vControl );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorPermute( FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW )
+{
+    assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
+    _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
+
+    static const XMVECTORU32 three = { 3, 3, 3, 3 };
+
+    __declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
+    __m128i vControl = _mm_load_si128( reinterpret_cast<const __m128i*>(&elem[0]) );
+
+    __m128i vSelect = _mm_cmpgt_epi32( vControl, three );
+    vControl = _mm_castps_si128( _mm_and_ps( _mm_castsi128_ps( vControl ), three ) );
+
+    __m128 shuffled1 = _mm_permutevar_ps( V1, vControl );
+    __m128 shuffled2 = _mm_permutevar_ps( V2, vControl );
+
+    __m128 masked1 = _mm_andnot_ps( _mm_castsi128_ps( vSelect ), shuffled1 );
+    __m128 masked2 = _mm_and_ps( _mm_castsi128_ps( vSelect ), shuffled2 );
+
+    return _mm_or_ps( masked1, masked2 );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements)
+{
+    assert( Elements < 4 );
+    _Analysis_assume_( Elements < 4 );
+    return AVX2::XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3));
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements)
+{
+    assert( Elements < 4 );
+    _Analysis_assume_( Elements < 4 );
+    return AVX2::XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements)
+{
+    assert( Elements < 4 );
+    _Analysis_assume_( Elements < 4 );
+    return AVX2::XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector2
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2Transform
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] );
+    XMVECTOR vTemp = _mm_broadcastss_ps(V); // X
+    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2TransformCoord
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] );
+    XMVECTOR vTemp = _mm_broadcastss_ps(V); // X
+    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
+    XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3));
+    vResult = _mm_div_ps( vResult, W );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2TransformNormal
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_mul_ps( vResult, M.r[1] );
+    XMVECTOR vTemp = _mm_broadcastss_ps(V); // X
+    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
+    return vResult;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector3
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3Transform
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
+    vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult );
+    vTemp = _mm_broadcastss_ps(V); // X
+    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVector3TransformCoord
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
+    vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult );
+    vTemp = _mm_broadcastss_ps(V); // X
+    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
+    XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3));
+    vResult = _mm_div_ps( vResult, W );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVector3TransformNormal
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
+    vResult = _mm_mul_ps( vResult, M.r[2] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult );
+    vTemp = _mm_broadcastss_ps(V); // X
+    vResult =
_mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +XMMATRIX XM_CALLCONV XMMatrixMultiply(CXMMATRIX M1, CXMMATRIX M2); + +inline XMVECTOR XM_CALLCONV XMVector3Project +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = AVX2::XMMatrixMultiply(World, View); + Transform = AVX2::XMMatrixMultiply(Transform, Projection); + + XMVECTOR Result = AVX2::XMVector3TransformCoord(V, Transform); + + Result = AVX2::XMVectorMultiplyAdd(Result, Scale, Offset); + + return Result; +} + +inline XMVECTOR XM_CALLCONV XMVector3Unproject +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = AVX2::XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = AVX2::XMMatrixMultiply(World, View); + Transform = AVX2::XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + XMVECTOR Result = AVX2::XMVectorMultiplyAdd(V, Scale, Offset); + + return AVX2::XMVector3TransformCoord(Result, Transform); +} + + +//------------------------------------------------------------------------------------- +// Vector4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector4Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(3,3,3,3)); // W + vResult = _mm_mul_ps( vResult, M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_fmadd_ps( vTemp, M.r[2], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_broadcastss_ps(V); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Matrix +//------------------------------------------------------------------------------------- + +inline XMMATRIX XM_CALLCONV XMMatrixMultiply +( + CXMMATRIX M1, + CXMMATRIX M2 +) +{ + XMMATRIX mResult; + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_broadcastss_ps(vW); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[0] = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_broadcastss_ps(vW); + vY = 
_mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[1] = vX; + vW = M1.r[2]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[2] = vX; + vW = M1.r[3]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[3] = vX; + return mResult; +} + +inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose +( + FXMMATRIX M1, + CXMMATRIX M2 +) +{ + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_broadcastss_ps(vW); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r0 = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r1 = vX; + vW = M1.r[2]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r2 = vX; + vW = M1.r[3]; + vX = _mm_broadcastss_ps(vW); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r3 = vX; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2)); + + XMMATRIX mResult; + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); + return mResult; +} + + +//------------------------------------------------------------------------------------- +// Permute 
Templates
+//-------------------------------------------------------------------------------------
+
+namespace Internal
+{
+    // Slow path fallback for permutes that do not map to a single SSE opcode.
+    template<uint32_t Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW> struct PermuteHelper
+    {
+        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2)
+        {
+            static const XMVECTORU32 selectMask =
+            {
+                WhichX ? 0xFFFFFFFF : 0,
+                WhichY ? 0xFFFFFFFF : 0,
+                WhichZ ? 0xFFFFFFFF : 0,
+                WhichW ? 0xFFFFFFFF : 0,
+            };
+
+            XMVECTOR shuffled1 = _mm_permute_ps(v1, Shuffle);
+            XMVECTOR shuffled2 = _mm_permute_ps(v2, Shuffle);
+
+            XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1);
+            XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2);
+
+            return _mm_or_ps(masked1, masked2);
+        }
+    };
+
+    // Fast path for permutes that only read from the first vector.
+    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, false, false>
+    {
+        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return _mm_permute_ps(v1, Shuffle); }
+    };
+
+    // Fast path for permutes that only read from the second vector.
+    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, true, true>
+    {
+        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2){ (v1); return _mm_permute_ps(v2, Shuffle); }
+    };
+
+    // Fast path for permutes that read XY from the first vector, ZW from the second.
+    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, true, true>
+    {
+        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v1, v2, Shuffle); }
+    };
+
+    // Fast path for permutes that read XY from the second vector, ZW from the first.
+    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, false, false>
+    {
+        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v2, v1, Shuffle); }
+    };
+};
+
+// General permute template
+template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW>
+    inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2)
+{
+    static_assert(PermuteX <= 7, "PermuteX template parameter out of range");
+    static_assert(PermuteY <= 7, "PermuteY template parameter out of range");
+    static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range");
+    static_assert(PermuteW <= 7, "PermuteW template parameter out of range");
+
+    const uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3);
+
+    const bool WhichX = PermuteX > 3;
+    const bool WhichY = PermuteY > 3;
+    const bool WhichZ = PermuteZ > 3;
+    const bool WhichW = PermuteW > 3;
+
+    return AVX2::Internal::PermuteHelper<Shuffle, WhichX, WhichY, WhichZ, WhichW>::Permute(V1, V2);
+}
+
+// Special-case permute templates
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); }
+template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); } + + +//------------------------------------------------------------------------------------- +// Swizzle Templates +//------------------------------------------------------------------------------------- + +// General swizzle template +template + inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V) +{ + static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range"); + static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range"); + static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range"); + static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range"); + + return _mm_permute_ps( V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) ); +} + +// Specialized swizzles +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return _mm_broadcastss_ps(V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); } + + +//------------------------------------------------------------------------------------- +// Other Templates +//------------------------------------------------------------------------------------- + +template + inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return AVX2::XMVectorPermute(V1, V2); +} + +template + inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return AVX2::XMVectorSwizzle(V); +} + +template + inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return AVX2::XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V); +} + +//------------------------------------------------------------------------------------- +// Data conversion +//------------------------------------------------------------------------------------- + +inline float XMConvertHalfToFloat( PackedVector::HALF Value ) +{ + __m128i V1 = _mm_cvtsi32_si128( static_cast(Value) ); + __m128 V2 = _mm_cvtph_ps( V1 ); + return _mm_cvtss_f32( V2 ); +} + +inline PackedVector::HALF XMConvertFloatToHalf( float Value ) +{ + __m128 V1 = _mm_set_ss( Value ); + __m128i V2 = _mm_cvtps_ph( V1, 0 ); + return static_cast( _mm_cvtsi128_si32(V2) ); +} + +inline float* 
XMConvertHalfToFloatStream +( + _Out_writes_bytes_(sizeof(float)+OutputStride*(HalfCount-1)) float* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(2+InputStride*(HalfCount-1)) const PackedVector::HALF* pInputStream, + _In_ size_t InputStride, + _In_ size_t HalfCount +) +{ + using namespace PackedVector; + + assert(pOutputStream); + assert(pInputStream); + const uint8_t* pHalf = reinterpret_cast(pInputStream); + uint8_t* pFloat = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = HalfCount >> 2; + if ( four > 0 ) + { + if (InputStride == sizeof(HALF)) + { + if (OutputStride == sizeof(float)) + { + if ( ((uintptr_t)pFloat & 0xF) == 0) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); + pHalf += InputStride*4; + + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_stream_ps( reinterpret_cast(pFloat), FV ); + pFloat += OutputStride*4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); + pHalf += InputStride*4; + + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_storeu_ps( reinterpret_cast(pFloat), FV ); + pFloat += OutputStride*4; + i += 4; + } + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); + pHalf += InputStride*4; + + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_store_ss( reinterpret_cast(pFloat), FV ); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 1 ); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 2 ); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 3 ); + pFloat += OutputStride; + i += 4; + } + } + } + else if (OutputStride == sizeof(float)) + { + if ( ((uintptr_t)pFloat & 0xF) == 0) + { + // Scattered input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16( HV, H1, 0 ); + HV = _mm_insert_epi16( HV, H2, 1 ); + HV = _mm_insert_epi16( HV, H3, 2 ); + HV = _mm_insert_epi16( HV, H4, 3 ); + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_stream_ps( reinterpret_cast(pFloat ), FV ); + pFloat += OutputStride*4; + i += 4; + } + } + else + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16( HV, H1, 0 ); + HV = _mm_insert_epi16( HV, H2, 1 ); + HV = _mm_insert_epi16( HV, H3, 2 ); + HV = _mm_insert_epi16( HV, H4, 3 ); + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_storeu_ps( reinterpret_cast(pFloat ), FV ); + pFloat += OutputStride*4; + i += 4; + } + } + } + } + + for (; i < HalfCount; ++i) + { + *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); + pHalf += InputStride; + pFloat += OutputStride; + } + + return pOutputStream; +} + + +inline PackedVector::HALF* XMConvertFloatToHalfStream +( + 
_Out_writes_bytes_(2+OutputStride*(FloatCount-1)) PackedVector::HALF* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(float)+InputStride*(FloatCount-1)) const float* pInputStream, + _In_ size_t InputStride, + _In_ size_t FloatCount +) +{ + using namespace PackedVector; + + assert(pOutputStream); + assert(pInputStream); + const uint8_t* pFloat = reinterpret_cast(pInputStream); + uint8_t* pHalf = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = FloatCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(float)) + { + if (OutputStride == sizeof(HALF)) + { + if ( ((uintptr_t)pFloat & 0xF) == 0) + { + // Aligned and packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_load_ps( reinterpret_cast(pFloat) ); + pFloat += InputStride*4; + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); + pHalf += OutputStride*4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_loadu_ps( reinterpret_cast(pFloat) ); + pFloat += InputStride*4; + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); + pHalf += OutputStride*4; + i += 4; + } + } + } + else + { + if ( ((uintptr_t)pFloat & 0xF) == 0) + { + // Aligned & packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_load_ps( reinterpret_cast(pFloat) ); + pFloat += InputStride*4; + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 0 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 1 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 2 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 3 ) ); + pHalf += OutputStride; + i += 4; + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_loadu_ps( reinterpret_cast(pFloat) ); + pFloat += InputStride*4; + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 0 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 1 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 2 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 3 ) ); + pHalf += OutputStride; + i += 4; + } + } + } + } + else if (OutputStride == sizeof(HALF)) + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps( FV1, FV2, 0x2 ); + __m128 FT = _mm_blend_ps( FV3, FV4, 0x8 ); + FV = _mm_blend_ps( FV, FT, 0xC ); + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); + pHalf += OutputStride*4; + i += 4; + } + } + } + + for (; i < FloatCount; ++i) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + + return pOutputStream; +} + + 
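+//-------------------------------------------------------------------------------------
+// Usage note (editor's illustrative sketch; not part of the original header):
+// these overrides assume the caller has already confirmed hardware support, so a
+// typical pattern is to cache the CPUID check once and branch to the AVX2 path.
+// TransformPoint is a hypothetical application-side helper, not a library function.
+//
+//      XMVECTOR XM_CALLCONV TransformPoint(FXMVECTOR V, CXMMATRIX M)
+//      {
+//          static const bool s_hasAVX2 = DirectX::AVX2::XMVerifyAVX2Support();
+//          return s_hasAVX2 ? DirectX::AVX2::XMVector3TransformCoord(V, M)
+//                           : DirectX::XMVector3TransformCoord(V, M);
+//      }
+//
+// The same guard applies to the stream converters above, e.g. expanding a packed
+// half-precision buffer in place of the scalar XMConvertHalfToFloatStream:
+//
+//      const DirectX::PackedVector::HALF* in = ...;    // hypothetical input buffer
+//      float out[1024];                                // hypothetical output buffer
+//      DirectX::AVX2::XMConvertHalfToFloatStream(out, sizeof(float),
+//          in, sizeof(DirectX::PackedVector::HALF), 1024);
+//-------------------------------------------------------------------------------------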
+//-------------------------------------------------------------------------------------
+// Half2
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMLoadHalf2( _In_ const PackedVector::XMHALF2* pSource )
+{
+    assert(pSource);
+    __m128 V = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
+    return _mm_cvtph_ps( _mm_castps_si128( V ) );
+}
+
+inline void XM_CALLCONV XMStoreHalf2( _Out_ PackedVector::XMHALF2* pDestination, _In_ FXMVECTOR V )
+{
+    assert(pDestination);
+    __m128i V1 = _mm_cvtps_ph( V, 0 );
+    _mm_store_ss( reinterpret_cast<float*>(pDestination), _mm_castsi128_ps(V1) );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Half4
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMLoadHalf4( _In_ const PackedVector::XMHALF4* pSource )
+{
+    assert(pSource);
+    __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
+    return _mm_cvtph_ps( V );
+}
+
+inline void XM_CALLCONV XMStoreHalf4( _Out_ PackedVector::XMHALF4* pDestination, _In_ FXMVECTOR V )
+{
+    assert(pDestination);
+    __m128i V1 = _mm_cvtps_ph( V, 0 );
+    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), V1 );
+}
+
+}; // namespace AVX2
+
+}; // namespace DirectX;
diff --git a/Extensions/DirectXMathBE.h b/Extensions/DirectXMathBE.h
index 3cc4e6c..3b8e4aa 100644
--- a/Extensions/DirectXMathBE.h
+++ b/Extensions/DirectXMathBE.h
@@ -1,103 +1,103 @@
-//-------------------------------------------------------------------------------------
-// DirectXMathBE.h -- Big-endian swap extensions for SIMD C++ Math library
-//
-// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
-// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
-// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
-// PARTICULAR PURPOSE.
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#ifdef _MSC_VER -#pragma once -#endif - -#pragma warning(push) -#pragma warning(disable : 4987) -#include -#pragma warning(pop) - -#ifndef _M_ARM -#include -#endif - -#include - -namespace DirectX -{ -#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV) -#define XM_CALLCONV __fastcall -typedef const DirectX::XMVECTOR& HXMVECTOR; -typedef const DirectX::XMMATRIX& FXMMATRIX; -#endif - -inline XMVECTOR XM_CALLCONV XMVectorEndian -( - FXMVECTOR V -) -{ -#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - static const XMVECTORU32 idx = { 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F }; - - int8x8x2_t tbl; - tbl.val[0] = vget_low_f32(V); - tbl.val[1] = vget_high_f32(V); - - const __n64 rL = vtbl2_u8( tbl, vget_low_f32(idx) ); - const __n64 rH = vtbl2_u8( tbl, vget_high_f32(idx) ); - return vcombine_f32( rL, rH ); -#else - XMVECTORU32 E; - E.v = V; - uint32_t value = E.u[0]; - E.u[0] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); - value = E.u[1]; - E.u[1] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); - value = E.u[2]; - E.u[2] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); - value = E.u[3]; - E.u[3] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); - return E.v; -#endif -} - - -#ifndef _M_ARM -namespace SSSE3 -{ - -inline bool XMVerifySSSE3Support() -{ - // Should return true on AMD Bulldozer, Intel Core i7/i5/i3, Intel Atom, or later processors - - // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx - int CPUInfo[4] = {-1}; - __cpuid( CPUInfo, 0 ); - - if ( CPUInfo[0] < 1 ) - return false; - - __cpuid(CPUInfo, 1 ); - - // Check for SSSE3 instruction set. - return ( (CPUInfo[2] & 0x200) != 0 ); -} - -inline XMVECTOR XM_CALLCONV XMVectorEndian -( - FXMVECTOR V -) -{ - static const XMVECTORU32 idx = { 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F }; - - __m128i Result = _mm_shuffle_epi8( _mm_castps_si128(V), idx ); - return _mm_castsi128_ps( Result ); -} - -}; // namespace SSSE3 -#endif // !_M_ARM - +//------------------------------------------------------------------------------------- +// DirectXMathBE.h -- Big-endian swap extensions for SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma once +#endif + +#pragma warning(push) +#pragma warning(disable : 4987) +#include +#pragma warning(pop) + +#ifndef _M_ARM +#include +#endif + +#include + +namespace DirectX +{ +#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV) +#define XM_CALLCONV __fastcall +typedef const DirectX::XMVECTOR& HXMVECTOR; +typedef const DirectX::XMMATRIX& FXMMATRIX; +#endif + +inline XMVECTOR XM_CALLCONV XMVectorEndian +( + FXMVECTOR V +) +{ +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + static const XMVECTORU32 idx = { 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F }; + + int8x8x2_t tbl; + tbl.val[0] = vget_low_f32(V); + tbl.val[1] = vget_high_f32(V); + + const __n64 rL = vtbl2_u8( tbl, vget_low_f32(idx) ); + const __n64 rH = vtbl2_u8( tbl, vget_high_f32(idx) ); + return vcombine_f32( rL, rH ); +#else + XMVECTORU32 E; + E.v = V; + uint32_t value = E.u[0]; + E.u[0] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); + value = E.u[1]; + E.u[1] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); + value = E.u[2]; + E.u[2] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); + value = E.u[3]; + E.u[3] = ( (value << 24) | ((value & 0xFF00) << 8) | ((value & 0xFF0000) >> 8) | (value >> 24) ); + return E.v; +#endif +} + + +#ifndef _M_ARM +namespace SSSE3 +{ + +inline bool XMVerifySSSE3Support() +{ + // Should return true on AMD Bulldozer, Intel Core i7/i5/i3, Intel Atom, or later processors + + // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx + int CPUInfo[4] = {-1}; + __cpuid( CPUInfo, 0 ); + + if ( CPUInfo[0] < 1 ) + return false; + + __cpuid(CPUInfo, 1 ); + + // Check for SSSE3 instruction set. + return ( (CPUInfo[2] & 0x200) != 0 ); +} + +inline XMVECTOR XM_CALLCONV XMVectorEndian +( + FXMVECTOR V +) +{ + static const XMVECTORU32 idx = { 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F }; + + __m128i Result = _mm_shuffle_epi8( _mm_castps_si128(V), idx ); + return _mm_castsi128_ps( Result ); +} + +}; // namespace SSSE3 +#endif // !_M_ARM + }; // namespace DirectX; \ No newline at end of file diff --git a/Extensions/DirectXMathF16C.h b/Extensions/DirectXMathF16C.h index d486ed2..902f661 100644 --- a/Extensions/DirectXMathF16C.h +++ b/Extensions/DirectXMathF16C.h @@ -1,410 +1,410 @@ -//------------------------------------------------------------------------------------- -// DirectXMathF16C.h -- F16C/CVT16 extensions for SIMD C++ Math library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#ifdef _MSC_VER -#pragma once -#endif - -#ifdef _M_ARM -#error F16C not supported on ARM platform -#endif - -#if defined(_MSC_VER) && (_MSC_VER < 1700) -#error F16C/CVT16 intrinsics requires Visual C++ 2012 or later. 
-#endif - -#pragma warning(push) -#pragma warning(disable : 4987) -#include -#pragma warning(pop) - -#include - -#include -#include - -namespace DirectX -{ -#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV) -#define XM_CALLCONV __fastcall -typedef const DirectX::XMVECTOR& HXMVECTOR; -typedef const DirectX::XMMATRIX& FXMMATRIX; -#endif - -namespace F16C -{ - -inline bool XMVerifyF16CSupport() -{ - // Should return true for AMD "Piledriver" and Intel "Ivy Bridge" processors - // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012) - - // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx - int CPUInfo[4] = {-1}; - __cpuid( CPUInfo, 0 ); - - if ( CPUInfo[0] < 1 ) - return false; - - __cpuid(CPUInfo, 1 ); - - // We check for F16C, AVX, OSXSAVE, and SSE4.1 - return ( (CPUInfo[2] & 0x38080000 ) == 0x38080000 ); -} - - -//------------------------------------------------------------------------------------- -// Data conversion -//------------------------------------------------------------------------------------- - -inline float XMConvertHalfToFloat( PackedVector::HALF Value ) -{ - __m128i V1 = _mm_cvtsi32_si128( static_cast(Value) ); - __m128 V2 = _mm_cvtph_ps( V1 ); - return _mm_cvtss_f32( V2 ); -} - -inline PackedVector::HALF XMConvertFloatToHalf( float Value ) -{ - __m128 V1 = _mm_set_ss( Value ); - __m128i V2 = _mm_cvtps_ph( V1, 0 ); - return static_cast( _mm_cvtsi128_si32(V2) ); -} - -inline float* XMConvertHalfToFloatStream -( - _Out_writes_bytes_(sizeof(float)+OutputStride*(HalfCount-1)) float* pOutputStream, - _In_ size_t OutputStride, - _In_reads_bytes_(2+InputStride*(HalfCount-1)) const PackedVector::HALF* pInputStream, - _In_ size_t InputStride, - _In_ size_t HalfCount -) -{ - using namespace PackedVector; - - assert(pOutputStream); - assert(pInputStream); - const uint8_t* pHalf = reinterpret_cast(pInputStream); - uint8_t* pFloat = reinterpret_cast(pOutputStream); - - size_t i = 0; - size_t four = HalfCount >> 2; - if ( four > 0 ) - { - if (InputStride == sizeof(HALF)) - { - if (OutputStride == sizeof(float)) - { - if ( ((uintptr_t)pFloat & 0xF) == 0) - { - // Packed input, aligned & packed output - for (size_t j = 0; j < four; ++j) - { - __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); - pHalf += InputStride*4; - - __m128 FV = _mm_cvtph_ps( HV ); - - _mm_stream_ps( reinterpret_cast(pFloat), FV ); - pFloat += OutputStride*4; - i += 4; - } - } - else - { - // Packed input, packed output - for (size_t j = 0; j < four; ++j) - { - __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); - pHalf += InputStride*4; - - __m128 FV = _mm_cvtph_ps( HV ); - - _mm_storeu_ps( reinterpret_cast(pFloat), FV ); - pFloat += OutputStride*4; - i += 4; - } - } - } - else - { - // Packed input, scattered output - for (size_t j = 0; j < four; ++j) - { - __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); - pHalf += InputStride*4; - - __m128 FV = _mm_cvtph_ps( HV ); - - _mm_store_ss( reinterpret_cast(pFloat), FV ); - pFloat += OutputStride; - *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 1 ); - pFloat += OutputStride; - *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 2 ); - pFloat += OutputStride; - *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 3 ); - pFloat += OutputStride; - i += 4; - } - } - } - else if (OutputStride == sizeof(float)) - { - if ( ((uintptr_t)pFloat & 0xF) == 0) - { - // Scattered input, aligned & packed output - for (size_t j = 0; j < four; ++j) - { - uint16_t H1 = *reinterpret_cast(pHalf); - 
pHalf += InputStride; - uint16_t H2 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H3 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H4 = *reinterpret_cast(pHalf); - pHalf += InputStride; - - __m128i HV = _mm_setzero_si128(); - HV = _mm_insert_epi16( HV, H1, 0 ); - HV = _mm_insert_epi16( HV, H2, 1 ); - HV = _mm_insert_epi16( HV, H3, 2 ); - HV = _mm_insert_epi16( HV, H4, 3 ); - __m128 FV = _mm_cvtph_ps( HV ); - - _mm_stream_ps( reinterpret_cast(pFloat ), FV ); - pFloat += OutputStride*4; - i += 4; - } - } - else - { - // Scattered input, packed output - for (size_t j = 0; j < four; ++j) - { - uint16_t H1 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H2 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H3 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H4 = *reinterpret_cast(pHalf); - pHalf += InputStride; - - __m128i HV = _mm_setzero_si128(); - HV = _mm_insert_epi16( HV, H1, 0 ); - HV = _mm_insert_epi16( HV, H2, 1 ); - HV = _mm_insert_epi16( HV, H3, 2 ); - HV = _mm_insert_epi16( HV, H4, 3 ); - __m128 FV = _mm_cvtph_ps( HV ); - - _mm_storeu_ps( reinterpret_cast(pFloat ), FV ); - pFloat += OutputStride*4; - i += 4; - } - } - } - } - - for (; i < HalfCount; ++i) - { - *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); - pHalf += InputStride; - pFloat += OutputStride; - } - - return pOutputStream; -} - - -inline PackedVector::HALF* XMConvertFloatToHalfStream -( - _Out_writes_bytes_(2+OutputStride*(FloatCount-1)) PackedVector::HALF* pOutputStream, - _In_ size_t OutputStride, - _In_reads_bytes_(sizeof(float)+InputStride*(FloatCount-1)) const float* pInputStream, - _In_ size_t InputStride, - _In_ size_t FloatCount -) -{ - using namespace PackedVector; - - assert(pOutputStream); - assert(pInputStream); - const uint8_t* pFloat = reinterpret_cast(pInputStream); - uint8_t* pHalf = reinterpret_cast(pOutputStream); - - size_t i = 0; - size_t four = FloatCount >> 2; - if (four > 0) - { - if (InputStride == sizeof(float)) - { - if (OutputStride == sizeof(HALF)) - { - if ( ((uintptr_t)pFloat & 0xF) == 0) - { - // Aligned and packed input, packed output - for (size_t j = 0; j < four; ++j) - { - __m128 FV = _mm_load_ps( reinterpret_cast(pFloat) ); - pFloat += InputStride*4; - - __m128i HV = _mm_cvtps_ph( FV, 0 ); - - _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); - pHalf += OutputStride*4; - i += 4; - } - } - else - { - // Packed input, packed output - for (size_t j = 0; j < four; ++j) - { - __m128 FV = _mm_loadu_ps( reinterpret_cast(pFloat) ); - pFloat += InputStride*4; - - __m128i HV = _mm_cvtps_ph( FV, 0 ); - - _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); - pHalf += OutputStride*4; - i += 4; - } - } - } - else - { - if ( ((uintptr_t)pFloat & 0xF) == 0) - { - // Aligned & packed input, scattered output - for (size_t j = 0; j < four; ++j) - { - __m128 FV = _mm_load_ps( reinterpret_cast(pFloat) ); - pFloat += InputStride*4; - - __m128i HV = _mm_cvtps_ph( FV, 0 ); - - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 0 ) ); - pHalf += OutputStride; - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 1 ) ); - pHalf += OutputStride; - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 2 ) ); - pHalf += OutputStride; - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 3 ) ); - pHalf += OutputStride; - i += 4; - } - } - else - { - // Packed input, scattered output - for (size_t j = 0; j < four; ++j) - { - __m128 FV = _mm_loadu_ps( 
reinterpret_cast(pFloat) ); - pFloat += InputStride*4; - - __m128i HV = _mm_cvtps_ph( FV, 0 ); - - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 0 ) ); - pHalf += OutputStride; - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 1 ) ); - pHalf += OutputStride; - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 2 ) ); - pHalf += OutputStride; - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 3 ) ); - pHalf += OutputStride; - i += 4; - } - } - } - } - else if (OutputStride == sizeof(HALF)) - { - // Scattered input, packed output - for (size_t j = 0; j < four; ++j) - { - __m128 FV1 = _mm_load_ss( reinterpret_cast(pFloat) ); - pFloat += InputStride; - - __m128 FV2 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); - pFloat += InputStride; - - __m128 FV3 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); - pFloat += InputStride; - - __m128 FV4 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); - pFloat += InputStride; - - __m128 FV = _mm_blend_ps( FV1, FV2, 0x2 ); - __m128 FT = _mm_blend_ps( FV3, FV4, 0x8 ); - FV = _mm_blend_ps( FV, FT, 0xC ); - - __m128i HV = _mm_cvtps_ph( FV, 0 ); - - _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); - pHalf += OutputStride*4; - i += 4; - } - } - } - - for (; i < FloatCount; ++i) - { - *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); - pFloat += InputStride; - pHalf += OutputStride; - } - - return pOutputStream; -} - - -//------------------------------------------------------------------------------------- -// Half2 -//------------------------------------------------------------------------------------- - -inline XMVECTOR XM_CALLCONV XMLoadHalf2( _In_ const PackedVector::XMHALF2* pSource ) -{ - assert(pSource); - __m128 V = _mm_load_ss( reinterpret_cast(pSource) ); - return _mm_cvtph_ps( _mm_castps_si128( V ) ); -} - -inline void XM_CALLCONV XMStoreHalf2( _Out_ PackedVector::XMHALF2* pDestination, _In_ FXMVECTOR V ) -{ - assert(pDestination); - __m128i V1 = _mm_cvtps_ph( V, 0 ); - _mm_store_ss( reinterpret_cast(pDestination), _mm_castsi128_ps(V1) ); -} - - -//------------------------------------------------------------------------------------- -// Half4 -//------------------------------------------------------------------------------------- - -inline XMVECTOR XM_CALLCONV XMLoadHalf4( _In_ const PackedVector::XMHALF4* pSource ) -{ - assert(pSource); - __m128i V = _mm_loadl_epi64( reinterpret_cast(pSource) ); - return _mm_cvtph_ps( V ); -} - -inline void XM_CALLCONV XMStoreHalf4( _Out_ PackedVector::XMHALF4* pDestination, _In_ FXMVECTOR V ) -{ - assert(pDestination); - __m128i V1 = _mm_cvtps_ph( V, 0 ); - _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), V1 ); -} - -}; // namespace F16C - +//------------------------------------------------------------------------------------- +// DirectXMathF16C.h -- F16C/CVT16 extensions for SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma once +#endif + +#ifdef _M_ARM +#error F16C not supported on ARM platform +#endif + +#if defined(_MSC_VER) && (_MSC_VER < 1700) +#error F16C/CVT16 intrinsics requires Visual C++ 2012 or later. +#endif + +#pragma warning(push) +#pragma warning(disable : 4987) +#include +#pragma warning(pop) + +#include + +#include +#include + +namespace DirectX +{ +#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV) +#define XM_CALLCONV __fastcall +typedef const DirectX::XMVECTOR& HXMVECTOR; +typedef const DirectX::XMMATRIX& FXMMATRIX; +#endif + +namespace F16C +{ + +inline bool XMVerifyF16CSupport() +{ + // Should return true for AMD "Piledriver" and Intel "Ivy Bridge" processors + // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012) + + // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx + int CPUInfo[4] = {-1}; + __cpuid( CPUInfo, 0 ); + + if ( CPUInfo[0] < 1 ) + return false; + + __cpuid(CPUInfo, 1 ); + + // We check for F16C, AVX, OSXSAVE, and SSE4.1 + return ( (CPUInfo[2] & 0x38080000 ) == 0x38080000 ); +} + + +//------------------------------------------------------------------------------------- +// Data conversion +//------------------------------------------------------------------------------------- + +inline float XMConvertHalfToFloat( PackedVector::HALF Value ) +{ + __m128i V1 = _mm_cvtsi32_si128( static_cast(Value) ); + __m128 V2 = _mm_cvtph_ps( V1 ); + return _mm_cvtss_f32( V2 ); +} + +inline PackedVector::HALF XMConvertFloatToHalf( float Value ) +{ + __m128 V1 = _mm_set_ss( Value ); + __m128i V2 = _mm_cvtps_ph( V1, 0 ); + return static_cast( _mm_cvtsi128_si32(V2) ); +} + +inline float* XMConvertHalfToFloatStream +( + _Out_writes_bytes_(sizeof(float)+OutputStride*(HalfCount-1)) float* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(2+InputStride*(HalfCount-1)) const PackedVector::HALF* pInputStream, + _In_ size_t InputStride, + _In_ size_t HalfCount +) +{ + using namespace PackedVector; + + assert(pOutputStream); + assert(pInputStream); + const uint8_t* pHalf = reinterpret_cast(pInputStream); + uint8_t* pFloat = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = HalfCount >> 2; + if ( four > 0 ) + { + if (InputStride == sizeof(HALF)) + { + if (OutputStride == sizeof(float)) + { + if ( ((uintptr_t)pFloat & 0xF) == 0) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); + pHalf += InputStride*4; + + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_stream_ps( reinterpret_cast(pFloat), FV ); + pFloat += OutputStride*4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); + pHalf += InputStride*4; + + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_storeu_ps( reinterpret_cast(pFloat), FV ); + pFloat += OutputStride*4; + i += 4; + } + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); + pHalf += InputStride*4; + + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_store_ss( reinterpret_cast(pFloat), FV ); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 1 ); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = 
_mm_extract_ps( FV, 2 ); + pFloat += OutputStride; + *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 3 ); + pFloat += OutputStride; + i += 4; + } + } + } + else if (OutputStride == sizeof(float)) + { + if ( ((uintptr_t)pFloat & 0xF) == 0) + { + // Scattered input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16( HV, H1, 0 ); + HV = _mm_insert_epi16( HV, H2, 1 ); + HV = _mm_insert_epi16( HV, H3, 2 ); + HV = _mm_insert_epi16( HV, H4, 3 ); + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_stream_ps( reinterpret_cast(pFloat ), FV ); + pFloat += OutputStride*4; + i += 4; + } + } + else + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + uint16_t H1 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H2 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H3 = *reinterpret_cast(pHalf); + pHalf += InputStride; + uint16_t H4 = *reinterpret_cast(pHalf); + pHalf += InputStride; + + __m128i HV = _mm_setzero_si128(); + HV = _mm_insert_epi16( HV, H1, 0 ); + HV = _mm_insert_epi16( HV, H2, 1 ); + HV = _mm_insert_epi16( HV, H3, 2 ); + HV = _mm_insert_epi16( HV, H4, 3 ); + __m128 FV = _mm_cvtph_ps( HV ); + + _mm_storeu_ps( reinterpret_cast(pFloat ), FV ); + pFloat += OutputStride*4; + i += 4; + } + } + } + } + + for (; i < HalfCount; ++i) + { + *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); + pHalf += InputStride; + pFloat += OutputStride; + } + + return pOutputStream; +} + + +inline PackedVector::HALF* XMConvertFloatToHalfStream +( + _Out_writes_bytes_(2+OutputStride*(FloatCount-1)) PackedVector::HALF* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(float)+InputStride*(FloatCount-1)) const float* pInputStream, + _In_ size_t InputStride, + _In_ size_t FloatCount +) +{ + using namespace PackedVector; + + assert(pOutputStream); + assert(pInputStream); + const uint8_t* pFloat = reinterpret_cast(pInputStream); + uint8_t* pHalf = reinterpret_cast(pOutputStream); + + size_t i = 0; + size_t four = FloatCount >> 2; + if (four > 0) + { + if (InputStride == sizeof(float)) + { + if (OutputStride == sizeof(HALF)) + { + if ( ((uintptr_t)pFloat & 0xF) == 0) + { + // Aligned and packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_load_ps( reinterpret_cast(pFloat) ); + pFloat += InputStride*4; + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); + pHalf += OutputStride*4; + i += 4; + } + } + else + { + // Packed input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_loadu_ps( reinterpret_cast(pFloat) ); + pFloat += InputStride*4; + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); + pHalf += OutputStride*4; + i += 4; + } + } + } + else + { + if ( ((uintptr_t)pFloat & 0xF) == 0) + { + // Aligned & packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_load_ps( reinterpret_cast(pFloat) ); + pFloat += InputStride*4; + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 0 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( 
_mm_extract_epi16( HV, 1 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 2 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 3 ) ); + pHalf += OutputStride; + i += 4; + } + } + else + { + // Packed input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV = _mm_loadu_ps( reinterpret_cast(pFloat) ); + pFloat += InputStride*4; + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 0 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 1 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 2 ) ); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 3 ) ); + pHalf += OutputStride; + i += 4; + } + } + } + } + else if (OutputStride == sizeof(HALF)) + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps( FV1, FV2, 0x2 ); + __m128 FT = _mm_blend_ps( FV3, FV4, 0x8 ); + FV = _mm_blend_ps( FV, FT, 0xC ); + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); + pHalf += OutputStride*4; + i += 4; + } + } + } + + for (; i < FloatCount; ++i) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + + return pOutputStream; +} + + +//------------------------------------------------------------------------------------- +// Half2 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMLoadHalf2( _In_ const PackedVector::XMHALF2* pSource ) +{ + assert(pSource); + __m128 V = _mm_load_ss( reinterpret_cast(pSource) ); + return _mm_cvtph_ps( _mm_castps_si128( V ) ); +} + +inline void XM_CALLCONV XMStoreHalf2( _Out_ PackedVector::XMHALF2* pDestination, _In_ FXMVECTOR V ) +{ + assert(pDestination); + __m128i V1 = _mm_cvtps_ph( V, 0 ); + _mm_store_ss( reinterpret_cast(pDestination), _mm_castsi128_ps(V1) ); +} + + +//------------------------------------------------------------------------------------- +// Half4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMLoadHalf4( _In_ const PackedVector::XMHALF4* pSource ) +{ + assert(pSource); + __m128i V = _mm_loadl_epi64( reinterpret_cast(pSource) ); + return _mm_cvtph_ps( V ); +} + +inline void XM_CALLCONV XMStoreHalf4( _Out_ PackedVector::XMHALF4* pDestination, _In_ FXMVECTOR V ) +{ + assert(pDestination); + __m128i V1 = _mm_cvtps_ph( V, 0 ); + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), V1 ); +} + +}; // namespace F16C + }; // namespace DirectX; \ No newline at end of file diff --git a/Extensions/DirectXMathFMA3.h b/Extensions/DirectXMathFMA3.h index 6997d9d..5874014 100644 --- a/Extensions/DirectXMathFMA3.h +++ b/Extensions/DirectXMathFMA3.h @@ -1,405 +1,405 @@ -//------------------------------------------------------------------------------------- -// DirectXMathFMA3.h -- FMA3 extensions for SIMD C++ Math library -// -// 
THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
-// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
-// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
-// PARTICULAR PURPOSE.
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#ifdef _MSC_VER
-#pragma once
-#endif
-
-#ifdef _M_ARM
-#error FMA3 not supported on ARM platform
-#endif
-
-#if defined(_MSC_VER) && (_MSC_VER < 1700)
-#error FMA3 intrinsics requires Visual C++ 2012 or later.
-#endif
-
-#pragma warning(push)
-#pragma warning(disable : 4987)
-#include <intrin.h>
-#pragma warning(pop)
-
-#include <immintrin.h>
-
-#include <DirectXMath.h>
-
-namespace DirectX
-{
-#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
-#define XM_CALLCONV __fastcall
-typedef const DirectX::XMVECTOR& HXMVECTOR;
-typedef const DirectX::XMMATRIX& FXMMATRIX;
-#endif
-
-namespace FMA3
-{
-
-inline bool XMVerifyFMA3Support()
-{
-    // Should return true for AMD "Piledriver" and Intel "Haswell" processors
-    // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012)
-
-    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
-    int CPUInfo[4] = {-1};
-    __cpuid( CPUInfo, 0 );
-
-    if ( CPUInfo[0] < 1 )
-        return false;
-
-    __cpuid(CPUInfo, 1 );
-
-    // We check for FMA3, AVX, OSXSAVE
-    return ( (CPUInfo[2] & 0x18001000) == 0x18001000 );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2,
-    FXMVECTOR V3
-)
-{
-    return _mm_fmadd_ps( V1, V2, V3 );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2,
-    FXMVECTOR V3
-)
-{
-    return _mm_fnmadd_ps( V1, V2, V3 );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector2
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2Transform
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
-    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVector2TransformCoord
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
-    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
-    XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3));
-    vResult = _mm_div_ps( vResult, W );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVector2TransformNormal
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_mul_ps( vResult, M.r[1] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
-    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
-    return vResult;
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector3
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3Transform -( - FXMVECTOR V, - CXMMATRIX M -) -{ - XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z - vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] ); - XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y - vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); - vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X - vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); - return vResult; -} - -inline XMVECTOR XM_CALLCONV XMVector3TransformCoord -( - FXMVECTOR V, - CXMMATRIX M -) -{ - XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z - vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] ); - XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y - vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); - vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X - vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); - XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3)); - vResult = _mm_div_ps( vResult, W ); - return vResult; -} - -inline XMVECTOR XM_CALLCONV XMVector3TransformNormal -( - FXMVECTOR V, - CXMMATRIX M -) -{ - XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z - vResult = _mm_mul_ps( vResult, M.r[2] ); - XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y - vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); - vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X - vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); - return vResult; -} - -XMMATRIX XM_CALLCONV XMMatrixMultiply(CXMMATRIX M1, CXMMATRIX M2); - -inline XMVECTOR XM_CALLCONV XMVector3Project -( - FXMVECTOR V, - float ViewportX, - float ViewportY, - float ViewportWidth, - float ViewportHeight, - float ViewportMinZ, - float ViewportMaxZ, - CXMMATRIX Projection, - CXMMATRIX View, - CXMMATRIX World -) -{ - const float HalfViewportWidth = ViewportWidth * 0.5f; - const float HalfViewportHeight = ViewportHeight * 0.5f; - - XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); - XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); - - XMMATRIX Transform = FMA3::XMMatrixMultiply(World, View); - Transform = FMA3::XMMatrixMultiply(Transform, Projection); - - XMVECTOR Result = FMA3::XMVector3TransformCoord(V, Transform); - - Result = FMA3::XMVectorMultiplyAdd(Result, Scale, Offset); - - return Result; -} - -inline XMVECTOR XM_CALLCONV XMVector3Unproject -( - FXMVECTOR V, - float ViewportX, - float ViewportY, - float ViewportWidth, - float ViewportHeight, - float ViewportMinZ, - float ViewportMaxZ, - CXMMATRIX Projection, - CXMMATRIX View, - CXMMATRIX World -) -{ - static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; - - XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); - Scale = XMVectorReciprocal(Scale); - - XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); - Offset = FMA3::XMVectorMultiplyAdd(Scale, Offset, D.v); - - XMMATRIX Transform = FMA3::XMMatrixMultiply(World, View); - Transform = FMA3::XMMatrixMultiply(Transform, Projection); - Transform = XMMatrixInverse(nullptr, Transform); - - XMVECTOR Result = FMA3::XMVectorMultiplyAdd(V, Scale, Offset); - - return FMA3::XMVector3TransformCoord(Result, Transform); -} - - -//------------------------------------------------------------------------------------- -// Vector4 -//------------------------------------------------------------------------------------- - -inline XMVECTOR XM_CALLCONV 
XMVector4Transform -( - FXMVECTOR V, - CXMMATRIX M -) -{ - XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(3,3,3,3)); // W - vResult = _mm_mul_ps( vResult, M.r[3] ); - XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z - vResult = _mm_fmadd_ps( vTemp, M.r[2], vResult ); - vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y - vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); - vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X - vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); - return vResult; -} - - -//------------------------------------------------------------------------------------- -// Matrix -//------------------------------------------------------------------------------------- - -inline XMMATRIX XM_CALLCONV XMMatrixMultiply -( - CXMMATRIX M1, - CXMMATRIX M2 -) -{ - XMMATRIX mResult; - // Use vW to hold the original row - XMVECTOR vW = M1.r[0]; - // Splat the component X,Y,Z then W - XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - // Perform the operation on the first row - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_fmadd_ps(vY,M2.r[1],vX); - vX = _mm_fmadd_ps(vZ,M2.r[2],vX); - vX = _mm_fmadd_ps(vW,M2.r[3],vX); - mResult.r[0] = vX; - // Repeat for the other 3 rows - vW = M1.r[1]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_fmadd_ps(vY,M2.r[1],vX); - vX = _mm_fmadd_ps(vZ,M2.r[2],vX); - vX = _mm_fmadd_ps(vW,M2.r[3],vX); - mResult.r[1] = vX; - vW = M1.r[2]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_fmadd_ps(vY,M2.r[1],vX); - vX = _mm_fmadd_ps(vZ,M2.r[2],vX); - vX = _mm_fmadd_ps(vW,M2.r[3],vX); - mResult.r[2] = vX; - vW = M1.r[3]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_fmadd_ps(vY,M2.r[1],vX); - vX = _mm_fmadd_ps(vZ,M2.r[2],vX); - vX = _mm_fmadd_ps(vW,M2.r[3],vX); - mResult.r[3] = vX; - return mResult; -} - -inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose -( - FXMMATRIX M1, - CXMMATRIX M2 -) -{ - // Use vW to hold the original row - XMVECTOR vW = M1.r[0]; - // Splat the component X,Y,Z then W - XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - // Perform the operation on the first row - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_fmadd_ps(vY,M2.r[1],vX); - vX = _mm_fmadd_ps(vZ,M2.r[2],vX); - vX = _mm_fmadd_ps(vW,M2.r[3],vX); - __m128 r0 = vX; - // Repeat for the other 3 rows - vW = M1.r[1]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_fmadd_ps(vY,M2.r[1],vX); - vX = _mm_fmadd_ps(vZ,M2.r[2],vX); - vX = _mm_fmadd_ps(vW,M2.r[3],vX); - __m128 r1 = vX; - vW = M1.r[2]; - vX = 
_mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_fmadd_ps(vY,M2.r[1],vX); - vX = _mm_fmadd_ps(vZ,M2.r[2],vX); - vX = _mm_fmadd_ps(vW,M2.r[3],vX); - __m128 r2 = vX; - vW = M1.r[3]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_fmadd_ps(vY,M2.r[1],vX); - vX = _mm_fmadd_ps(vZ,M2.r[2],vX); - vX = _mm_fmadd_ps(vW,M2.r[3],vX); - __m128 r3 = vX; - - // x.x,x.y,y.x,y.y - XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0)); - // x.z,x.w,y.z,y.w - XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2)); - // z.x,z.y,w.x,w.y - XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0)); - // z.z,z.w,w.z,w.w - XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2)); - - XMMATRIX mResult; - // x.x,y.x,z.x,w.x - mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); - // x.y,y.y,z.y,w.y - mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); - // x.z,y.z,z.z,w.z - mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); - // x.w,y.w,z.w,w.w - mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); - return mResult; -} - -}; // namespace FMA3 - -}; // namespace DirectX; +//------------------------------------------------------------------------------------- +// DirectXMathFMA3.h -- FMA3 extensions for SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma once +#endif + +#ifdef _M_ARM +#error FMA3 not supported on ARM platform +#endif + +#if defined(_MSC_VER) && (_MSC_VER < 1700) +#error FMA3 intrinsics requires Visual C++ 2012 or later. 
+#endif
+
+#pragma warning(push)
+#pragma warning(disable : 4987)
+#include <intrin.h>
+#pragma warning(pop)
+
+#include <immintrin.h>
+
+#include <DirectXMath.h>
+
+namespace DirectX
+{
+#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
+#define XM_CALLCONV __fastcall
+typedef const DirectX::XMVECTOR& HXMVECTOR;
+typedef const DirectX::XMMATRIX& FXMMATRIX;
+#endif
+
+namespace FMA3
+{
+
+inline bool XMVerifyFMA3Support()
+{
+    // Should return true for AMD "Piledriver" and Intel "Haswell" processors
+    // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012)
+
+    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
+    int CPUInfo[4] = {-1};
+    __cpuid( CPUInfo, 0 );
+
+    if ( CPUInfo[0] < 1 )
+        return false;
+
+    __cpuid(CPUInfo, 1 );
+
+    // We check for FMA3, AVX, OSXSAVE
+    return ( (CPUInfo[2] & 0x18001000) == 0x18001000 );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR V3
+)
+{
+    return _mm_fmadd_ps( V1, V2, V3 );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR V3
+)
+{
+    return _mm_fnmadd_ps( V1, V2, V3 );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector2
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2Transform
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
+    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2TransformCoord
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_fmadd_ps( vResult, M.r[1], M.r[3] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
+    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
+    XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3));
+    vResult = _mm_div_ps( vResult, W );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2TransformNormal
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_mul_ps( vResult, M.r[1] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
+    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
+    return vResult;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector3
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3Transform
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
+    vResult = _mm_fmadd_ps( vResult, M.r[2], M.r[3] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult );
+    vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
+    vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVector3TransformCoord
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
+    vResult = _mm_fmadd_ps( vResult, M.r[2],
M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3)); + vResult = _mm_div_ps( vResult, W ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector3TransformNormal +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_mul_ps( vResult, M.r[2] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +XMMATRIX XM_CALLCONV XMMatrixMultiply(CXMMATRIX M1, CXMMATRIX M2); + +inline XMVECTOR XM_CALLCONV XMVector3Project +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = FMA3::XMMatrixMultiply(World, View); + Transform = FMA3::XMMatrixMultiply(Transform, Projection); + + XMVECTOR Result = FMA3::XMVector3TransformCoord(V, Transform); + + Result = FMA3::XMVectorMultiplyAdd(Result, Scale, Offset); + + return Result; +} + +inline XMVECTOR XM_CALLCONV XMVector3Unproject +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = FMA3::XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = FMA3::XMMatrixMultiply(World, View); + Transform = FMA3::XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + XMVECTOR Result = FMA3::XMVectorMultiplyAdd(V, Scale, Offset); + + return FMA3::XMVector3TransformCoord(Result, Transform); +} + + +//------------------------------------------------------------------------------------- +// Vector4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector4Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(3,3,3,3)); // W + vResult = _mm_mul_ps( vResult, M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_fmadd_ps( vTemp, M.r[2], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_fmadd_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_fmadd_ps( vTemp, M.r[0], vResult ); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Matrix 
+//------------------------------------------------------------------------------------- + +inline XMMATRIX XM_CALLCONV XMMatrixMultiply +( + CXMMATRIX M1, + CXMMATRIX M2 +) +{ + XMMATRIX mResult; + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[0] = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[1] = vX; + vW = M1.r[2]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[2] = vX; + vW = M1.r[3]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + mResult.r[3] = vX; + return mResult; +} + +inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose +( + FXMMATRIX M1, + CXMMATRIX M2 +) +{ + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r0 = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r1 = vX; + vW = M1.r[2]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = _mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r2 = vX; + vW = M1.r[3]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_fmadd_ps(vY,M2.r[1],vX); + vX = 
_mm_fmadd_ps(vZ,M2.r[2],vX); + vX = _mm_fmadd_ps(vW,M2.r[3],vX); + __m128 r3 = vX; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2)); + + XMMATRIX mResult; + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); + return mResult; +} + +}; // namespace FMA3 + +}; // namespace DirectX; diff --git a/Extensions/DirectXMathFMA4.h b/Extensions/DirectXMathFMA4.h index 2e0cbc3..2a3e1d0 100644 --- a/Extensions/DirectXMathFMA4.h +++ b/Extensions/DirectXMathFMA4.h @@ -1,414 +1,414 @@ -//------------------------------------------------------------------------------------- -// DirectXMathFMA4.h -- FMA4 extensions for SIMD C++ Math library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#ifdef _MSC_VER -#pragma once -#endif - -#ifdef _M_ARM -#error FMA4 not supported on ARM platform -#endif - -#if defined(_MSC_VER) && (_MSC_VER < 1600) -#error FMA4 intrinsics requires Visual C++ 2010 Service Pack 1 or later. 
-#endif
-
-#pragma warning(push)
-#pragma warning(disable : 4987)
-#include <intrin.h>
-#pragma warning(pop)
-
-#include <ammintrin.h>
-
-#include <DirectXMath.h>
-
-namespace DirectX
-{
-#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
-#define XM_CALLCONV __fastcall
-typedef const DirectX::XMVECTOR& HXMVECTOR;
-typedef const DirectX::XMMATRIX& FXMMATRIX;
-#endif
-
-namespace FMA4
-{
-
-inline bool XMVerifyFMA4Support()
-{
-    // Should return true for AMD Bulldozer processors
-    // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012)
-
-    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
-    int CPUInfo[4] = {-1};
-    __cpuid( CPUInfo, 0 );
-
-    if ( CPUInfo[0] < 1 )
-        return false;
-
-    __cpuid(CPUInfo, 1 );
-
-    // We check for AVX, OSXSAVE (required to access FMA4)
-    if ( (CPUInfo[2] & 0x18000000) != 0x18000000 )
-        return false;
-
-    __cpuid( CPUInfo, 0x80000000 );
-
-    if ( CPUInfo[0] < 0x80000001 )
-        return false;
-
-    // We check for FMA4
-    return ( CPUInfo[2] & 0x10000 );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2,
-    FXMVECTOR V3
-)
-{
-    return _mm_macc_ps( V1, V2, V3 );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2,
-    FXMVECTOR V3
-)
-{
-    return _mm_nmacc_ps( V1, V2, V3 );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector2
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2Transform
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_macc_ps( vResult, M.r[1], M.r[3] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
-    vResult = _mm_macc_ps( vTemp, M.r[0], vResult );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVector2TransformCoord
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_macc_ps( vResult, M.r[1], M.r[3] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
-    vResult = _mm_macc_ps( vTemp, M.r[0], vResult );
-    XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3));
-    vResult = _mm_div_ps( vResult, W );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVector2TransformNormal
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_mul_ps( vResult, M.r[1] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
-    vResult = _mm_macc_ps( vTemp, M.r[0], vResult );
-    return vResult;
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector3
-//-------------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3Transform
-(
-    FXMVECTOR V,
-    CXMMATRIX M
-)
-{
-    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
-    vResult = _mm_macc_ps( vResult, M.r[2], M.r[3] );
-    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
-    vResult = _mm_macc_ps( vTemp, M.r[1], vResult );
-    vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
-    vResult = _mm_macc_ps( vTemp, M.r[0], vResult );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVector3TransformCoord
-(
-
FXMVECTOR V, - CXMMATRIX M -) -{ - XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z - vResult = _mm_macc_ps( vResult, M.r[2], M.r[3] ); - XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y - vResult = _mm_macc_ps( vTemp, M.r[1], vResult ); - vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X - vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); - XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3)); - vResult = _mm_div_ps( vResult, W ); - return vResult; -} - -inline XMVECTOR XM_CALLCONV XMVector3TransformNormal -( - FXMVECTOR V, - CXMMATRIX M -) -{ - XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z - vResult = _mm_mul_ps( vResult, M.r[2] ); - XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y - vResult = _mm_macc_ps( vTemp, M.r[1], vResult ); - vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X - vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); - return vResult; -} - -XMMATRIX XM_CALLCONV XMMatrixMultiply(CXMMATRIX M1, CXMMATRIX M2); - -inline XMVECTOR XM_CALLCONV XMVector3Project -( - FXMVECTOR V, - float ViewportX, - float ViewportY, - float ViewportWidth, - float ViewportHeight, - float ViewportMinZ, - float ViewportMaxZ, - CXMMATRIX Projection, - CXMMATRIX View, - CXMMATRIX World -) -{ - const float HalfViewportWidth = ViewportWidth * 0.5f; - const float HalfViewportHeight = ViewportHeight * 0.5f; - - XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); - XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); - - XMMATRIX Transform = FMA4::XMMatrixMultiply(World, View); - Transform = FMA4::XMMatrixMultiply(Transform, Projection); - - XMVECTOR Result = FMA4::XMVector3TransformCoord(V, Transform); - - Result = FMA4::XMVectorMultiplyAdd(Result, Scale, Offset); - - return Result; -} - -inline XMVECTOR XM_CALLCONV XMVector3Unproject -( - FXMVECTOR V, - float ViewportX, - float ViewportY, - float ViewportWidth, - float ViewportHeight, - float ViewportMinZ, - float ViewportMaxZ, - CXMMATRIX Projection, - CXMMATRIX View, - CXMMATRIX World -) -{ - static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; - - XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); - Scale = XMVectorReciprocal(Scale); - - XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); - Offset = FMA4::XMVectorMultiplyAdd(Scale, Offset, D.v); - - XMMATRIX Transform = FMA4::XMMatrixMultiply(World, View); - Transform = FMA4::XMMatrixMultiply(Transform, Projection); - Transform = XMMatrixInverse(nullptr, Transform); - - XMVECTOR Result = FMA4::XMVectorMultiplyAdd(V, Scale, Offset); - - return FMA4::XMVector3TransformCoord(Result, Transform); -} - - -//------------------------------------------------------------------------------------- -// Vector4 -//------------------------------------------------------------------------------------- - -inline XMVECTOR XM_CALLCONV XMVector4Transform -( - FXMVECTOR V, - CXMMATRIX M -) -{ - XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(3,3,3,3)); // W - vResult = _mm_mul_ps( vResult, M.r[3] ); - XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z - vResult = _mm_macc_ps( vTemp, M.r[2], vResult ); - vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y - vResult = _mm_macc_ps( vTemp, M.r[1], vResult ); - vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X - vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); - return vResult; -} - - 
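Editor's note on XMVector3Project and XMVector3Unproject above: they are inverses for points inside the frustum, which makes a cheap self-test. A hedged sketch; the viewport numbers and tolerance are placeholders, not taken from the diff.

// Sketch (editor's addition): verify that project followed by unproject
// recovers the original world-space point.
#include <DirectXMath.h>
#include "DirectXMathFMA4.h"   // assumed path of the file patched here

using namespace DirectX;

inline bool RoundTripsThroughViewport(FXMVECTOR world, CXMMATRIX proj, CXMMATRIX view)
{
    XMVECTOR screen = FMA4::XMVector3Project(world, 0.f, 0.f, 1280.f, 720.f, 0.f, 1.f,
                                             proj, view, XMMatrixIdentity());
    XMVECTOR back   = FMA4::XMVector3Unproject(screen, 0.f, 0.f, 1280.f, 720.f, 0.f, 1.f,
                                               proj, view, XMMatrixIdentity());
    // Allow a small tolerance: the perspective divide and XMMatrixInverse are not exact.
    return XMVector3NearEqual(world, back, XMVectorReplicate(1e-4f));
}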
-//------------------------------------------------------------------------------------- -// Matrix -//------------------------------------------------------------------------------------- - -inline XMMATRIX XM_CALLCONV XMMatrixMultiply -( - CXMMATRIX M1, - CXMMATRIX M2 -) -{ - XMMATRIX mResult; - // Use vW to hold the original row - XMVECTOR vW = M1.r[0]; - // Splat the component X,Y,Z then W - XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - // Perform the operation on the first row - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_macc_ps(vY,M2.r[1],vX); - vX = _mm_macc_ps(vZ,M2.r[2],vX); - vX = _mm_macc_ps(vW,M2.r[3],vX); - mResult.r[0] = vX; - // Repeat for the other 3 rows - vW = M1.r[1]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_macc_ps(vY,M2.r[1],vX); - vX = _mm_macc_ps(vZ,M2.r[2],vX); - vX = _mm_macc_ps(vW,M2.r[3],vX); - mResult.r[1] = vX; - vW = M1.r[2]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_macc_ps(vY,M2.r[1],vX); - vX = _mm_macc_ps(vZ,M2.r[2],vX); - vX = _mm_macc_ps(vW,M2.r[3],vX); - mResult.r[2] = vX; - vW = M1.r[3]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_macc_ps(vY,M2.r[1],vX); - vX = _mm_macc_ps(vZ,M2.r[2],vX); - vX = _mm_macc_ps(vW,M2.r[3],vX); - mResult.r[3] = vX; - return mResult; -} - -inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose -( - FXMMATRIX M1, - CXMMATRIX M2 -) -{ - // Use vW to hold the original row - XMVECTOR vW = M1.r[0]; - // Splat the component X,Y,Z then W - XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - // Perform the operation on the first row - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_macc_ps(vY,M2.r[1],vX); - vX = _mm_macc_ps(vZ,M2.r[2],vX); - vX = _mm_macc_ps(vW,M2.r[3],vX); - __m128 r0 = vX; - // Repeat for the other 3 rows - vW = M1.r[1]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_macc_ps(vY,M2.r[1],vX); - vX = _mm_macc_ps(vZ,M2.r[2],vX); - vX = _mm_macc_ps(vW,M2.r[3],vX); - __m128 r1 = vX; - vW = M1.r[2]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vX = _mm_macc_ps(vY,M2.r[1],vX); - vX = _mm_macc_ps(vZ,M2.r[2],vX); - vX = _mm_macc_ps(vW,M2.r[3],vX); - __m128 r2 = vX; - vW = M1.r[3]; - vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); - vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); - vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); - vX = 
_mm_mul_ps(vX,M2.r[0]); - vX = _mm_macc_ps(vY,M2.r[1],vX); - vX = _mm_macc_ps(vZ,M2.r[2],vX); - vX = _mm_macc_ps(vW,M2.r[3],vX); - __m128 r3 = vX; - - // x.x,x.y,y.x,y.y - XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0)); - // x.z,x.w,y.z,y.w - XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2)); - // z.x,z.y,w.x,w.y - XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0)); - // z.z,z.w,w.z,w.w - XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2)); - - XMMATRIX mResult; - // x.x,y.x,z.x,w.x - mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); - // x.y,y.y,z.y,w.y - mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); - // x.z,y.z,z.z,w.z - mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); - // x.w,y.w,z.w,w.w - mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); - return mResult; -} - -}; // namespace FMA4 - -}; // namespace DirectX; +//------------------------------------------------------------------------------------- +// DirectXMathFMA4.h -- FMA4 extensions for SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma once +#endif + +#ifdef _M_ARM +#error FMA4 not supported on ARM platform +#endif + +#if defined(_MSC_VER) && (_MSC_VER < 1600) +#error FMA4 intrinsics requires Visual C++ 2010 Service Pack 1 or later. 
+#endif
+
+#pragma warning(push)
+#pragma warning(disable : 4987)
+#include <intrin.h>
+#pragma warning(pop)
+
+#include <ammintrin.h>
+
+#include <DirectXMath.h>
+
+namespace DirectX
+{
+#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
+#define XM_CALLCONV __fastcall
+typedef const DirectX::XMVECTOR& HXMVECTOR;
+typedef const DirectX::XMMATRIX& FXMMATRIX;
+#endif
+
+namespace FMA4
+{
+
+inline bool XMVerifyFMA4Support()
+{
+    // Should return true for AMD Bulldozer processors
+    // with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012)
+
+    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
+    int CPUInfo[4] = {-1};
+    __cpuid( CPUInfo, 0 );
+
+    if ( CPUInfo[0] < 1 )
+        return false;
+
+    __cpuid(CPUInfo, 1 );
+
+    // We check for AVX, OSXSAVE (required to access FMA4)
+    if ( (CPUInfo[2] & 0x18000000) != 0x18000000 )
+        return false;
+
+    __cpuid( CPUInfo, 0x80000000 );
+
+    if ( CPUInfo[0] < 0x80000001 )
+        return false;
+
+    // We check for FMA4
+    return ( CPUInfo[2] & 0x10000 );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR V3
+)
+{
+    return _mm_macc_ps( V1, V2, V3 );
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR V3
+)
+{
+    return _mm_nmacc_ps( V1, V2, V3 );
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector2
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector2Transform
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_macc_ps( vResult, M.r[1], M.r[3] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
+    vResult = _mm_macc_ps( vTemp, M.r[0], vResult );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2TransformCoord
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_macc_ps( vResult, M.r[1], M.r[3] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
+    vResult = _mm_macc_ps( vTemp, M.r[0], vResult );
+    XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3));
+    vResult = _mm_div_ps( vResult, W );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2TransformNormal
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_mul_ps( vResult, M.r[1] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
+    vResult = _mm_macc_ps( vTemp, M.r[0], vResult );
+    return vResult;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Vector3
+//-------------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3Transform
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+    XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z
+    vResult = _mm_macc_ps( vResult, M.r[2], M.r[3] );
+    XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y
+    vResult = _mm_macc_ps( vTemp, M.r[1], vResult );
+    vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X
+    vResult = _mm_macc_ps( vTemp, M.r[0], vResult );
+    return vResult;
+}
+
+inline XMVECTOR XM_CALLCONV XMVector3TransformCoord
+(
+
FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_macc_ps( vResult, M.r[2], M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_macc_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); + XMVECTOR W = _mm_permute_ps(vResult,_MM_SHUFFLE(3,3,3,3)); + vResult = _mm_div_ps( vResult, W ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVector3TransformNormal +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_mul_ps( vResult, M.r[2] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_macc_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); + return vResult; +} + +XMMATRIX XM_CALLCONV XMMatrixMultiply(CXMMATRIX M1, CXMMATRIX M2); + +inline XMVECTOR XM_CALLCONV XMVector3Project +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = FMA4::XMMatrixMultiply(World, View); + Transform = FMA4::XMMatrixMultiply(Transform, Projection); + + XMVECTOR Result = FMA4::XMVector3TransformCoord(V, Transform); + + Result = FMA4::XMVectorMultiplyAdd(Result, Scale, Offset); + + return Result; +} + +inline XMVECTOR XM_CALLCONV XMVector3Unproject +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + CXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = FMA4::XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = FMA4::XMMatrixMultiply(World, View); + Transform = FMA4::XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + XMVECTOR Result = FMA4::XMVectorMultiplyAdd(V, Scale, Offset); + + return FMA4::XMVector3TransformCoord(Result, Transform); +} + + +//------------------------------------------------------------------------------------- +// Vector4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector4Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ + XMVECTOR vResult = _mm_permute_ps(V,_MM_SHUFFLE(3,3,3,3)); // W + vResult = _mm_mul_ps( vResult, M.r[3] ); + XMVECTOR vTemp = _mm_permute_ps(V,_MM_SHUFFLE(2,2,2,2)); // Z + vResult = _mm_macc_ps( vTemp, M.r[2], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(1,1,1,1)); // Y + vResult = _mm_macc_ps( vTemp, M.r[1], vResult ); + vTemp = _mm_permute_ps(V,_MM_SHUFFLE(0,0,0,0)); // X + vResult = _mm_macc_ps( vTemp, M.r[0], vResult ); + return vResult; +} + + 
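Editor's note ahead of the Matrix section below: XMMatrixMultiply splats each component of an M1 row and accumulates it against the rows of M2 with _mm_macc_ps. A scalar reference of what each step computes (editor's sketch, not part of the diff):

// Sketch: result.row[i] = sum over k of M1[i][k] * M2.row[k], which is exactly
// what the _mm_permute_ps + _mm_macc_ps sequence below evaluates four lanes at a time.
void MatrixMultiplyReference(const float M1[4][4], const float M2[4][4], float out[4][4])
{
    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j)
        {
            float sum = 0.0f;
            for (int k = 0; k < 4; ++k)
                sum += M1[i][k] * M2[k][j];   // splat of M1[i][k] against row k of M2
            out[i][j] = sum;
        }
}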
+//------------------------------------------------------------------------------------- +// Matrix +//------------------------------------------------------------------------------------- + +inline XMMATRIX XM_CALLCONV XMMatrixMultiply +( + CXMMATRIX M1, + CXMMATRIX M2 +) +{ + XMMATRIX mResult; + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + mResult.r[0] = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + mResult.r[1] = vX; + vW = M1.r[2]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + mResult.r[2] = vX; + vW = M1.r[3]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + mResult.r[3] = vX; + return mResult; +} + +inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose +( + FXMMATRIX M1, + CXMMATRIX M2 +) +{ + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + __m128 r0 = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + __m128 r1 = vX; + vW = M1.r[2]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vX = _mm_macc_ps(vY,M2.r[1],vX); + vX = _mm_macc_ps(vZ,M2.r[2],vX); + vX = _mm_macc_ps(vW,M2.r[3],vX); + __m128 r2 = vX; + vW = M1.r[3]; + vX = _mm_permute_ps(vW,_MM_SHUFFLE(0,0,0,0)); + vY = _mm_permute_ps(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = _mm_permute_ps(vW,_MM_SHUFFLE(2,2,2,2)); + vW = _mm_permute_ps(vW,_MM_SHUFFLE(3,3,3,3)); + vX = 
_mm_mul_ps(vX,M2.r[0]);
+    vX = _mm_macc_ps(vY,M2.r[1],vX);
+    vX = _mm_macc_ps(vZ,M2.r[2],vX);
+    vX = _mm_macc_ps(vW,M2.r[3],vX);
+    __m128 r3 = vX;
+
+    // x.x,x.y,y.x,y.y
+    XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0));
+    // x.z,x.w,y.z,y.w
+    XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2));
+    // z.x,z.y,w.x,w.y
+    XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0));
+    // z.z,z.w,w.z,w.w
+    XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2));
+
+    XMMATRIX mResult;
+    // x.x,y.x,z.x,w.x
+    mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0));
+    // x.y,y.y,z.y,w.y
+    mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1));
+    // x.z,y.z,z.z,w.z
+    mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0));
+    // x.w,y.w,z.w,w.w
+    mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1));
+    return mResult;
+}
+
+}; // namespace FMA4
+
+}; // namespace DirectX;
diff --git a/Extensions/DirectXMathSSE3.h b/Extensions/DirectXMathSSE3.h
index c61dde8..9d3911b 100644
--- a/Extensions/DirectXMathSSE3.h
+++ b/Extensions/DirectXMathSSE3.h
@@ -1,120 +1,120 @@
-//-------------------------------------------------------------------------------------
-// DirectXMathSSE3.h -- SSE3 extensions for SIMD C++ Math library
-//
-// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
-// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
-// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
-// PARTICULAR PURPOSE.
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#ifdef _MSC_VER
-#pragma once
-#endif
-
-#ifdef _M_ARM
-#error SSE3 not supported on ARM platform
-#endif
-
-#pragma warning(push)
-#pragma warning(disable : 4987)
-#include <intrin.h>
-#pragma warning(pop)
-
-#include <pmmintrin.h>
-
-#include <DirectXMath.h>
-
-namespace DirectX
-{
-#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
-#define XM_CALLCONV __fastcall
-typedef const DirectX::XMVECTOR& HXMVECTOR;
-typedef const DirectX::XMMATRIX& FXMMATRIX;
-#endif
-
-namespace SSE3
-{
-
-inline bool XMVerifySSE3Support()
-{
-    // Should return true on AMD Athlon 64, AMD Phenom, and Intel Pentium 4 or later processors
-
-    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
-    int CPUInfo[4] = {-1};
-    __cpuid( CPUInfo, 0 );
-
-    if ( CPUInfo[0] < 1 )
-        return false;
-
-    __cpuid(CPUInfo, 1 );
-
-    // We only check for SSE3 instruction set. SSSE3 instructions are not used.
-    return ( (CPUInfo[2] & 0x1) != 0 );
-}
-
-inline XMVECTOR XM_CALLCONV XMVector2Dot
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2
-)
-{
-    XMVECTOR vTemp = _mm_mul_ps(V1,V2);
-    vTemp = _mm_hadd_ps(vTemp,vTemp);
-    return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(0,0,0,0));
-}
-
-inline XMVECTOR XM_CALLCONV XMVector2LengthSq( FXMVECTOR V )
-{
-    return SSE3::XMVector2Dot(V, V);
-}
-
-inline XMVECTOR XM_CALLCONV XMVector3Dot
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2
-)
-{
-    XMVECTOR vTemp = _mm_mul_ps(V1,V2);
-    vTemp = _mm_and_ps( vTemp, g_XMMask3 );
-    vTemp = _mm_hadd_ps(vTemp,vTemp);
-    return _mm_hadd_ps(vTemp,vTemp);
-}
-
-inline XMVECTOR XM_CALLCONV XMVector3LengthSq( FXMVECTOR V )
-{
-    return SSE3::XMVector3Dot(V, V);
-}
-
-inline XMVECTOR XM_CALLCONV XMVector4Dot
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2
-)
-{
-    XMVECTOR vTemp = _mm_mul_ps(V1,V2);
-    vTemp = _mm_hadd_ps( vTemp, vTemp );
-    return _mm_hadd_ps( vTemp, vTemp );
-}
-
-inline XMVECTOR XM_CALLCONV XMVector4LengthSq( FXMVECTOR V )
-{
-    return SSE3::XMVector4Dot(V, V);
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle_0022( FXMVECTOR V )
-{
-    return _mm_moveldup_ps(V);
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle_1133( FXMVECTOR V )
-{
-    return _mm_movehdup_ps(V);
-}
-
-}; // namespace SSE3
-
+//-------------------------------------------------------------------------------------
+// DirectXMathSSE3.h -- SSE3 extensions for SIMD C++ Math library
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#ifdef _MSC_VER
+#pragma once
+#endif
+
+#ifdef _M_ARM
+#error SSE3 not supported on ARM platform
+#endif
+
+#pragma warning(push)
+#pragma warning(disable : 4987)
+#include <intrin.h>
+#pragma warning(pop)
+
+#include <pmmintrin.h>
+
+#include <DirectXMath.h>
+
+namespace DirectX
+{
+#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
+#define XM_CALLCONV __fastcall
+typedef const DirectX::XMVECTOR& HXMVECTOR;
+typedef const DirectX::XMMATRIX& FXMMATRIX;
+#endif
+
+namespace SSE3
+{
+
+inline bool XMVerifySSE3Support()
+{
+    // Should return true on AMD Athlon 64, AMD Phenom, and Intel Pentium 4 or later processors
+
+    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
+    int CPUInfo[4] = {-1};
+    __cpuid( CPUInfo, 0 );
+
+    if ( CPUInfo[0] < 1 )
+        return false;
+
+    __cpuid(CPUInfo, 1 );
+
+    // We only check for SSE3 instruction set. SSSE3 instructions are not used.
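+    // Editor's addition (comment only, not in the upstream file): CPUID function 1
+    // reports SSE3 in ECX bit 0, hence the 0x1 mask below; SSSE3 would be ECX bit 9,
+    // which these helpers deliberately do not require.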
+    return ( (CPUInfo[2] & 0x1) != 0 );
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2Dot
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+    XMVECTOR vTemp = _mm_mul_ps(V1,V2);
+    vTemp = _mm_hadd_ps(vTemp,vTemp);
+    return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(0,0,0,0));
+}
+
+inline XMVECTOR XM_CALLCONV XMVector2LengthSq( FXMVECTOR V )
+{
+    return SSE3::XMVector2Dot(V, V);
+}
+
+inline XMVECTOR XM_CALLCONV XMVector3Dot
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+    XMVECTOR vTemp = _mm_mul_ps(V1,V2);
+    vTemp = _mm_and_ps( vTemp, g_XMMask3 );
+    vTemp = _mm_hadd_ps(vTemp,vTemp);
+    return _mm_hadd_ps(vTemp,vTemp);
+}
+
+inline XMVECTOR XM_CALLCONV XMVector3LengthSq( FXMVECTOR V )
+{
+    return SSE3::XMVector3Dot(V, V);
+}
+
+inline XMVECTOR XM_CALLCONV XMVector4Dot
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+    XMVECTOR vTemp = _mm_mul_ps(V1,V2);
+    vTemp = _mm_hadd_ps( vTemp, vTemp );
+    return _mm_hadd_ps( vTemp, vTemp );
+}
+
+inline XMVECTOR XM_CALLCONV XMVector4LengthSq( FXMVECTOR V )
+{
+    return SSE3::XMVector4Dot(V, V);
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSwizzle_0022( FXMVECTOR V )
+{
+    return _mm_moveldup_ps(V);
+}
+
+inline XMVECTOR XM_CALLCONV XMVectorSwizzle_1133( FXMVECTOR V )
+{
+    return _mm_movehdup_ps(V);
+}
+
+}; // namespace SSE3
+
 }; // namespace DirectX;
\ No newline at end of file
diff --git a/Extensions/DirectXMathSSE4.h b/Extensions/DirectXMathSSE4.h
index 8495626..6f10dc1 100644
--- a/Extensions/DirectXMathSSE4.h
+++ b/Extensions/DirectXMathSSE4.h
@@ -1,422 +1,422 @@
-//-------------------------------------------------------------------------------------
-// DirectXMathSSE4.h -- SSE4.1 extensions for SIMD C++ Math library
-//
-// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
-// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
-// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
-// PARTICULAR PURPOSE.
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#ifdef _MSC_VER
-#pragma once
-#endif
-
-#ifdef _M_ARM
-#error SSE4 not supported on ARM platform
-#endif
-
-#pragma warning(push)
-#pragma warning(disable : 4987)
-#include <intrin.h>
-#pragma warning(pop)
-
-#include <smmintrin.h>
-
-#include <DirectXMath.h>
-
-namespace DirectX
-{
-#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
-#define XM_CALLCONV __fastcall
-typedef const DirectX::XMVECTOR& HXMVECTOR;
-typedef const DirectX::XMMATRIX& FXMMATRIX;
-#endif
-
-namespace SSE4
-{
-
-inline bool XMVerifySSE4Support()
-{
-    // Should return true on AMD Bulldozer, Intel Core 2 ("Penryn"), and Intel Core i7 ("Nehalem") or later processors
-
-    // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
-    int CPUInfo[4] = {-1};
-    __cpuid( CPUInfo, 0 );
-
-    if ( CPUInfo[0] < 1 )
-        return false;
-
-    __cpuid(CPUInfo, 1 );
-
-    // We only check for SSE4.1 instruction set. SSE4.2 instructions are not used.
-    return ( (CPUInfo[2] & 0x80000) == 0x80000 );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector
-//-------------------------------------------------------------------------------------
-
-inline void XM_CALLCONV XMVectorGetYPtr(_Out_ float *y, _In_ FXMVECTOR V)
-{
-    assert( y != nullptr );
-    *((int*)y) = _mm_extract_ps( V, 1 );
-}
-
-inline void XM_CALLCONV XMVectorGetZPtr(_Out_ float *z, _In_ FXMVECTOR V)
-{
-    assert( z != nullptr );
-    *((int*)z) = _mm_extract_ps( V, 2 );
-}
-
-inline void XM_CALLCONV XMVectorGetWPtr(_Out_ float *w, _In_ FXMVECTOR V)
-{
-    assert( w != nullptr );
-    *((int*)w) = _mm_extract_ps( V, 3 );
-}
-
-inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V)
-{
-    __m128i V1 = _mm_castps_si128( V );
-    return static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
-}
-
-inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V)
-{
-    __m128i V1 = _mm_castps_si128( V );
-    return static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
-}
-
-inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V)
-{
-    __m128i V1 = _mm_castps_si128( V );
-    return static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
-}
-
-inline void XM_CALLCONV XMVectorGetIntYPtr(_Out_ uint32_t *y, _In_ FXMVECTOR V)
-{
-    assert( y != nullptr );
-    __m128i V1 = _mm_castps_si128( V );
-    *y = static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
-}
-
-inline void XM_CALLCONV XMVectorGetIntZPtr(_Out_ uint32_t *z, _In_ FXMVECTOR V)
-{
-    assert( z != nullptr );
-    __m128i V1 = _mm_castps_si128( V );
-    *z = static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
-}
-
-inline void XM_CALLCONV XMVectorGetIntWPtr(_Out_ uint32_t *w, _In_ FXMVECTOR V)
-{
-    assert( w != nullptr );
-    __m128i V1 = _mm_castps_si128( V );
-    *w = static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y)
-{
-    XMVECTOR vResult = _mm_set_ss(y);
-    vResult = _mm_insert_ps( V, vResult, 0x10 );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z)
-{
-    XMVECTOR vResult = _mm_set_ss(z);
-    vResult = _mm_insert_ps( V, vResult, 0x20 );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w)
-{
-    XMVECTOR vResult = _mm_set_ss(w);
-    vResult = _mm_insert_ps( V, vResult, 0x30 );
-    return vResult;
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y)
-{
-    __m128i vResult = _mm_castps_si128( V );
-    vResult = _mm_insert_epi32( vResult, static_cast<int>(y), 1 );
-    return _mm_castsi128_ps( vResult );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z)
-{
-    __m128i vResult = _mm_castps_si128( V );
-    vResult = _mm_insert_epi32( vResult, static_cast<int>(z), 2 );
-    return _mm_castsi128_ps( vResult );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w)
-{
-    __m128i vResult = _mm_castps_si128( V );
-    vResult = _mm_insert_epi32( vResult, static_cast<int>(w), 3 );
-    return _mm_castsi128_ps( vResult );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorRound( FXMVECTOR V )
-{
-    return _mm_round_ps( V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorTruncate( FXMVECTOR V )
-{
-    return _mm_round_ps( V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorFloor( FXMVECTOR V )
-{
-    return _mm_floor_ps( V );
-}
-
-inline XMVECTOR XM_CALLCONV XMVectorCeiling( FXMVECTOR V )
-{
-    return _mm_ceil_ps( V );
-}
-
-
-//-------------------------------------------------------------------------------------
-// Vector2
-//------------------------------------------------------------------------------------- - -inline XMVECTOR XM_CALLCONV XMVector2Dot( FXMVECTOR V1, FXMVECTOR V2 ) -{ - return _mm_dp_ps( V1, V2, 0x3f ); -} - -inline XMVECTOR XM_CALLCONV XMVector2LengthSq( FXMVECTOR V ) -{ - return SSE4::XMVector2Dot(V, V); -} - -inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); - return _mm_rsqrt_ps( vTemp ); -} - -inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); - XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); - return _mm_div_ps( g_XMOne, vLengthSq ); -} - -inline XMVECTOR XM_CALLCONV XMVector2LengthEst( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); - return _mm_sqrt_ps( vTemp ); -} - -inline XMVECTOR XM_CALLCONV XMVector2Length( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); - return _mm_sqrt_ps( vTemp ); -} - -inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); - XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); - return _mm_mul_ps(vResult, V); -} - -inline XMVECTOR XM_CALLCONV XMVector2Normalize( FXMVECTOR V ) -{ - XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x3f ); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Reciprocal mul to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -} - - -//------------------------------------------------------------------------------------- -// Vector3 -//------------------------------------------------------------------------------------- - -inline XMVECTOR XM_CALLCONV XMVector3Dot( FXMVECTOR V1, FXMVECTOR V2 ) -{ - return _mm_dp_ps( V1, V2, 0x7f ); -} - -inline XMVECTOR XM_CALLCONV XMVector3LengthSq( FXMVECTOR V ) -{ - return SSE4::XMVector3Dot(V, V); -} - -inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); - return _mm_rsqrt_ps( vTemp ); -} - -inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); - XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); - return _mm_div_ps( g_XMOne, vLengthSq ); -} - -inline XMVECTOR XM_CALLCONV XMVector3LengthEst( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); - return _mm_sqrt_ps( vTemp ); -} - -inline XMVECTOR XM_CALLCONV XMVector3Length( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); - return _mm_sqrt_ps( vTemp ); -} - -inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); - XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); - return _mm_mul_ps(vResult, V); -} - -inline XMVECTOR XM_CALLCONV XMVector3Normalize( FXMVECTOR V ) -{ - XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x7f ); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create 
zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Divide to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -} - - -//------------------------------------------------------------------------------------- -// Vector4 -//------------------------------------------------------------------------------------- - -inline XMVECTOR XM_CALLCONV XMVector4Dot( FXMVECTOR V1, FXMVECTOR V2 ) -{ - return _mm_dp_ps( V1, V2, 0xff ); -} - -inline XMVECTOR XM_CALLCONV XMVector4LengthSq( FXMVECTOR V ) -{ - return SSE4::XMVector4Dot(V, V); -} - -inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); - return _mm_rsqrt_ps( vTemp ); -} - -inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); - XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); - return _mm_div_ps( g_XMOne, vLengthSq ); -} - -inline XMVECTOR XM_CALLCONV XMVector4LengthEst( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); - return _mm_sqrt_ps( vTemp ); -} - -inline XMVECTOR XM_CALLCONV XMVector4Length( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); - return _mm_sqrt_ps( vTemp ); -} - -inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst( FXMVECTOR V ) -{ - XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); - XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); - return _mm_mul_ps(vResult, V); -} - -inline XMVECTOR XM_CALLCONV XMVector4Normalize( FXMVECTOR V ) -{ - XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0xff ); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Divide to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -} - - -//------------------------------------------------------------------------------------- -// Plane -//------------------------------------------------------------------------------------- - -inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst( FXMVECTOR P ) -{ - XMVECTOR vTemp = _mm_dp_ps( P, P, 0x7f ); - XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); - return _mm_mul_ps(vResult, P); -} - -inline XMVECTOR XM_CALLCONV XMPlaneNormalize( FXMVECTOR P ) -{ - XMVECTOR vLengthSq = _mm_dp_ps( P, P, 0x7f ); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, 
set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Reciprocal mul to perform the normalization - vResult = _mm_div_ps(P,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vLengthSq); - return vResult; -} - -}; // namespace SSE4 - +//------------------------------------------------------------------------------------- +// DirectXMathSSE4.h -- SSE4.1 extensions for SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma once +#endif + +#ifdef _M_ARM +#error SSE4 not supported on ARM platform +#endif + +#pragma warning(push) +#pragma warning(disable : 4987) +#include <intrin.h> +#pragma warning(pop) + +#include <smmintrin.h> + +#include <DirectXMath.h> + +namespace DirectX +{ +#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV) +#define XM_CALLCONV __fastcall +typedef const DirectX::XMVECTOR& HXMVECTOR; +typedef const DirectX::XMMATRIX& FXMMATRIX; +#endif + +namespace SSE4 +{ + +inline bool XMVerifySSE4Support() +{ + // Should return true on AMD Bulldozer, Intel Core 2 ("Penryn"), and Intel Core i7 ("Nehalem") or later processors + + // See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx + int CPUInfo[4] = {-1}; + __cpuid( CPUInfo, 0 ); + + if ( CPUInfo[0] < 1 ) + return false; + + __cpuid(CPUInfo, 1 ); + + // We only check for the SSE4.1 instruction set; SSE4.2 instructions are not used. + return ( (CPUInfo[2] & 0x80000) == 0x80000 ); +} + + +//------------------------------------------------------------------------------------- +// Vector +//------------------------------------------------------------------------------------- + +inline void XM_CALLCONV XMVectorGetYPtr(_Out_ float *y, _In_ FXMVECTOR V) +{ + assert( y != nullptr ); + *((int*)y) = _mm_extract_ps( V, 1 ); +} + +inline void XM_CALLCONV XMVectorGetZPtr(_Out_ float *z, _In_ FXMVECTOR V) +{ + assert( z != nullptr ); + *((int*)z) = _mm_extract_ps( V, 2 ); +} + +inline void XM_CALLCONV XMVectorGetWPtr(_Out_ float *w, _In_ FXMVECTOR V) +{ + assert( w != nullptr ); + *((int*)w) = _mm_extract_ps( V, 3 ); +} + +inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V) +{ + __m128i V1 = _mm_castps_si128( V ); + return static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) ); +} + +inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V) +{ + __m128i V1 = _mm_castps_si128( V ); + return static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) ); +} + +inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V) +{ + __m128i V1 = _mm_castps_si128( V ); + return static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) ); +} + +inline void XM_CALLCONV XMVectorGetIntYPtr(_Out_ uint32_t *y, _In_ FXMVECTOR V) +{ + assert( y != nullptr ); + __m128i V1 = _mm_castps_si128( V ); + *y = static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) ); +} + +inline void XM_CALLCONV XMVectorGetIntZPtr(_Out_ uint32_t *z, _In_ FXMVECTOR V) +{ + assert( z != nullptr ); + __m128i V1 = _mm_castps_si128( V ); + *z = static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) ); +} + +inline void XM_CALLCONV XMVectorGetIntWPtr(_Out_ uint32_t *w, _In_ FXMVECTOR V) +{ + assert( w != nullptr ); + __m128i V1 = _mm_castps_si128( V ); + *w = static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) ); +} + +inline
XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y) +{ + XMVECTOR vResult = _mm_set_ss(y); + vResult = _mm_insert_ps( V, vResult, 0x10 ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z) +{ + XMVECTOR vResult = _mm_set_ss(z); + vResult = _mm_insert_ps( V, vResult, 0x20 ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w) +{ + XMVECTOR vResult = _mm_set_ss(w); + vResult = _mm_insert_ps( V, vResult, 0x30 ); + return vResult; +} + +inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y) +{ + __m128i vResult = _mm_castps_si128( V ); + vResult = _mm_insert_epi32( vResult, static_cast<int>(y), 1 ); + return _mm_castsi128_ps( vResult ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z) +{ + __m128i vResult = _mm_castps_si128( V ); + vResult = _mm_insert_epi32( vResult, static_cast<int>(z), 2 ); + return _mm_castsi128_ps( vResult ); +} + +inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w) +{ + __m128i vResult = _mm_castps_si128( V ); + vResult = _mm_insert_epi32( vResult, static_cast<int>(w), 3 ); + return _mm_castsi128_ps( vResult ); +} + +inline XMVECTOR XM_CALLCONV XMVectorRound( FXMVECTOR V ) +{ + return _mm_round_ps( V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ); +} + +inline XMVECTOR XM_CALLCONV XMVectorTruncate( FXMVECTOR V ) +{ + return _mm_round_ps( V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ); +} + +inline XMVECTOR XM_CALLCONV XMVectorFloor( FXMVECTOR V ) +{ + return _mm_floor_ps( V ); +} + +inline XMVECTOR XM_CALLCONV XMVectorCeiling( FXMVECTOR V ) +{ + return _mm_ceil_ps( V ); +} + + +//------------------------------------------------------------------------------------- +// Vector2 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector2Dot( FXMVECTOR V1, FXMVECTOR V2 ) +{ + return _mm_dp_ps( V1, V2, 0x3f ); +} + +inline XMVECTOR XM_CALLCONV XMVector2LengthSq( FXMVECTOR V ) +{ + return SSE4::XMVector2Dot(V, V); +} + +inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + return _mm_rsqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); + return _mm_div_ps( g_XMOne, vLengthSq ); +} + +inline XMVECTOR XM_CALLCONV XMVector2LengthEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + return _mm_sqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector2Length( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + return _mm_sqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, V); +} + +inline XMVECTOR XM_CALLCONV XMVector2Normalize( FXMVECTOR V ) +{ + XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x3f ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity,
set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Vector3 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector3Dot( FXMVECTOR V1, FXMVECTOR V2 ) +{ + return _mm_dp_ps( V1, V2, 0x7f ); +} + +inline XMVECTOR XM_CALLCONV XMVector3LengthSq( FXMVECTOR V ) +{ + return SSE4::XMVector3Dot(V, V); +} + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + return _mm_rsqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); + return _mm_div_ps( g_XMOne, vLengthSq ); +} + +inline XMVECTOR XM_CALLCONV XMVector3LengthEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + return _mm_sqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector3Length( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + return _mm_sqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, V); +} + +inline XMVECTOR XM_CALLCONV XMVector3Normalize( FXMVECTOR V ) +{ + XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x7f ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Vector4 +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMVector4Dot( FXMVECTOR V1, FXMVECTOR V2 ) +{ + return _mm_dp_ps( V1, V2, 0xff ); +} + +inline XMVECTOR XM_CALLCONV XMVector4LengthSq( FXMVECTOR V ) +{ + return SSE4::XMVector4Dot(V, V); +} + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + return _mm_rsqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); + return _mm_div_ps( g_XMOne, vLengthSq ); +} + +inline XMVECTOR XM_CALLCONV XMVector4LengthEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + return _mm_sqrt_ps( vTemp ); +} + +inline XMVECTOR XM_CALLCONV XMVector4Length( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + return _mm_sqrt_ps( vTemp ); +} + +inline XMVECTOR 
XM_CALLCONV XMVector4NormalizeEst( FXMVECTOR V ) +{ + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, V); +} + +inline XMVECTOR XM_CALLCONV XMVector4Normalize( FXMVECTOR V ) +{ + XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0xff ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +} + + +//------------------------------------------------------------------------------------- +// Plane +//------------------------------------------------------------------------------------- + +inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst( FXMVECTOR P ) +{ + XMVECTOR vTemp = _mm_dp_ps( P, P, 0x7f ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, P); +} + +inline XMVECTOR XM_CALLCONV XMPlaneNormalize( FXMVECTOR P ) +{ + XMVECTOR vLengthSq = _mm_dp_ps( P, P, 0x7f ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(P,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vLengthSq); + return vResult; +} + +}; // namespace SSE4 + }; // namespace DirectX; \ No newline at end of file diff --git a/Inc/DirectXCollision.h b/Inc/DirectXCollision.h index a6e341c..8b51516 100644 --- a/Inc/DirectXCollision.h +++ b/Inc/DirectXCollision.h @@ -1,341 +1,341 @@ -//------------------------------------------------------------------------------------- -// DirectXCollision.h -- C++ Collision Math library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved.
-// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#pragma once - -#include "DirectXMath.h" - -namespace DirectX -{ - -enum ContainmentType -{ - DISJOINT = 0, - INTERSECTS = 1, - CONTAINS = 2, -}; - -enum PlaneIntersectionType -{ - FRONT = 0, - INTERSECTING = 1, - BACK = 2, -}; - -struct BoundingBox; -struct BoundingOrientedBox; -struct BoundingFrustum; - -#pragma warning(push) -#pragma warning(disable:4324 4820) -// C4324: alignment padding warnings -// C4820: Off by default noise - -//------------------------------------------------------------------------------------- -// Bounding sphere -//------------------------------------------------------------------------------------- -struct BoundingSphere -{ - XMFLOAT3 Center; // Center of the sphere. - float Radius; // Radius of the sphere. - - // Creators - BoundingSphere() : Center(0,0,0), Radius( 1.f ) {} - XM_CONSTEXPR BoundingSphere( _In_ const XMFLOAT3& center, _In_ float radius ) - : Center(center), Radius(radius) {} - BoundingSphere( _In_ const BoundingSphere& sp ) - : Center(sp.Center), Radius(sp.Radius) {} - - // Methods - BoundingSphere& operator=( _In_ const BoundingSphere& sp ) { Center = sp.Center; Radius = sp.Radius; return *this; } - - void XM_CALLCONV Transform( _Out_ BoundingSphere& Out, _In_ FXMMATRIX M ) const; - void XM_CALLCONV Transform( _Out_ BoundingSphere& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; - // Transform the sphere - - ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR Point ) const; - ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; - ContainmentType Contains( _In_ const BoundingSphere& sh ) const; - ContainmentType Contains( _In_ const BoundingBox& box ) const; - ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; - ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; - - bool Intersects( _In_ const BoundingSphere& sh ) const; - bool Intersects( _In_ const BoundingBox& box ) const; - bool Intersects( _In_ const BoundingOrientedBox& box ) const; - bool Intersects( _In_ const BoundingFrustum& fr ) const; - - bool XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; - // Triangle-sphere test - - PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR Plane ) const; - // Plane-sphere test - - bool XM_CALLCONV Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; - // Ray-sphere test - - ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, - _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5 ) const; - // Test sphere against six planes (see BoundingFrustum::GetPlanes) - - // Static methods - static void CreateMerged( _Out_ BoundingSphere& Out, _In_ const BoundingSphere& S1, _In_ const BoundingSphere& S2 ); - - static void CreateFromBoundingBox( _Out_ BoundingSphere& Out, _In_ const BoundingBox& box ); - static void CreateFromBoundingBox( _Out_ BoundingSphere& Out, _In_ const BoundingOrientedBox& box ); - - static void CreateFromPoints( _Out_ BoundingSphere& Out, _In_ size_t Count, - _In_reads_bytes_(sizeof(XMFLOAT3)+Stride*(Count-1)) const XMFLOAT3* pPoints, _In_ size_t Stride ); - - static void CreateFromFrustum( _Out_ BoundingSphere& Out, _In_ const BoundingFrustum& fr ); -}; - 
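// Illustrative usage sketch (not part of the header): exercising the BoundingSphere
// API declared above. The variable names and values here are arbitrary; the sketch
// assumes only DirectXMath and the declarations in this file.
inline void ExampleSphereQueries()
{
    using namespace DirectX;

    BoundingSphere sphere( XMFLOAT3( 0.f, 0.f, 0.f ), 2.f );

    // Point containment: a point closer than Radius to Center reports CONTAINS.
    ContainmentType ct = sphere.Contains( XMVectorSet( 1.f, 0.f, 0.f, 0.f ) );

    // Ray-sphere test: Direction must be unit length; Dist receives the hit distance.
    float dist = 0.f;
    XMVECTOR rayOrigin = XMVectorSet( 0.f, 0.f, -10.f, 0.f );
    XMVECTOR rayDir = XMVector3Normalize( XMVectorSet( 0.f, 0.f, 1.f, 0.f ) );
    bool hit = sphere.Intersects( rayOrigin, rayDir, dist ); // expect a hit at dist == 8

    (void)ct; (void)hit; // results would be consumed by the caller in real code
}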
-//------------------------------------------------------------------------------------- -// Axis-aligned bounding box -//------------------------------------------------------------------------------------- -struct BoundingBox -{ - static const size_t CORNER_COUNT = 8; - - XMFLOAT3 Center; // Center of the box. - XMFLOAT3 Extents; // Distance from the center to each side. - - // Creators - BoundingBox() : Center(0,0,0), Extents( 1.f, 1.f, 1.f ) {} - XM_CONSTEXPR BoundingBox( _In_ const XMFLOAT3& center, _In_ const XMFLOAT3& extents ) - : Center(center), Extents(extents) {} - BoundingBox( _In_ const BoundingBox& box ) : Center(box.Center), Extents(box.Extents) {} - - // Methods - BoundingBox& operator=( _In_ const BoundingBox& box) { Center = box.Center; Extents = box.Extents; return *this; } - - void XM_CALLCONV Transform( _Out_ BoundingBox& Out, _In_ FXMMATRIX M ) const; - void XM_CALLCONV Transform( _Out_ BoundingBox& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; - - void GetCorners( _Out_writes_(8) XMFLOAT3* Corners ) const; - // Gets the 8 corners of the box - - ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR Point ) const; - ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; - ContainmentType Contains( _In_ const BoundingSphere& sh ) const; - ContainmentType Contains( _In_ const BoundingBox& box ) const; - ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; - ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; - - bool Intersects( _In_ const BoundingSphere& sh ) const; - bool Intersects( _In_ const BoundingBox& box ) const; - bool Intersects( _In_ const BoundingOrientedBox& box ) const; - bool Intersects( _In_ const BoundingFrustum& fr ) const; - - bool XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; - // Triangle-Box test - - PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR Plane ) const; - // Plane-box test - - bool XM_CALLCONV Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; - // Ray-Box test - - ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, - _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5 ) const; - // Test box against six planes (see BoundingFrustum::GetPlanes) - - // Static methods - static void CreateMerged( _Out_ BoundingBox& Out, _In_ const BoundingBox& b1, _In_ const BoundingBox& b2 ); - - static void CreateFromSphere( _Out_ BoundingBox& Out, _In_ const BoundingSphere& sh ); - - static void XM_CALLCONV CreateFromPoints( _Out_ BoundingBox& Out, _In_ FXMVECTOR pt1, _In_ FXMVECTOR pt2 ); - static void CreateFromPoints( _Out_ BoundingBox& Out, _In_ size_t Count, - _In_reads_bytes_(sizeof(XMFLOAT3)+Stride*(Count-1)) const XMFLOAT3* pPoints, _In_ size_t Stride ); -}; - -//------------------------------------------------------------------------------------- -// Oriented bounding box -//------------------------------------------------------------------------------------- -struct BoundingOrientedBox -{ - static const size_t CORNER_COUNT = 8; - - XMFLOAT3 Center; // Center of the box. - XMFLOAT3 Extents; // Distance from the center to each side. - XMFLOAT4 Orientation; // Unit quaternion representing rotation (box -> world). 
- - // Creators - BoundingOrientedBox() : Center(0,0,0), Extents( 1.f, 1.f, 1.f ), Orientation(0,0,0, 1.f ) {} - XM_CONSTEXPR BoundingOrientedBox( _In_ const XMFLOAT3& _Center, _In_ const XMFLOAT3& _Extents, _In_ const XMFLOAT4& _Orientation ) - : Center(_Center), Extents(_Extents), Orientation(_Orientation) {} - BoundingOrientedBox( _In_ const BoundingOrientedBox& box ) - : Center(box.Center), Extents(box.Extents), Orientation(box.Orientation) {} - - // Methods - BoundingOrientedBox& operator=( _In_ const BoundingOrientedBox& box ) { Center = box.Center; Extents = box.Extents; Orientation = box.Orientation; return *this; } - - void XM_CALLCONV Transform( _Out_ BoundingOrientedBox& Out, _In_ FXMMATRIX M ) const; - void XM_CALLCONV Transform( _Out_ BoundingOrientedBox& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; - - void GetCorners( _Out_writes_(8) XMFLOAT3* Corners ) const; - // Gets the 8 corners of the box - - ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR Point ) const; - ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; - ContainmentType Contains( _In_ const BoundingSphere& sh ) const; - ContainmentType Contains( _In_ const BoundingBox& box ) const; - ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; - ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; - - bool Intersects( _In_ const BoundingSphere& sh ) const; - bool Intersects( _In_ const BoundingBox& box ) const; - bool Intersects( _In_ const BoundingOrientedBox& box ) const; - bool Intersects( _In_ const BoundingFrustum& fr ) const; - - bool XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; - // Triangle-OrientedBox test - - PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR Plane ) const; - // Plane-OrientedBox test - - bool XM_CALLCONV Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; - // Ray-OrientedBox test - - ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, - _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5 ) const; - // Test OrientedBox against six planes (see BoundingFrustum::GetPlanes) - - // Static methods - static void CreateFromBoundingBox( _Out_ BoundingOrientedBox& Out, _In_ const BoundingBox& box ); - - static void CreateFromPoints( _Out_ BoundingOrientedBox& Out, _In_ size_t Count, - _In_reads_bytes_(sizeof(XMFLOAT3)+Stride*(Count-1)) const XMFLOAT3* pPoints, _In_ size_t Stride ); -}; - -//------------------------------------------------------------------------------------- -// Bounding frustum -//------------------------------------------------------------------------------------- -struct BoundingFrustum -{ - static const size_t CORNER_COUNT = 8; - - XMFLOAT3 Origin; // Origin of the frustum (and projection). - XMFLOAT4 Orientation; // Quaternion representing rotation. - - float RightSlope; // Positive X slope (X/Z). - float LeftSlope; // Negative X slope. - float TopSlope; // Positive Y slope (Y/Z). - float BottomSlope; // Negative Y slope. - float Near, Far; // Z of the near plane and far plane. 
- - // Creators - BoundingFrustum() : Origin(0,0,0), Orientation(0,0,0, 1.f), RightSlope( 1.f ), LeftSlope( -1.f ), - TopSlope( 1.f ), BottomSlope( -1.f ), Near(0), Far( 1.f ) {} - XM_CONSTEXPR BoundingFrustum( _In_ const XMFLOAT3& _Origin, _In_ const XMFLOAT4& _Orientation, - _In_ float _RightSlope, _In_ float _LeftSlope, _In_ float _TopSlope, _In_ float _BottomSlope, - _In_ float _Near, _In_ float _Far ) - : Origin(_Origin), Orientation(_Orientation), - RightSlope(_RightSlope), LeftSlope(_LeftSlope), TopSlope(_TopSlope), BottomSlope(_BottomSlope), - Near(_Near), Far(_Far) {} - BoundingFrustum( _In_ const BoundingFrustum& fr ) - : Origin(fr.Origin), Orientation(fr.Orientation), RightSlope(fr.RightSlope), LeftSlope(fr.LeftSlope), - TopSlope(fr.TopSlope), BottomSlope(fr.BottomSlope), Near(fr.Near), Far(fr.Far) {} - BoundingFrustum( _In_ CXMMATRIX Projection ) { CreateFromMatrix( *this, Projection ); } - - // Methods - BoundingFrustum& operator=( _In_ const BoundingFrustum& fr ) { Origin=fr.Origin; Orientation=fr.Orientation; - RightSlope=fr.RightSlope; LeftSlope=fr.LeftSlope; - TopSlope=fr.TopSlope; BottomSlope=fr.BottomSlope; - Near=fr.Near; Far=fr.Far; return *this; } - - void XM_CALLCONV Transform( _Out_ BoundingFrustum& Out, _In_ FXMMATRIX M ) const; - void XM_CALLCONV Transform( _Out_ BoundingFrustum& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; - - void GetCorners( _Out_writes_(8) XMFLOAT3* Corners ) const; - // Gets the 8 corners of the frustum - - ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR Point ) const; - ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; - ContainmentType Contains( _In_ const BoundingSphere& sp ) const; - ContainmentType Contains( _In_ const BoundingBox& box ) const; - ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; - ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; - // Frustum-Frustum test - - bool Intersects( _In_ const BoundingSphere& sh ) const; - bool Intersects( _In_ const BoundingBox& box ) const; - bool Intersects( _In_ const BoundingOrientedBox& box ) const; - bool Intersects( _In_ const BoundingFrustum& fr ) const; - - bool XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; - // Triangle-Frustum test - - PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR Plane ) const; - // Plane-Frustum test - - bool XM_CALLCONV Intersects( _In_ FXMVECTOR rayOrigin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; - // Ray-Frustum test - - ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, - _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5 ) const; - // Test frustum against six planes (see BoundingFrustum::GetPlanes) - - void GetPlanes( _Out_opt_ XMVECTOR* NearPlane, _Out_opt_ XMVECTOR* FarPlane, _Out_opt_ XMVECTOR* RightPlane, - _Out_opt_ XMVECTOR* LeftPlane, _Out_opt_ XMVECTOR* TopPlane, _Out_opt_ XMVECTOR* BottomPlane ) const; - // Create 6 Planes representation of Frustum - - // Static methods - static void XM_CALLCONV CreateFromMatrix( _Out_ BoundingFrustum& Out, _In_ FXMMATRIX Projection ); -}; - -//----------------------------------------------------------------------------- -// Triangle intersection testing routines. 
-//----------------------------------------------------------------------------- -namespace TriangleTests -{ - bool XM_CALLCONV Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _In_ FXMVECTOR V0, _In_ GXMVECTOR V1, _In_ HXMVECTOR V2, _Out_ float& Dist ); - // Ray-Triangle - - bool XM_CALLCONV Intersects( _In_ FXMVECTOR A0, _In_ FXMVECTOR A1, _In_ FXMVECTOR A2, _In_ GXMVECTOR B0, _In_ HXMVECTOR B1, _In_ HXMVECTOR B2 ); - // Triangle-Triangle - - PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2, _In_ GXMVECTOR Plane ); - // Plane-Triangle - - ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2, - _In_ GXMVECTOR Plane0, _In_ HXMVECTOR Plane1, _In_ HXMVECTOR Plane2, - _In_ CXMVECTOR Plane3, _In_ CXMVECTOR Plane4, _In_ CXMVECTOR Plane5 ); - // Test a triangle against six planes at once (see BoundingFrustum::GetPlanes) -}; - -#pragma warning(pop) - -/**************************************************************************** - * - * Implementation - * - ****************************************************************************/ - -#pragma warning(push) -#pragma warning(disable : 4068 4365 4616 6001) -// C4068/4616: ignore unknown pragmas -// C4365: Off by default noise -// C6001: False positives - -#pragma prefast(push) -#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") - -#include "DirectXCollision.inl" - -#pragma prefast(pop) -#pragma warning(pop) - -}; // namespace DirectX - +//------------------------------------------------------------------------------------- +// DirectXCollision.h -- C++ Collision Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#include "DirectXMath.h" + +namespace DirectX +{ + +enum ContainmentType +{ + DISJOINT = 0, + INTERSECTS = 1, + CONTAINS = 2, +}; + +enum PlaneIntersectionType +{ + FRONT = 0, + INTERSECTING = 1, + BACK = 2, +}; + +struct BoundingBox; +struct BoundingOrientedBox; +struct BoundingFrustum; + +#pragma warning(push) +#pragma warning(disable:4324 4820) +// C4324: alignment padding warnings +// C4820: Off by default noise + +//------------------------------------------------------------------------------------- +// Bounding sphere +//------------------------------------------------------------------------------------- +struct BoundingSphere +{ + XMFLOAT3 Center; // Center of the sphere. + float Radius; // Radius of the sphere. 
+ + // Creators + BoundingSphere() : Center(0,0,0), Radius( 1.f ) {} + XM_CONSTEXPR BoundingSphere( _In_ const XMFLOAT3& center, _In_ float radius ) + : Center(center), Radius(radius) {} + BoundingSphere( _In_ const BoundingSphere& sp ) + : Center(sp.Center), Radius(sp.Radius) {} + + // Methods + BoundingSphere& operator=( _In_ const BoundingSphere& sp ) { Center = sp.Center; Radius = sp.Radius; return *this; } + + void XM_CALLCONV Transform( _Out_ BoundingSphere& Out, _In_ FXMMATRIX M ) const; + void XM_CALLCONV Transform( _Out_ BoundingSphere& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; + // Transform the sphere + + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR Point ) const; + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + ContainmentType Contains( _In_ const BoundingSphere& sh ) const; + ContainmentType Contains( _In_ const BoundingBox& box ) const; + ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; + ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; + + bool Intersects( _In_ const BoundingSphere& sh ) const; + bool Intersects( _In_ const BoundingBox& box ) const; + bool Intersects( _In_ const BoundingOrientedBox& box ) const; + bool Intersects( _In_ const BoundingFrustum& fr ) const; + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + // Triangle-sphere test + + PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR Plane ) const; + // Plane-sphere test + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; + // Ray-sphere test + + ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5 ) const; + // Test sphere against six planes (see BoundingFrustum::GetPlanes) + + // Static methods + static void CreateMerged( _Out_ BoundingSphere& Out, _In_ const BoundingSphere& S1, _In_ const BoundingSphere& S2 ); + + static void CreateFromBoundingBox( _Out_ BoundingSphere& Out, _In_ const BoundingBox& box ); + static void CreateFromBoundingBox( _Out_ BoundingSphere& Out, _In_ const BoundingOrientedBox& box ); + + static void CreateFromPoints( _Out_ BoundingSphere& Out, _In_ size_t Count, + _In_reads_bytes_(sizeof(XMFLOAT3)+Stride*(Count-1)) const XMFLOAT3* pPoints, _In_ size_t Stride ); + + static void CreateFromFrustum( _Out_ BoundingSphere& Out, _In_ const BoundingFrustum& fr ); +}; + +//------------------------------------------------------------------------------------- +// Axis-aligned bounding box +//------------------------------------------------------------------------------------- +struct BoundingBox +{ + static const size_t CORNER_COUNT = 8; + + XMFLOAT3 Center; // Center of the box. + XMFLOAT3 Extents; // Distance from the center to each side. 
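+ // Note: Extents holds half-widths, so the box spans Center - Extents to
+ // Center + Extents along each axis (the full side length is twice the extent).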
+ + // Creators + BoundingBox() : Center(0,0,0), Extents( 1.f, 1.f, 1.f ) {} + XM_CONSTEXPR BoundingBox( _In_ const XMFLOAT3& center, _In_ const XMFLOAT3& extents ) + : Center(center), Extents(extents) {} + BoundingBox( _In_ const BoundingBox& box ) : Center(box.Center), Extents(box.Extents) {} + + // Methods + BoundingBox& operator=( _In_ const BoundingBox& box) { Center = box.Center; Extents = box.Extents; return *this; } + + void XM_CALLCONV Transform( _Out_ BoundingBox& Out, _In_ FXMMATRIX M ) const; + void XM_CALLCONV Transform( _Out_ BoundingBox& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; + + void GetCorners( _Out_writes_(8) XMFLOAT3* Corners ) const; + // Gets the 8 corners of the box + + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR Point ) const; + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + ContainmentType Contains( _In_ const BoundingSphere& sh ) const; + ContainmentType Contains( _In_ const BoundingBox& box ) const; + ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; + ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; + + bool Intersects( _In_ const BoundingSphere& sh ) const; + bool Intersects( _In_ const BoundingBox& box ) const; + bool Intersects( _In_ const BoundingOrientedBox& box ) const; + bool Intersects( _In_ const BoundingFrustum& fr ) const; + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + // Triangle-Box test + + PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR Plane ) const; + // Plane-box test + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; + // Ray-Box test + + ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5 ) const; + // Test box against six planes (see BoundingFrustum::GetPlanes) + + // Static methods + static void CreateMerged( _Out_ BoundingBox& Out, _In_ const BoundingBox& b1, _In_ const BoundingBox& b2 ); + + static void CreateFromSphere( _Out_ BoundingBox& Out, _In_ const BoundingSphere& sh ); + + static void XM_CALLCONV CreateFromPoints( _Out_ BoundingBox& Out, _In_ FXMVECTOR pt1, _In_ FXMVECTOR pt2 ); + static void CreateFromPoints( _Out_ BoundingBox& Out, _In_ size_t Count, + _In_reads_bytes_(sizeof(XMFLOAT3)+Stride*(Count-1)) const XMFLOAT3* pPoints, _In_ size_t Stride ); +}; + +//------------------------------------------------------------------------------------- +// Oriented bounding box +//------------------------------------------------------------------------------------- +struct BoundingOrientedBox +{ + static const size_t CORNER_COUNT = 8; + + XMFLOAT3 Center; // Center of the box. + XMFLOAT3 Extents; // Distance from the center to each side. + XMFLOAT4 Orientation; // Unit quaternion representing rotation (box -> world). 
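+ // Note: the collision functions expect Orientation to stay normalized;
+ // debug builds validate unit length before using the quaternion.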
+ + // Creators + BoundingOrientedBox() : Center(0,0,0), Extents( 1.f, 1.f, 1.f ), Orientation(0,0,0, 1.f ) {} + XM_CONSTEXPR BoundingOrientedBox( _In_ const XMFLOAT3& _Center, _In_ const XMFLOAT3& _Extents, _In_ const XMFLOAT4& _Orientation ) + : Center(_Center), Extents(_Extents), Orientation(_Orientation) {} + BoundingOrientedBox( _In_ const BoundingOrientedBox& box ) + : Center(box.Center), Extents(box.Extents), Orientation(box.Orientation) {} + + // Methods + BoundingOrientedBox& operator=( _In_ const BoundingOrientedBox& box ) { Center = box.Center; Extents = box.Extents; Orientation = box.Orientation; return *this; } + + void XM_CALLCONV Transform( _Out_ BoundingOrientedBox& Out, _In_ FXMMATRIX M ) const; + void XM_CALLCONV Transform( _Out_ BoundingOrientedBox& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; + + void GetCorners( _Out_writes_(8) XMFLOAT3* Corners ) const; + // Gets the 8 corners of the box + + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR Point ) const; + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + ContainmentType Contains( _In_ const BoundingSphere& sh ) const; + ContainmentType Contains( _In_ const BoundingBox& box ) const; + ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; + ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; + + bool Intersects( _In_ const BoundingSphere& sh ) const; + bool Intersects( _In_ const BoundingBox& box ) const; + bool Intersects( _In_ const BoundingOrientedBox& box ) const; + bool Intersects( _In_ const BoundingFrustum& fr ) const; + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + // Triangle-OrientedBox test + + PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR Plane ) const; + // Plane-OrientedBox test + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; + // Ray-OrientedBox test + + ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5 ) const; + // Test OrientedBox against six planes (see BoundingFrustum::GetPlanes) + + // Static methods + static void CreateFromBoundingBox( _Out_ BoundingOrientedBox& Out, _In_ const BoundingBox& box ); + + static void CreateFromPoints( _Out_ BoundingOrientedBox& Out, _In_ size_t Count, + _In_reads_bytes_(sizeof(XMFLOAT3)+Stride*(Count-1)) const XMFLOAT3* pPoints, _In_ size_t Stride ); +}; + +//------------------------------------------------------------------------------------- +// Bounding frustum +//------------------------------------------------------------------------------------- +struct BoundingFrustum +{ + static const size_t CORNER_COUNT = 8; + + XMFLOAT3 Origin; // Origin of the frustum (and projection). + XMFLOAT4 Orientation; // Quaternion representing rotation. + + float RightSlope; // Positive X slope (X/Z). + float LeftSlope; // Negative X slope. + float TopSlope; // Positive Y slope (Y/Z). + float BottomSlope; // Negative Y slope. + float Near, Far; // Z of the near plane and far plane. 
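+ // Note: the frustum is stored in apex/slope form: in its local space the apex
+ // sits at Origin looking down +Z, and a point at depth z lies inside when
+ // LeftSlope*z <= x <= RightSlope*z, BottomSlope*z <= y <= TopSlope*z, and
+ // Near <= z <= Far; Orientation then rotates this local volume into world space.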
+ + // Creators + BoundingFrustum() : Origin(0,0,0), Orientation(0,0,0, 1.f), RightSlope( 1.f ), LeftSlope( -1.f ), + TopSlope( 1.f ), BottomSlope( -1.f ), Near(0), Far( 1.f ) {} + XM_CONSTEXPR BoundingFrustum( _In_ const XMFLOAT3& _Origin, _In_ const XMFLOAT4& _Orientation, + _In_ float _RightSlope, _In_ float _LeftSlope, _In_ float _TopSlope, _In_ float _BottomSlope, + _In_ float _Near, _In_ float _Far ) + : Origin(_Origin), Orientation(_Orientation), + RightSlope(_RightSlope), LeftSlope(_LeftSlope), TopSlope(_TopSlope), BottomSlope(_BottomSlope), + Near(_Near), Far(_Far) {} + BoundingFrustum( _In_ const BoundingFrustum& fr ) + : Origin(fr.Origin), Orientation(fr.Orientation), RightSlope(fr.RightSlope), LeftSlope(fr.LeftSlope), + TopSlope(fr.TopSlope), BottomSlope(fr.BottomSlope), Near(fr.Near), Far(fr.Far) {} + BoundingFrustum( _In_ CXMMATRIX Projection ) { CreateFromMatrix( *this, Projection ); } + + // Methods + BoundingFrustum& operator=( _In_ const BoundingFrustum& fr ) { Origin=fr.Origin; Orientation=fr.Orientation; + RightSlope=fr.RightSlope; LeftSlope=fr.LeftSlope; + TopSlope=fr.TopSlope; BottomSlope=fr.BottomSlope; + Near=fr.Near; Far=fr.Far; return *this; } + + void XM_CALLCONV Transform( _Out_ BoundingFrustum& Out, _In_ FXMMATRIX M ) const; + void XM_CALLCONV Transform( _Out_ BoundingFrustum& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; + + void GetCorners( _Out_writes_(8) XMFLOAT3* Corners ) const; + // Gets the 8 corners of the frustum + + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR Point ) const; + ContainmentType XM_CALLCONV Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + ContainmentType Contains( _In_ const BoundingSphere& sp ) const; + ContainmentType Contains( _In_ const BoundingBox& box ) const; + ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; + ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; + // Frustum-Frustum test + + bool Intersects( _In_ const BoundingSphere& sh ) const; + bool Intersects( _In_ const BoundingBox& box ) const; + bool Intersects( _In_ const BoundingOrientedBox& box ) const; + bool Intersects( _In_ const BoundingFrustum& fr ) const; + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + // Triangle-Frustum test + + PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR Plane ) const; + // Plane-Frustum test + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR rayOrigin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; + // Ray-Frustum test + + ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5 ) const; + // Test frustum against six planes (see BoundingFrustum::GetPlanes) + + void GetPlanes( _Out_opt_ XMVECTOR* NearPlane, _Out_opt_ XMVECTOR* FarPlane, _Out_opt_ XMVECTOR* RightPlane, + _Out_opt_ XMVECTOR* LeftPlane, _Out_opt_ XMVECTOR* TopPlane, _Out_opt_ XMVECTOR* BottomPlane ) const; + // Create 6 Planes representation of Frustum + + // Static methods + static void XM_CALLCONV CreateFromMatrix( _Out_ BoundingFrustum& Out, _In_ FXMMATRIX Projection ); +}; + +//----------------------------------------------------------------------------- +// Triangle intersection testing routines. 
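// Illustrative sketch (not part of the header): a typical pattern is to extract the
// six frustum planes once with BoundingFrustum::GetPlanes and then batch-test
// primitives via the ContainedBy overloads, such as the triangle routine declared below.
inline DirectX::ContainmentType ClassifyTriangle( const DirectX::BoundingFrustum& fr,
    DirectX::FXMVECTOR V0, DirectX::FXMVECTOR V1, DirectX::FXMVECTOR V2 )
{
    using namespace DirectX;
    XMVECTOR planes[6];
    fr.GetPlanes( &planes[0], &planes[1], &planes[2], &planes[3], &planes[4], &planes[5] );
    // Returns DISJOINT, INTERSECTS, or CONTAINS relative to the frustum volume.
    return TriangleTests::ContainedBy( V0, V1, V2,
        planes[0], planes[1], planes[2], planes[3], planes[4], planes[5] );
}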
+//----------------------------------------------------------------------------- +namespace TriangleTests +{ + bool XM_CALLCONV Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _In_ FXMVECTOR V0, _In_ GXMVECTOR V1, _In_ HXMVECTOR V2, _Out_ float& Dist ); + // Ray-Triangle + + bool XM_CALLCONV Intersects( _In_ FXMVECTOR A0, _In_ FXMVECTOR A1, _In_ FXMVECTOR A2, _In_ GXMVECTOR B0, _In_ HXMVECTOR B1, _In_ HXMVECTOR B2 ); + // Triangle-Triangle + + PlaneIntersectionType XM_CALLCONV Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2, _In_ GXMVECTOR Plane ); + // Plane-Triangle + + ContainmentType XM_CALLCONV ContainedBy( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2, + _In_ GXMVECTOR Plane0, _In_ HXMVECTOR Plane1, _In_ HXMVECTOR Plane2, + _In_ CXMVECTOR Plane3, _In_ CXMVECTOR Plane4, _In_ CXMVECTOR Plane5 ); + // Test a triangle against six planes at once (see BoundingFrustum::GetPlanes) +}; + +#pragma warning(pop) + +/**************************************************************************** + * + * Implementation + * + ****************************************************************************/ + +#pragma warning(push) +#pragma warning(disable : 4068 4365 4616 6001) +// C4068/4616: ignore unknown pragmas +// C4365: Off by default noise +// C6001: False positives + +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") + +#include "DirectXCollision.inl" + +#pragma prefast(pop) +#pragma warning(pop) + +}; // namespace DirectX + diff --git a/Inc/DirectXCollision.inl b/Inc/DirectXCollision.inl index 470e28b..752bba2 100644 --- a/Inc/DirectXCollision.inl +++ b/Inc/DirectXCollision.inl @@ -1,4811 +1,4811 @@ -//------------------------------------------------------------------------------------- -// DirectXCollision.inl -- C++ Collision Math library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#pragma once - -XMGLOBALCONST XMVECTORF32 g_BoxOffset[8] = -{ - { -1.0f, -1.0f, 1.0f, 0.0f }, - { 1.0f, -1.0f, 1.0f, 0.0f }, - { 1.0f, 1.0f, 1.0f, 0.0f }, - { -1.0f, 1.0f, 1.0f, 0.0f }, - { -1.0f, -1.0f, -1.0f, 0.0f }, - { 1.0f, -1.0f, -1.0f, 0.0f }, - { 1.0f, 1.0f, -1.0f, 0.0f }, - { -1.0f, 1.0f, -1.0f, 0.0f }, -}; - -XMGLOBALCONST XMVECTORF32 g_RayEpsilon = { 1e-20f, 1e-20f, 1e-20f, 1e-20f }; -XMGLOBALCONST XMVECTORF32 g_RayNegEpsilon = { -1e-20f, -1e-20f, -1e-20f, -1e-20f }; -XMGLOBALCONST XMVECTORF32 g_FltMin = { -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX }; -XMGLOBALCONST XMVECTORF32 g_FltMax = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }; - -namespace Internal -{ - -//----------------------------------------------------------------------------- -// Return true if any of the elements of a 3 vector are equal to 0xffffffff. -// Slightly more efficient than using XMVector3EqualInt. -//----------------------------------------------------------------------------- -inline bool XMVector3AnyTrue( _In_ FXMVECTOR V ) -{ - // Duplicate the fourth element from the first element. 
- XMVECTOR C = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X>(V); - - return XMComparisonAnyTrue( XMVector4EqualIntR( C, XMVectorTrueInt() ) ); -} - - -//----------------------------------------------------------------------------- -// Return true if all of the elements of a 3 vector are equal to 0xffffffff. -// Slightly more efficient than using XMVector3EqualInt. -//----------------------------------------------------------------------------- -inline bool XMVector3AllTrue( _In_ FXMVECTOR V ) -{ - // Duplicate the fourth element from the first element. - XMVECTOR C = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X>( V ); - - return XMComparisonAllTrue( XMVector4EqualIntR( C, XMVectorTrueInt() ) ); -} - -#if defined(_PREFAST_) || !defined(NDEBUG) - -XMGLOBALCONST XMVECTORF32 g_UnitVectorEpsilon = { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f }; -XMGLOBALCONST XMVECTORF32 g_UnitQuaternionEpsilon = { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f }; -XMGLOBALCONST XMVECTORF32 g_UnitPlaneEpsilon = { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f }; - -//----------------------------------------------------------------------------- -// Return true if the vector is a unit vector (length == 1). -//----------------------------------------------------------------------------- -inline bool XMVector3IsUnit( _In_ FXMVECTOR V ) -{ - XMVECTOR Difference = XMVector3Length( V ) - XMVectorSplatOne(); - return XMVector4Less( XMVectorAbs( Difference ), g_UnitVectorEpsilon ); -} - -//----------------------------------------------------------------------------- -// Return true if the quaternion is a unit quaternion. -//----------------------------------------------------------------------------- -inline bool XMQuaternionIsUnit( _In_ FXMVECTOR Q ) -{ - XMVECTOR Difference = XMVector4Length( Q ) - XMVectorSplatOne(); - return XMVector4Less( XMVectorAbs( Difference ), g_UnitQuaternionEpsilon ); -} - -//----------------------------------------------------------------------------- -// Return true if the plane is a unit plane. -//----------------------------------------------------------------------------- -inline bool XMPlaneIsUnit( _In_ FXMVECTOR Plane ) -{ - XMVECTOR Difference = XMVector3Length( Plane ) - XMVectorSplatOne(); - return XMVector4Less( XMVectorAbs( Difference ), g_UnitPlaneEpsilon ); -} - -#endif // _PREFAST_ || !NDEBUG - -//----------------------------------------------------------------------------- -inline XMVECTOR XMPlaneTransform( _In_ FXMVECTOR Plane, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) -{ - XMVECTOR vNormal = XMVector3Rotate( Plane, Rotation ); - XMVECTOR vD = XMVectorSplatW( Plane ) - XMVector3Dot( vNormal, Translation ); - - return XMVectorInsert<0, 0, 0, 0, 1>( vNormal, vD ); -} - -//----------------------------------------------------------------------------- -// Return the point on the line segment (S1, S2) nearest the point P.
-//----------------------------------------------------------------------------- -inline XMVECTOR PointOnLineSegmentNearestPoint( _In_ FXMVECTOR S1, _In_ FXMVECTOR S2, _In_ FXMVECTOR P ) -{ - XMVECTOR Dir = S2 - S1; - XMVECTOR Projection = ( XMVector3Dot( P, Dir ) - XMVector3Dot( S1, Dir ) ); - XMVECTOR LengthSq = XMVector3Dot( Dir, Dir ); - - XMVECTOR t = Projection * XMVectorReciprocal( LengthSq ); - XMVECTOR Point = S1 + t * Dir; - - // t < 0 - XMVECTOR SelectS1 = XMVectorLess( Projection, XMVectorZero() ); - Point = XMVectorSelect( Point, S1, SelectS1 ); - - // t > 1 - XMVECTOR SelectS2 = XMVectorGreater( Projection, LengthSq ); - Point = XMVectorSelect( Point, S2, SelectS2 ); - - return Point; -} - -//----------------------------------------------------------------------------- -// Test if the point (P) on the plane of the triangle is inside the triangle -// (V0, V1, V2). -//----------------------------------------------------------------------------- -inline XMVECTOR XM_CALLCONV PointOnPlaneInsideTriangle( _In_ FXMVECTOR P, _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ GXMVECTOR V2 ) -{ - // Compute the triangle normal. - XMVECTOR N = XMVector3Cross( V2 - V0, V1 - V0 ); - - // Compute the cross products of the vector from the base of each edge to - // the point with each edge vector. - XMVECTOR C0 = XMVector3Cross( P - V0, V1 - V0 ); - XMVECTOR C1 = XMVector3Cross( P - V1, V2 - V1 ); - XMVECTOR C2 = XMVector3Cross( P - V2, V0 - V2 ); - - // If the cross product points in the same direction as the normal then the - // point is inside the edge (it is zero if it is on the edge). - XMVECTOR Zero = XMVectorZero(); - XMVECTOR Inside0 = XMVectorGreaterOrEqual( XMVector3Dot( C0, N ), Zero ); - XMVECTOR Inside1 = XMVectorGreaterOrEqual( XMVector3Dot( C1, N ), Zero ); - XMVECTOR Inside2 = XMVectorGreaterOrEqual( XMVector3Dot( C2, N ), Zero ); - - // If the point is inside all of the edges, it is inside.
- return XMVectorAndInt( XMVectorAndInt( Inside0, Inside1 ), Inside2 ); -} - -//----------------------------------------------------------------------------- -inline bool SolveCubic( _In_ float e, _In_ float f, _In_ float g, _Out_ float* t, _Out_ float* u, _Out_ float* v ) -{ - float p, q, h, rc, d, theta, costh3, sinth3; - - p = f - e * e / 3.0f; - q = g - e * f / 3.0f + e * e * e * 2.0f / 27.0f; - h = q * q / 4.0f + p * p * p / 27.0f; - - if( h > 0.0 ) - { - *t = *u = *v = 0.f; - return false; // only one real root - } - - if( ( h == 0.0 ) && ( q == 0.0 ) ) // all the same root - { - *t = - e / 3; - *u = - e / 3; - *v = - e / 3; - - return true; - } - - d = sqrtf( q * q / 4.0f - h ); - if( d < 0 ) - rc = -powf( -d, 1.0f / 3.0f ); - else - rc = powf( d, 1.0f / 3.0f ); - - theta = XMScalarACos( -q / ( 2.0f * d ) ); - costh3 = XMScalarCos( theta / 3.0f ); - sinth3 = sqrtf( 3.0f ) * XMScalarSin( theta / 3.0f ); - *t = 2.0f * rc * costh3 - e / 3.0f; - *u = -rc * ( costh3 + sinth3 ) - e / 3.0f; - *v = -rc * ( costh3 - sinth3 ) - e / 3.0f; - - return true; -} - -//----------------------------------------------------------------------------- -inline XMVECTOR CalculateEigenVector( _In_ float m11, _In_ float m12, _In_ float m13, - _In_ float m22, _In_ float m23, _In_ float m33, _In_ float e ) -{ - float fTmp[3]; - fTmp[0] = ( float )( m12 * m23 - m13 * ( m22 - e ) ); - fTmp[1] = ( float )( m13 * m12 - m23 * ( m11 - e ) ); - fTmp[2] = ( float )( ( m11 - e ) * ( m22 - e ) - m12 * m12 ); - - XMVECTOR vTmp = XMLoadFloat3( (XMFLOAT3*)fTmp ); - - if( XMVector3Equal( vTmp, XMVectorZero() ) ) // planar or linear - { - float f1, f2, f3; - - // we only have one equation - find a valid one - if( ( m11 - e != 0.0 ) || ( m12 != 0.0 ) || ( m13 != 0.0 ) ) - { - f1 = m11 - e; f2 = m12; f3 = m13; - } - else if( ( m12 != 0.0 ) || ( m22 - e != 0.0 ) || ( m23 != 0.0 ) ) - { - f1 = m12; f2 = m22 - e; f3 = m23; - } - else if( ( m13 != 0.0 ) || ( m23 != 0.0 ) || ( m33 - e != 0.0 ) ) - { - f1 = m13; f2 = m23; f3 = m33 - e; - } - else - { - // error, we'll just make something up - we have NO context - f1 = 1.0; f2 = 0.0; f3 = 0.0; - } - - if( f1 == 0.0 ) - vTmp = XMVectorSetX( vTmp, 0.0f ); - else - vTmp = XMVectorSetX( vTmp, 1.0f ); - - if( f2 == 0.0 ) - vTmp = XMVectorSetY( vTmp, 0.0f ); - else - vTmp = XMVectorSetY( vTmp, 1.0f ); - - if( f3 == 0.0 ) - { - vTmp = XMVectorSetZ( vTmp, 0.0f ); - // recalculate y to make equation work - if( m12 != 0.0 ) - vTmp = XMVectorSetY( vTmp, ( float )( -f1 / f2 ) ); - } - else - { - vTmp = XMVectorSetZ( vTmp, ( float )( ( f2 - f1 ) / f3 ) ); - } - } - - if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) > 1e-5f ) - { - return XMVector3Normalize( vTmp ); - } - else - { - // Multiply by a value large enough to make the vector non-zero. 
- vTmp *= 1e5f; - return XMVector3Normalize( vTmp ); - } -} - -//----------------------------------------------------------------------------- -inline bool CalculateEigenVectors( _In_ float m11, _In_ float m12, _In_ float m13, - _In_ float m22, _In_ float m23, _In_ float m33, - _In_ float e1, _In_ float e2, _In_ float e3, - _Out_ XMVECTOR* pV1, _Out_ XMVECTOR* pV2, _Out_ XMVECTOR* pV3 ) -{ - *pV1 = DirectX::Internal::CalculateEigenVector( m11, m12, m13, m22, m23, m33, e1 ); - *pV2 = DirectX::Internal::CalculateEigenVector( m11, m12, m13, m22, m23, m33, e2 ); - *pV3 = DirectX::Internal::CalculateEigenVector( m11, m12, m13, m22, m23, m33, e3 ); - - bool v1z = false; - bool v2z = false; - bool v3z = false; - - XMVECTOR Zero = XMVectorZero(); - - if ( XMVector3Equal( *pV1, Zero ) ) - v1z = true; - - if ( XMVector3Equal( *pV2, Zero ) ) - v2z = true; - - if ( XMVector3Equal( *pV3, Zero )) - v3z = true; - - bool e12 = ( fabsf( XMVectorGetX( XMVector3Dot( *pV1, *pV2 ) ) ) > 0.1f ); // check for non-orthogonal vectors - bool e13 = ( fabsf( XMVectorGetX( XMVector3Dot( *pV1, *pV3 ) ) ) > 0.1f ); - bool e23 = ( fabsf( XMVectorGetX( XMVector3Dot( *pV2, *pV3 ) ) ) > 0.1f ); - - if( ( v1z && v2z && v3z ) || ( e12 && e13 && e23 ) || - ( e12 && v3z ) || ( e13 && v2z ) || ( e23 && v1z ) ) // all eigenvectors are 0- any basis set - { - *pV1 = g_XMIdentityR0.v; - *pV2 = g_XMIdentityR1.v; - *pV3 = g_XMIdentityR2.v; - return true; - } - - if( v1z && v2z ) - { - XMVECTOR vTmp = XMVector3Cross( g_XMIdentityR1, *pV3 ); - if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) < 1e-5f ) - { - vTmp = XMVector3Cross( g_XMIdentityR0, *pV3 ); - } - *pV1 = XMVector3Normalize( vTmp ); - *pV2 = XMVector3Cross( *pV3, *pV1 ); - return true; - } - - if( v3z && v1z ) - { - XMVECTOR vTmp = XMVector3Cross( g_XMIdentityR1, *pV2 ); - if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) < 1e-5f ) - { - vTmp = XMVector3Cross( g_XMIdentityR0, *pV2 ); - } - *pV3 = XMVector3Normalize( vTmp ); - *pV1 = XMVector3Cross( *pV2, *pV3 ); - return true; - } - - if( v2z && v3z ) - { - XMVECTOR vTmp = XMVector3Cross( g_XMIdentityR1, *pV1 ); - if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) < 1e-5f ) - { - vTmp = XMVector3Cross( g_XMIdentityR0, *pV1 ); - } - *pV2 = XMVector3Normalize( vTmp ); - *pV3 = XMVector3Cross( *pV1, *pV2 ); - return true; - } - - if( ( v1z ) || e12 ) - { - *pV1 = XMVector3Cross( *pV2, *pV3 ); - return true; - } - - if( ( v2z ) || e23 ) - { - *pV2 = XMVector3Cross( *pV3, *pV1 ); - return true; - } - - if( ( v3z ) || e13 ) - { - *pV3 = XMVector3Cross( *pV1, *pV2 ); - return true; - } - - return true; -} - -//----------------------------------------------------------------------------- -inline bool CalculateEigenVectorsFromCovarianceMatrix( _In_ float Cxx, _In_ float Cyy, _In_ float Czz, - _In_ float Cxy, _In_ float Cxz, _In_ float Cyz, - _Out_ XMVECTOR* pV1, _Out_ XMVECTOR* pV2, _Out_ XMVECTOR* pV3 ) -{ - // Calculate the eigenvalues by solving a cubic equation. 
-    float e = -( Cxx + Cyy + Czz );
-    float f = Cxx * Cyy + Cyy * Czz + Czz * Cxx - Cxy * Cxy - Cxz * Cxz - Cyz * Cyz;
-    float g = Cxy * Cxy * Czz + Cxz * Cxz * Cyy + Cyz * Cyz * Cxx - Cxy * Cyz * Cxz * 2.0f - Cxx * Cyy * Czz;
-
-    float ev1, ev2, ev3;
-    if( !DirectX::Internal::SolveCubic( e, f, g, &ev1, &ev2, &ev3 ) )
-    {
-        // set them to arbitrary orthonormal basis set
-        *pV1 = g_XMIdentityR0.v;
-        *pV2 = g_XMIdentityR1.v;
-        *pV3 = g_XMIdentityR2.v;
-        return false;
-    }
-
-    return DirectX::Internal::CalculateEigenVectors( Cxx, Cxy, Cxz, Cyy, Cyz, Czz, ev1, ev2, ev3, pV1, pV2, pV3 );
-}
-
-//-----------------------------------------------------------------------------
-inline void XM_CALLCONV FastIntersectTrianglePlane( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, GXMVECTOR Plane,
-                                                    XMVECTOR& Outside, XMVECTOR& Inside )
-{
-    // Plane0
-    XMVECTOR Dist0 = XMVector4Dot( V0, Plane );
-    XMVECTOR Dist1 = XMVector4Dot( V1, Plane );
-    XMVECTOR Dist2 = XMVector4Dot( V2, Plane );
-
-    XMVECTOR MinDist = XMVectorMin( Dist0, Dist1 );
-    MinDist = XMVectorMin( MinDist, Dist2 );
-
-    XMVECTOR MaxDist = XMVectorMax( Dist0, Dist1 );
-    MaxDist = XMVectorMax( MaxDist, Dist2 );
-
-    XMVECTOR Zero = XMVectorZero();
-
-    // Outside the plane?
-    Outside = XMVectorGreater( MinDist, Zero );
-
-    // Fully inside the plane?
-    Inside = XMVectorLess( MaxDist, Zero );
-}
-
-//-----------------------------------------------------------------------------
-inline void FastIntersectSpherePlane( _In_ FXMVECTOR Center, _In_ FXMVECTOR Radius, _In_ FXMVECTOR Plane,
-                                      _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside )
-{
-    XMVECTOR Dist = XMVector4Dot( Center, Plane );
-
-    // Outside the plane?
-    Outside = XMVectorGreater( Dist, Radius );
-
-    // Fully inside the plane?
-    Inside = XMVectorLess( Dist, -Radius );
-}
-
-//-----------------------------------------------------------------------------
-inline void FastIntersectAxisAlignedBoxPlane( _In_ FXMVECTOR Center, _In_ FXMVECTOR Extents, _In_ FXMVECTOR Plane,
-                                              _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside )
-{
-    // Compute the distance to the center of the box.
-    XMVECTOR Dist = XMVector4Dot( Center, Plane );
-
-    // Project the axes of the box onto the normal of the plane.  Half the
-    // length of the projection (sometimes called the "radius") is equal to
-    // h(u) * abs(n dot b(u)) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w))
-    // where h(i) are extents of the box, n is the plane normal, and b(i) are the
-    // axes of the box.  In this case b(i) = [(1,0,0), (0,1,0), (0,0,1)].
-    XMVECTOR Radius = XMVector3Dot( Extents, XMVectorAbs( Plane ) );
-
-    // Outside the plane?
-    Outside = XMVectorGreater( Dist, Radius );
-
-    // Fully inside the plane?
-    Inside = XMVectorLess( Dist, -Radius );
-}
-
-//-----------------------------------------------------------------------------
-inline void XM_CALLCONV FastIntersectOrientedBoxPlane( _In_ FXMVECTOR Center, _In_ FXMVECTOR Extents, _In_ FXMVECTOR Axis0, _In_ GXMVECTOR Axis1,
-                                                       _In_ HXMVECTOR Axis2, _In_ HXMVECTOR Plane, _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside )
-{
-    // Compute the distance to the center of the box.
-    XMVECTOR Dist = XMVector4Dot( Center, Plane );
-
-    // Project the axes of the box onto the normal of the plane.  Half the
-    // length of the projection (sometimes called the "radius") is equal to
-    // h(u) * abs(n dot b(u)) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w))
-    // where h(i) are extents of the box, n is the plane normal, and b(i) are the
-    // axes of the box.
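-    // The three dot products below are packed into the x, y and z lanes of a
-    // single vector so that one XMVector3Dot against the absolute extents
-    // evaluates the whole h(u)*|n.b(u)| + h(v)*|n.b(v)| + h(w)*|n.b(w)| sum.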
-    XMVECTOR Radius = XMVector3Dot( Plane, Axis0 );
-    Radius = XMVectorInsert<0, 0, 1, 0, 0>( Radius, XMVector3Dot( Plane, Axis1 ) );
-    Radius = XMVectorInsert<0, 0, 0, 1, 0>( Radius, XMVector3Dot( Plane, Axis2 ) );
-    Radius = XMVector3Dot( Extents, XMVectorAbs( Radius ) );
-
-    // Outside the plane?
-    Outside = XMVectorGreater( Dist, Radius );
-
-    // Fully inside the plane?
-    Inside = XMVectorLess( Dist, -Radius );
-}
-
-//-----------------------------------------------------------------------------
-inline void XM_CALLCONV FastIntersectFrustumPlane( _In_ FXMVECTOR Point0, _In_ FXMVECTOR Point1, _In_ FXMVECTOR Point2, _In_ GXMVECTOR Point3,
-                                                   _In_ HXMVECTOR Point4, _In_ HXMVECTOR Point5, _In_ CXMVECTOR Point6, _In_ CXMVECTOR Point7,
-                                                   _In_ CXMVECTOR Plane, _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside )
-{
-    // Find the min/max projection of the frustum onto the plane normal.
-    XMVECTOR Min, Max, Dist;
-
-    Min = Max = XMVector3Dot( Plane, Point0 );
-
-    Dist = XMVector3Dot( Plane, Point1 );
-    Min = XMVectorMin( Min, Dist );
-    Max = XMVectorMax( Max, Dist );
-
-    Dist = XMVector3Dot( Plane, Point2 );
-    Min = XMVectorMin( Min, Dist );
-    Max = XMVectorMax( Max, Dist );
-
-    Dist = XMVector3Dot( Plane, Point3 );
-    Min = XMVectorMin( Min, Dist );
-    Max = XMVectorMax( Max, Dist );
-
-    Dist = XMVector3Dot( Plane, Point4 );
-    Min = XMVectorMin( Min, Dist );
-    Max = XMVectorMax( Max, Dist );
-
-    Dist = XMVector3Dot( Plane, Point5 );
-    Min = XMVectorMin( Min, Dist );
-    Max = XMVectorMax( Max, Dist );
-
-    Dist = XMVector3Dot( Plane, Point6 );
-    Min = XMVectorMin( Min, Dist );
-    Max = XMVectorMax( Max, Dist );
-
-    Dist = XMVector3Dot( Plane, Point7 );
-    Min = XMVectorMin( Min, Dist );
-    Max = XMVectorMax( Max, Dist );
-
-    XMVECTOR PlaneDist = -XMVectorSplatW( Plane );
-
-    // Outside the plane?
-    Outside = XMVectorGreater( Min, PlaneDist );
-
-    // Fully inside the plane?
-    Inside = XMVectorLess( Max, PlaneDist );
-}
-
-}; // namespace Internal
-
-
-/****************************************************************************
- *
- * BoundingSphere
- *
- ****************************************************************************/
-
-//-----------------------------------------------------------------------------
-// Transform a sphere by an angle preserving transform.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV BoundingSphere::Transform( BoundingSphere& Out, FXMMATRIX M ) const
-{
-    // Load the center of the sphere.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-
-    // Transform the center of the sphere.
-    XMVECTOR C = XMVector3Transform( vCenter, M );
-
-    XMVECTOR dX = XMVector3Dot( M.r[0], M.r[0] );
-    XMVECTOR dY = XMVector3Dot( M.r[1], M.r[1] );
-    XMVECTOR dZ = XMVector3Dot( M.r[2], M.r[2] );
-
-    XMVECTOR d = XMVectorMax( dX, XMVectorMax( dY, dZ ) );
-
-    // Store the center of the sphere.
-    XMStoreFloat3( &Out.Center, C );
-
-    // Scale the radius of the sphere.
-    float Scale = sqrtf( XMVectorGetX(d) );
-    Out.Radius = Radius * Scale;
-}
-
-_Use_decl_annotations_
-inline void XM_CALLCONV BoundingSphere::Transform( BoundingSphere& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const
-{
-    // Load the center of the sphere.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-
-    // Transform the center of the sphere.
-    vCenter = XMVector3Rotate( vCenter * XMVectorReplicate( Scale ), Rotation ) + Translation;
-
-    // Store the center of the sphere.
-    XMStoreFloat3( &Out.Center, vCenter );
-
-    // Scale the radius of the sphere.
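-    // (This overload assumes a uniform scale; for non-uniform scaling use the
-    // FXMMATRIX overload above, which conservatively picks the largest axis
-    // scale of the matrix.)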
- Out.Radius = Radius * Scale; -} - - -//----------------------------------------------------------------------------- -// Point in sphere test. -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType XM_CALLCONV BoundingSphere::Contains( FXMVECTOR Point ) const -{ - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); - - XMVECTOR DistanceSquared = XMVector3LengthSq( Point - vCenter ); - XMVECTOR RadiusSquared = XMVectorMultiply( vRadius, vRadius ); - - return XMVector3LessOrEqual( DistanceSquared, RadiusSquared ) ? CONTAINS : DISJOINT; -} - - -//----------------------------------------------------------------------------- -// Triangle in sphere test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType XM_CALLCONV BoundingSphere::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const -{ - if ( !Intersects(V0,V1,V2) ) - return DISJOINT; - - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); - XMVECTOR RadiusSquared = XMVectorMultiply( vRadius, vRadius ); - - XMVECTOR DistanceSquared = XMVector3LengthSq( V0 - vCenter ); - XMVECTOR Inside = XMVectorLessOrEqual(DistanceSquared, RadiusSquared); - - DistanceSquared = XMVector3LengthSq( V1 - vCenter ); - Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual(DistanceSquared, RadiusSquared) ); - - DistanceSquared = XMVector3LengthSq( V2 - vCenter ); - Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual(DistanceSquared, RadiusSquared) ); - - return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Sphere in sphere test. -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingSphere::Contains( const BoundingSphere& sh ) const -{ - XMVECTOR Center1 = XMLoadFloat3( &Center ); - float r1 = Radius; - - XMVECTOR Center2 = XMLoadFloat3( &sh.Center ); - float r2 = sh.Radius; - - XMVECTOR V = XMVectorSubtract( Center2, Center1 ); - - XMVECTOR Dist = XMVector3Length( V ); - - float d = XMVectorGetX( Dist ); - - return (r1 + r2 >= d) ? ((r1 - r2 >= d) ? CONTAINS : INTERSECTS) : DISJOINT; -} - - -//----------------------------------------------------------------------------- -// Axis-aligned box in sphere test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingSphere::Contains( const BoundingBox& box ) const -{ - if ( !box.Intersects(*this) ) - return DISJOINT; - - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); - XMVECTOR RadiusSq = vRadius * vRadius; - - XMVECTOR boxCenter = XMLoadFloat3( &box.Center ); - XMVECTOR boxExtents = XMLoadFloat3( &box.Extents ); - - XMVECTOR InsideAll = XMVectorTrueInt(); - - XMVECTOR offset = boxCenter - vCenter; - - for( size_t i = 0; i < BoundingBox::CORNER_COUNT; ++i ) - { - XMVECTOR C = XMVectorMultiplyAdd( boxExtents, g_BoxOffset[i], offset ); - XMVECTOR d = XMVector3LengthSq( C ); - InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( d, RadiusSq ) ); - } - - return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? 
CONTAINS : INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Oriented box in sphere test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingSphere::Contains( const BoundingOrientedBox& box ) const -{ - if ( !box.Intersects(*this) ) - return DISJOINT; - - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); - XMVECTOR RadiusSq = vRadius * vRadius; - - XMVECTOR boxCenter = XMLoadFloat3( &box.Center ); - XMVECTOR boxExtents = XMLoadFloat3( &box.Extents ); - XMVECTOR boxOrientation = XMLoadFloat4( &box.Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( boxOrientation ) ); - - XMVECTOR InsideAll = XMVectorTrueInt(); - - for( size_t i = 0; i < BoundingOrientedBox::CORNER_COUNT; ++i ) - { - XMVECTOR C = XMVector3Rotate( boxExtents * g_BoxOffset[i], boxOrientation ) + boxCenter; - XMVECTOR d = XMVector3LengthSq( XMVectorSubtract( vCenter, C ) ); - InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( d, RadiusSq ) ); - } - - return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; - -} - - -//----------------------------------------------------------------------------- -// Frustum in sphere test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingSphere::Contains( const BoundingFrustum& fr ) const -{ - if ( !fr.Intersects(*this) ) - return DISJOINT; - - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); - XMVECTOR RadiusSq = vRadius * vRadius; - - XMVECTOR vOrigin = XMLoadFloat3( &fr.Origin ); - XMVECTOR vOrientation = XMLoadFloat4( &fr.Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); - - // Build the corners of the frustum. - XMVECTOR vRightTop = XMVectorSet( fr.RightSlope, fr.TopSlope, 1.0f, 0.0f ); - XMVECTOR vRightBottom = XMVectorSet( fr.RightSlope, fr.BottomSlope, 1.0f, 0.0f ); - XMVECTOR vLeftTop = XMVectorSet( fr.LeftSlope, fr.TopSlope, 1.0f, 0.0f ); - XMVECTOR vLeftBottom = XMVectorSet( fr.LeftSlope, fr.BottomSlope, 1.0f, 0.0f ); - XMVECTOR vNear = XMVectorReplicatePtr( &fr.Near ); - XMVECTOR vFar = XMVectorReplicatePtr( &fr.Far ); - - XMVECTOR Corners[BoundingFrustum::CORNER_COUNT]; - Corners[0] = vRightTop * vNear; - Corners[1] = vRightBottom * vNear; - Corners[2] = vLeftTop * vNear; - Corners[3] = vLeftBottom * vNear; - Corners[4] = vRightTop * vFar; - Corners[5] = vRightBottom * vFar; - Corners[6] = vLeftTop * vFar; - Corners[7] = vLeftBottom * vFar; - - XMVECTOR InsideAll = XMVectorTrueInt(); - for( size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i ) - { - XMVECTOR C = XMVector3Rotate( Corners[i], vOrientation ) + vOrigin; - XMVECTOR d = XMVector3LengthSq( XMVectorSubtract( vCenter, C ) ); - InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( d, RadiusSq ) ); - } - - return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Sphere vs. sphere test. -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline bool BoundingSphere::Intersects( const BoundingSphere& sh ) const -{ - // Load A. - XMVECTOR vCenterA = XMLoadFloat3( &Center ); - XMVECTOR vRadiusA = XMVectorReplicatePtr( &Radius ); - - // Load B. 
-    XMVECTOR vCenterB = XMLoadFloat3( &sh.Center );
-    XMVECTOR vRadiusB = XMVectorReplicatePtr( &sh.Radius );
-
-    // Distance squared between centers.
-    XMVECTOR Delta = vCenterB - vCenterA;
-    XMVECTOR DistanceSquared = XMVector3LengthSq( Delta );
-
-    // Sum of the radii squared.
-    XMVECTOR RadiusSquared = XMVectorAdd( vRadiusA, vRadiusB );
-    RadiusSquared = XMVectorMultiply( RadiusSquared, RadiusSquared );
-
-    return XMVector3LessOrEqual( DistanceSquared, RadiusSquared );
-}
-
-
-//-----------------------------------------------------------------------------
-// Box vs. sphere test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool BoundingSphere::Intersects( const BoundingBox& box ) const
-{
-    return box.Intersects( *this );
-}
-
-_Use_decl_annotations_
-inline bool BoundingSphere::Intersects( const BoundingOrientedBox& box ) const
-{
-    return box.Intersects( *this );
-}
-
-
-//-----------------------------------------------------------------------------
-// Frustum vs. sphere test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool BoundingSphere::Intersects( const BoundingFrustum& fr ) const
-{
-    return fr.Intersects( *this );
-}
-
-
-//-----------------------------------------------------------------------------
-// Triangle vs sphere test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool XM_CALLCONV BoundingSphere::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const
-{
-    // Load the sphere.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vRadius = XMVectorReplicatePtr( &Radius );
-
-    // Compute the plane of the triangle (has to be normalized).
-    XMVECTOR N = XMVector3Normalize( XMVector3Cross( V1 - V0, V2 - V0 ) );
-
-    // Assert that the triangle is not degenerate.
-    assert( !XMVector3Equal( N, XMVectorZero() ) );
-
-    // Find the nearest feature on the triangle to the sphere.
-    XMVECTOR Dist = XMVector3Dot( vCenter - V0, N );
-
-    // If the center of the sphere is farther from the plane of the triangle than
-    // the radius of the sphere, then there cannot be an intersection.
-    XMVECTOR NoIntersection = XMVectorLess( Dist, -vRadius );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Dist, vRadius ) );
-
-    // Project the center of the sphere onto the plane of the triangle.
-    XMVECTOR Point = vCenter - ( N * Dist );
-
-    // Is it inside all the edges? If so we intersect because the distance
-    // to the plane is less than the radius.
-    XMVECTOR Intersection = DirectX::Internal::PointOnPlaneInsideTriangle( Point, V0, V1, V2 );
-
-    // Find the nearest point on each edge.
-    XMVECTOR RadiusSq = vRadius * vRadius;
-
-    // Edge 0,1
-    Point = DirectX::Internal::PointOnLineSegmentNearestPoint( V0, V1, vCenter );
-
-    // If the distance from the center of the sphere to the point is less than
-    // the radius of the sphere then it must intersect.
-    Intersection = XMVectorOrInt( Intersection, XMVectorLessOrEqual( XMVector3LengthSq( vCenter - Point ), RadiusSq ) );
-
-    // Edge 1,2
-    Point = DirectX::Internal::PointOnLineSegmentNearestPoint( V1, V2, vCenter );
-
-    // If the distance from the center of the sphere to the point is less than
-    // the radius of the sphere then it must intersect.
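-    // (PointOnLineSegmentNearestPoint clamps the projection of the center to
-    // the segment endpoints, so these edge tests also cover the case where
-    // the sphere only touches a triangle vertex.)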
-    Intersection = XMVectorOrInt( Intersection, XMVectorLessOrEqual( XMVector3LengthSq( vCenter - Point ), RadiusSq ) );
-
-    // Edge 2,0
-    Point = DirectX::Internal::PointOnLineSegmentNearestPoint( V2, V0, vCenter );
-
-    // If the distance from the center of the sphere to the point is less than
-    // the radius of the sphere then it must intersect.
-    Intersection = XMVectorOrInt( Intersection, XMVectorLessOrEqual( XMVector3LengthSq( vCenter - Point ), RadiusSq ) );
-
-    return XMVector4EqualInt( XMVectorAndCInt( Intersection, NoIntersection ), XMVectorTrueInt() );
-}
-
-
-//-----------------------------------------------------------------------------
-// Sphere-plane intersection
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline PlaneIntersectionType XM_CALLCONV BoundingSphere::Intersects( FXMVECTOR Plane ) const
-{
-    assert( DirectX::Internal::XMPlaneIsUnit( Plane ) );
-
-    // Load the sphere.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vRadius = XMVectorReplicatePtr( &Radius );
-
-    // Set w of the center to one so we can dot4 with a plane.
-    vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
-
-    XMVECTOR Outside, Inside;
-    DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane, Outside, Inside );
-
-    // If the sphere is outside any plane it is outside.
-    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
-        return FRONT;
-
-    // If the sphere is inside all planes it is inside.
-    if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
-        return BACK;
-
-    // The sphere is not inside all planes or outside a plane, so it intersects.
-    return INTERSECTING;
-}
-
-
-//-----------------------------------------------------------------------------
-// Compute the intersection of a ray (Origin, Direction) with a sphere.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool XM_CALLCONV BoundingSphere::Intersects( FXMVECTOR Origin, FXMVECTOR Direction, float& Dist ) const
-{
-    assert( DirectX::Internal::XMVector3IsUnit( Direction ) );
-
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vRadius = XMVectorReplicatePtr( &Radius );
-
-    // l is the vector from the ray origin to the center of the sphere.
-    XMVECTOR l = vCenter - Origin;
-
-    // s is the projection of l onto the ray direction.
-    XMVECTOR s = XMVector3Dot( l, Direction );
-
-    XMVECTOR l2 = XMVector3Dot( l, l );
-
-    XMVECTOR r2 = vRadius * vRadius;
-
-    // m2 is the squared distance from the center of the sphere to the projection.
-    XMVECTOR m2 = l2 - s * s;
-
-    XMVECTOR NoIntersection;
-
-    // If the ray origin is outside the sphere and the center of the sphere is
-    // behind the ray origin there is no intersection.
-    NoIntersection = XMVectorAndInt( XMVectorLess( s, XMVectorZero() ), XMVectorGreater( l2, r2 ) );
-
-    // If the squared distance from the center of the sphere to the projection
-    // is greater than the radius squared the ray will miss the sphere.
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( m2, r2 ) );
-
-    // The ray hits the sphere, compute the nearest intersection point.
-    XMVECTOR q = XMVectorSqrt( r2 - m2 );
-    XMVECTOR t1 = s - q;
-    XMVECTOR t2 = s + q;
-
-    XMVECTOR OriginInside = XMVectorLessOrEqual( l2, r2 );
-    XMVECTOR t = XMVectorSelect( t1, t2, OriginInside );
-
-    if( XMVector4NotEqualInt( NoIntersection, XMVectorTrueInt() ) )
-    {
-        // Store the x-component to Dist.
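-        // (t1 = s - q is the near hit; when the ray starts inside the sphere
-        // t1 is non-positive, so the far hit t2 is selected instead.)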
- XMStoreFloat( &Dist, t ); - return true; - } - - Dist = 0.f; - return false; -} - - -//----------------------------------------------------------------------------- -// Test a sphere vs 6 planes (typically forming a frustum). -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType XM_CALLCONV BoundingSphere::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2, - GXMVECTOR Plane3, HXMVECTOR Plane4, HXMVECTOR Plane5 ) const -{ - // Load the sphere. - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); - - // Set w of the center to one so we can dot4 with a plane. - vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() ); - - XMVECTOR Outside, Inside; - - // Test against each plane. - DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane0, Outside, Inside ); - - XMVECTOR AnyOutside = Outside; - XMVECTOR AllInside = Inside; - - DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane1, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane2, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane3, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane4, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane5, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - // If the sphere is outside any plane it is outside. - if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) ) - return DISJOINT; - - // If the sphere is inside all planes it is inside. - if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) ) - return CONTAINS; - - // The sphere is not inside all planes or outside a plane, it may intersect. 
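-    // (Note this plane-only test is conservative: a sphere that straddles two
-    // planes near an edge of the volume can be reported as INTERSECTS even
-    // though it is actually outside.)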
-    return INTERSECTS;
-}
-
-
-//-----------------------------------------------------------------------------
-// Creates a bounding sphere that contains two other bounding spheres
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void BoundingSphere::CreateMerged( BoundingSphere& Out, const BoundingSphere& S1, const BoundingSphere& S2 )
-{
-    XMVECTOR Center1 = XMLoadFloat3( &S1.Center );
-    float r1 = S1.Radius;
-
-    XMVECTOR Center2 = XMLoadFloat3( &S2.Center );
-    float r2 = S2.Radius;
-
-    XMVECTOR V = XMVectorSubtract( Center2, Center1 );
-
-    XMVECTOR Dist = XMVector3Length( V );
-
-    float d = XMVectorGetX(Dist);
-
-    if ( r1 + r2 >= d )
-    {
-        if ( r1 - r2 >= d )
-        {
-            Out = S1;
-            return;
-        }
-        else if ( r2 - r1 >= d )
-        {
-            Out = S2;
-            return;
-        }
-    }
-
-    XMVECTOR N = XMVectorDivide( V, Dist );
-
-    float t1 = XMMin( -r1, d-r2 );
-    float t2 = XMMax( r1, d+r2 );
-    float t_5 = (t2 - t1) * 0.5f;
-
-    XMVECTOR NCenter = XMVectorAdd( Center1, XMVectorMultiply( N, XMVectorReplicate( t_5 + t1 ) ) );
-
-    XMStoreFloat3( &Out.Center, NCenter );
-    Out.Radius = t_5;
-}
-
-
-//-----------------------------------------------------------------------------
-// Create a sphere circumscribing the bounding box
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void BoundingSphere::CreateFromBoundingBox( BoundingSphere& Out, const BoundingBox& box )
-{
-    Out.Center = box.Center;
-    XMVECTOR vExtents = XMLoadFloat3( &box.Extents );
-    Out.Radius = XMVectorGetX( XMVector3Length( vExtents ) );
-}
-
-_Use_decl_annotations_
-inline void BoundingSphere::CreateFromBoundingBox( BoundingSphere& Out, const BoundingOrientedBox& box )
-{
-    // Bounding box orientation is irrelevant because a sphere is rotationally invariant
-    Out.Center = box.Center;
-    XMVECTOR vExtents = XMLoadFloat3( &box.Extents );
-    Out.Radius = XMVectorGetX( XMVector3Length( vExtents ) );
-}
-
-
-//-----------------------------------------------------------------------------
-// Find the approximate smallest enclosing bounding sphere for a set of
-// points.  Exact computation of the smallest enclosing bounding sphere is
-// possible but is slower and requires a more complex algorithm.
-// The algorithm is based on Jack Ritter, "An Efficient Bounding Sphere",
-// Graphics Gems.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void BoundingSphere::CreateFromPoints( BoundingSphere& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride )
-{
-    assert( Count > 0 );
-    assert( pPoints );
-
-    // Find the points with minimum and maximum x, y, and z
-    XMVECTOR MinX, MaxX, MinY, MaxY, MinZ, MaxZ;
-
-    MinX = MaxX = MinY = MaxY = MinZ = MaxZ = XMLoadFloat3( pPoints );
-
-    for( size_t i = 1; i < Count; ++i )
-    {
-        XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) );
-
-        float px = XMVectorGetX( Point );
-        float py = XMVectorGetY( Point );
-        float pz = XMVectorGetZ( Point );
-
-        if( px < XMVectorGetX( MinX ) )
-            MinX = Point;
-
-        if( px > XMVectorGetX( MaxX ) )
-            MaxX = Point;
-
-        if( py < XMVectorGetY( MinY ) )
-            MinY = Point;
-
-        if( py > XMVectorGetY( MaxY ) )
-            MaxY = Point;
-
-        if( pz < XMVectorGetZ( MinZ ) )
-            MinZ = Point;
-
-        if( pz > XMVectorGetZ( MaxZ ) )
-            MaxZ = Point;
-    }
-
-    // Use the min/max pair that are farthest apart to form the initial sphere.
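-    // Ritter's second pass then grows the sphere as needed: for a point at
-    // distance dist > radius from the center, the new sphere spans from the
-    // far side of the old sphere to that point, i.e.
-    //
-    //    radius' = ( radius + dist ) / 2
-    //    center' = center + ( 1 - radius' / dist ) * ( point - center )
-    //
-    // which is exactly the update applied in the second loop below.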
-    XMVECTOR DeltaX = MaxX - MinX;
-    XMVECTOR DistX = XMVector3Length( DeltaX );
-
-    XMVECTOR DeltaY = MaxY - MinY;
-    XMVECTOR DistY = XMVector3Length( DeltaY );
-
-    XMVECTOR DeltaZ = MaxZ - MinZ;
-    XMVECTOR DistZ = XMVector3Length( DeltaZ );
-
-    XMVECTOR vCenter;
-    XMVECTOR vRadius;
-
-    if( XMVector3Greater( DistX, DistY ) )
-    {
-        if( XMVector3Greater( DistX, DistZ ) )
-        {
-            // Use min/max x.
-            vCenter = XMVectorLerp(MaxX,MinX,0.5f);
-            vRadius = DistX * 0.5f;
-        }
-        else
-        {
-            // Use min/max z.
-            vCenter = XMVectorLerp(MaxZ,MinZ,0.5f);
-            vRadius = DistZ * 0.5f;
-        }
-    }
-    else // Y >= X
-    {
-        if( XMVector3Greater( DistY, DistZ ) )
-        {
-            // Use min/max y.
-            vCenter = XMVectorLerp(MaxY,MinY,0.5f);
-            vRadius = DistY * 0.5f;
-        }
-        else
-        {
-            // Use min/max z.
-            vCenter = XMVectorLerp(MaxZ,MinZ,0.5f);
-            vRadius = DistZ * 0.5f;
-        }
-    }
-
-    // Add any points not inside the sphere.
-    for( size_t i = 0; i < Count; ++i )
-    {
-        XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) );
-
-        XMVECTOR Delta = Point - vCenter;
-
-        XMVECTOR Dist = XMVector3Length( Delta );
-
-        if( XMVector3Greater( Dist, vRadius ) )
-        {
-            // Adjust sphere to include the new point.
-            vRadius = ( vRadius + Dist ) * 0.5f;
-            vCenter += ( XMVectorReplicate( 1.0f ) - XMVectorDivide(vRadius,Dist) ) * Delta;
-        }
-    }
-
-    XMStoreFloat3( &Out.Center, vCenter );
-    XMStoreFloat( &Out.Radius, vRadius );
-}
-
-
-//-----------------------------------------------------------------------------
-// Create sphere containing frustum
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void BoundingSphere::CreateFromFrustum( BoundingSphere& Out, const BoundingFrustum& fr )
-{
-    XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT];
-    fr.GetCorners( Corners );
-    CreateFromPoints( Out, BoundingFrustum::CORNER_COUNT, Corners, sizeof(XMFLOAT3) );
-}
-
-
-/****************************************************************************
- *
- * BoundingBox
- *
- ****************************************************************************/
-
-//-----------------------------------------------------------------------------
-// Transform an axis aligned box by an angle preserving transform.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV BoundingBox::Transform( BoundingBox& Out, FXMMATRIX M ) const
-{
-    // Load center and extents.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vExtents = XMLoadFloat3( &Extents );
-
-    // Compute and transform the corners and find new min/max bounds.
-    XMVECTOR Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[0], vCenter );
-    Corner = XMVector3Transform( Corner, M );
-
-    XMVECTOR Min, Max;
-    Min = Max = Corner;
-
-    for( size_t i = 1; i < CORNER_COUNT; ++i )
-    {
-        Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[i], vCenter );
-        Corner = XMVector3Transform( Corner, M );
-
-        Min = XMVectorMin( Min, Corner );
-        Max = XMVectorMax( Max, Corner );
-    }
-
-    // Store center and extents.
-    XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f );
-    XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f );
-}
-
-_Use_decl_annotations_
-inline void XM_CALLCONV BoundingBox::Transform( BoundingBox& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const
-{
-    assert( DirectX::Internal::XMQuaternionIsUnit( Rotation ) );
-
-    // Load center and extents.
- XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vExtents = XMLoadFloat3( &Extents ); - - XMVECTOR VectorScale = XMVectorReplicate( Scale ); - - // Compute and transform the corners and find new min/max bounds. - XMVECTOR Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[0], vCenter ); - Corner = XMVector3Rotate( Corner * VectorScale, Rotation ) + Translation; - - XMVECTOR Min, Max; - Min = Max = Corner; - - for( size_t i = 1; i < CORNER_COUNT; ++i ) - { - Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[i], vCenter ); - Corner = XMVector3Rotate( Corner * VectorScale, Rotation ) + Translation; - - Min = XMVectorMin( Min, Corner ); - Max = XMVectorMax( Max, Corner ); - } - - // Store center and extents. - XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f ); - XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f ); -} - - -//----------------------------------------------------------------------------- -// Get the corner points of the box -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline void BoundingBox::GetCorners( XMFLOAT3* Corners ) const -{ - assert( Corners != nullptr ); - - // Load the box - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vExtents = XMLoadFloat3( &Extents ); - - for( size_t i = 0; i < CORNER_COUNT; ++i ) - { - XMVECTOR C = XMVectorMultiplyAdd( vExtents, g_BoxOffset[i], vCenter ); - XMStoreFloat3( &Corners[i], C ); - } -} - - -//----------------------------------------------------------------------------- -// Point in axis-aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType XM_CALLCONV BoundingBox::Contains( FXMVECTOR Point ) const -{ - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vExtents = XMLoadFloat3( &Extents ); - - return XMVector3InBounds( Point - vCenter, vExtents ) ? CONTAINS : DISJOINT; -} - - -//----------------------------------------------------------------------------- -// Triangle in axis-aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType XM_CALLCONV BoundingBox::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const -{ - if ( !Intersects(V0,V1,V2) ) - return DISJOINT; - - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vExtents = XMLoadFloat3( &Extents ); - - XMVECTOR d = XMVectorAbs( V0 - vCenter ); - XMVECTOR Inside = XMVectorLessOrEqual( d, vExtents ); - - d = XMVectorAbs( V1 - vCenter ); - Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); - - d = XMVectorAbs( V2 - vCenter ); - Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); - - return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Sphere in axis-aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingBox::Contains( const BoundingSphere& sh ) const -{ - XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center ); - XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius ); - - XMVECTOR BoxCenter = XMLoadFloat3( &Center ); - XMVECTOR BoxExtents = XMLoadFloat3( &Extents ); - - XMVECTOR BoxMin = BoxCenter - BoxExtents; - XMVECTOR BoxMax = BoxCenter + BoxExtents; - - // Find the distance to the nearest point on the box. 
- // for each i in (x, y, z) - // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 - // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 - - XMVECTOR d = XMVectorZero(); - - // Compute d for each dimension. - XMVECTOR LessThanMin = XMVectorLess( SphereCenter, BoxMin ); - XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxMax ); - - XMVECTOR MinDelta = SphereCenter - BoxMin; - XMVECTOR MaxDelta = SphereCenter - BoxMax; - - // Choose value for each dimension based on the comparison. - d = XMVectorSelect( d, MinDelta, LessThanMin ); - d = XMVectorSelect( d, MaxDelta, GreaterThanMax ); - - // Use a dot-product to square them and sum them together. - XMVECTOR d2 = XMVector3Dot( d, d ); - - if ( XMVector3Greater( d2, XMVectorMultiply( SphereRadius, SphereRadius ) ) ) - return DISJOINT; - - XMVECTOR InsideAll = XMVectorLessOrEqual( BoxMin + SphereRadius, SphereCenter ); - InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( SphereCenter, BoxMax - SphereRadius ) ); - InsideAll = XMVectorAndInt( InsideAll, XMVectorGreater( BoxMax - BoxMin, SphereRadius ) ); - - return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Axis-aligned box in axis-aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingBox::Contains( const BoundingBox& box ) const -{ - XMVECTOR CenterA = XMLoadFloat3( &Center ); - XMVECTOR ExtentsA = XMLoadFloat3( &Extents ); - - XMVECTOR CenterB = XMLoadFloat3( &box.Center ); - XMVECTOR ExtentsB = XMLoadFloat3( &box.Extents ); - - XMVECTOR MinA = CenterA - ExtentsA; - XMVECTOR MaxA = CenterA + ExtentsA; - - XMVECTOR MinB = CenterB - ExtentsB; - XMVECTOR MaxB = CenterB + ExtentsB; - - // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then return false - XMVECTOR Disjoint = XMVectorOrInt( XMVectorGreater( MinA, MaxB ), XMVectorGreater( MinB, MaxA ) ); - - if ( DirectX::Internal::XMVector3AnyTrue( Disjoint ) ) - return DISJOINT; - - // for each i in (x, y, z) if a_min(i) <= b_min(i) and b_max(i) <= a_max(i) then A contains B - XMVECTOR Inside = XMVectorAndInt( XMVectorLessOrEqual( MinA, MinB ), XMVectorLessOrEqual( MaxB, MaxA ) ); - - return DirectX::Internal::XMVector3AllTrue( Inside ) ? 
CONTAINS : INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Oriented box in axis-aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingBox::Contains( const BoundingOrientedBox& box ) const -{ - if ( !box.Intersects( *this ) ) - return DISJOINT; - - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vExtents = XMLoadFloat3( &Extents ); - - // Subtract off the AABB center to remove a subtract below - XMVECTOR oCenter = XMLoadFloat3( &box.Center ) - vCenter; - - XMVECTOR oExtents = XMLoadFloat3( &box.Extents ); - XMVECTOR oOrientation = XMLoadFloat4( &box.Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( oOrientation ) ); - - XMVECTOR Inside = XMVectorTrueInt(); - - for( size_t i=0; i < BoundingOrientedBox::CORNER_COUNT; ++i ) - { - XMVECTOR C = XMVector3Rotate( oExtents * g_BoxOffset[i], oOrientation ) + oCenter; - XMVECTOR d = XMVectorAbs(C); - Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); - } - - return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Frustum in axis-aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingBox::Contains( const BoundingFrustum& fr ) const -{ - if ( !fr.Intersects( *this ) ) - return DISJOINT; - - XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT]; - fr.GetCorners( Corners ); - - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vExtents = XMLoadFloat3( &Extents ); - - XMVECTOR Inside = XMVectorTrueInt(); - - for( size_t i=0; i < BoundingFrustum::CORNER_COUNT; ++i ) - { - XMVECTOR Point = XMLoadFloat3( &Corners[i] ); - XMVECTOR d = XMVectorAbs( Point - vCenter ); - Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); - } - - return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Sphere vs axis-aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline bool BoundingBox::Intersects( const BoundingSphere& sh ) const -{ - XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center ); - XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius ); - - XMVECTOR BoxCenter = XMLoadFloat3( &Center ); - XMVECTOR BoxExtents = XMLoadFloat3( &Extents ); - - XMVECTOR BoxMin = BoxCenter - BoxExtents; - XMVECTOR BoxMax = BoxCenter + BoxExtents; - - // Find the distance to the nearest point on the box. - // for each i in (x, y, z) - // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 - // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 - - XMVECTOR d = XMVectorZero(); - - // Compute d for each dimension. - XMVECTOR LessThanMin = XMVectorLess( SphereCenter, BoxMin ); - XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxMax ); - - XMVECTOR MinDelta = SphereCenter - BoxMin; - XMVECTOR MaxDelta = SphereCenter - BoxMax; - - // Choose value for each dimension based on the comparison. - d = XMVectorSelect( d, MinDelta, LessThanMin ); - d = XMVectorSelect( d, MaxDelta, GreaterThanMax ); - - // Use a dot-product to square them and sum them together. 
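-    // (d holds the per-axis amount by which the sphere center overshoots the
-    // box, so d . d is the squared distance from the center to the closest
-    // point on the box; this is the classic Arvo sphere/box test.)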
- XMVECTOR d2 = XMVector3Dot( d, d ); - - return XMVector3LessOrEqual( d2, XMVectorMultiply( SphereRadius, SphereRadius ) ); -} - - -//----------------------------------------------------------------------------- -// Axis-aligned box vs. axis-aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline bool BoundingBox::Intersects( const BoundingBox& box ) const -{ - XMVECTOR CenterA = XMLoadFloat3( &Center ); - XMVECTOR ExtentsA = XMLoadFloat3( &Extents ); - - XMVECTOR CenterB = XMLoadFloat3( &box.Center ); - XMVECTOR ExtentsB = XMLoadFloat3( &box.Extents ); - - XMVECTOR MinA = CenterA - ExtentsA; - XMVECTOR MaxA = CenterA + ExtentsA; - - XMVECTOR MinB = CenterB - ExtentsB; - XMVECTOR MaxB = CenterB + ExtentsB; - - // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then return false - XMVECTOR Disjoint = XMVectorOrInt( XMVectorGreater( MinA, MaxB ), XMVectorGreater( MinB, MaxA ) ); - - return !DirectX::Internal::XMVector3AnyTrue( Disjoint ); -} - - -//----------------------------------------------------------------------------- -// Oriented box vs. axis-aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline bool BoundingBox::Intersects( const BoundingOrientedBox& box ) const -{ - return box.Intersects( *this ); -} - - -//----------------------------------------------------------------------------- -// Frustum vs. axis-aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline bool BoundingBox::Intersects( const BoundingFrustum& fr ) const -{ - return fr.Intersects( *this ); -} - - -//----------------------------------------------------------------------------- -// Triangle vs. axis aligned box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline bool XM_CALLCONV BoundingBox::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const -{ - XMVECTOR Zero = XMVectorZero(); - - // Load the box. - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vExtents = XMLoadFloat3( &Extents ); - - XMVECTOR BoxMin = vCenter - vExtents; - XMVECTOR BoxMax = vCenter + vExtents; - - // Test the axes of the box (in effect test the AAB against the minimal AAB - // around the triangle). - XMVECTOR TriMin = XMVectorMin( XMVectorMin( V0, V1 ), V2 ); - XMVECTOR TriMax = XMVectorMax( XMVectorMax( V0, V1 ), V2 ); - - // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then disjoint - XMVECTOR Disjoint = XMVectorOrInt( XMVectorGreater( TriMin, BoxMax ), XMVectorGreater( BoxMin, TriMax ) ); - if( DirectX::Internal::XMVector3AnyTrue( Disjoint ) ) - return false; - - // Test the plane of the triangle. - XMVECTOR Normal = XMVector3Cross( V1 - V0, V2 - V0 ); - XMVECTOR Dist = XMVector3Dot( Normal, V0 ); - - // Assert that the triangle is not degenerate. 
-    assert( !XMVector3Equal( Normal, Zero ) );
-
-    // for each i in (x, y, z) if n(i) >= 0 then v_min(i)=b_min(i), v_max(i)=b_max(i)
-    // else v_min(i)=b_max(i), v_max(i)=b_min(i)
-    XMVECTOR NormalSelect = XMVectorGreater( Normal, Zero );
-    XMVECTOR V_Min = XMVectorSelect( BoxMax, BoxMin, NormalSelect );
-    XMVECTOR V_Max = XMVectorSelect( BoxMin, BoxMax, NormalSelect );
-
-    // if n dot v_min + d > 0 || n dot v_max + d < 0 then disjoint
-    XMVECTOR MinDist = XMVector3Dot( V_Min, Normal );
-    XMVECTOR MaxDist = XMVector3Dot( V_Max, Normal );
-
-    XMVECTOR NoIntersection = XMVectorGreater( MinDist, Dist );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( MaxDist, Dist ) );
-
-    // Move the box center to zero to simplify the following tests.
-    XMVECTOR TV0 = V0 - vCenter;
-    XMVECTOR TV1 = V1 - vCenter;
-    XMVECTOR TV2 = V2 - vCenter;
-
-    // Test the edge/edge axes (3*3).
-    XMVECTOR e0 = TV1 - TV0;
-    XMVECTOR e1 = TV2 - TV1;
-    XMVECTOR e2 = TV0 - TV2;
-
-    // Make w zero.
-    e0 = XMVectorInsert<0, 0, 0, 0, 1>( e0, Zero );
-    e1 = XMVectorInsert<0, 0, 0, 0, 1>( e1, Zero );
-    e2 = XMVectorInsert<0, 0, 0, 0, 1>( e2, Zero );
-
-    XMVECTOR Axis;
-    XMVECTOR p0, p1, p2;
-    XMVECTOR Min, Max;
-    XMVECTOR Radius;
-
-    // Axis == (1,0,0) x e0 = (0, -e0.z, e0.y)
-    Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( e0, -e0 );
-    p0 = XMVector3Dot( TV0, Axis );
-    // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
-    p2 = XMVector3Dot( TV2, Axis );
-    Min = XMVectorMin( p0, p2 );
-    Max = XMVectorMax( p0, p2 );
-    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
-
-    // Axis == (1,0,0) x e1 = (0, -e1.z, e1.y)
-    Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( e1, -e1 );
-    p0 = XMVector3Dot( TV0, Axis );
-    p1 = XMVector3Dot( TV1, Axis );
-    // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
-    Min = XMVectorMin( p0, p1 );
-    Max = XMVectorMax( p0, p1 );
-    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
-
-    // Axis == (1,0,0) x e2 = (0, -e2.z, e2.y)
-    Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( e2, -e2 );
-    p0 = XMVector3Dot( TV0, Axis );
-    p1 = XMVector3Dot( TV1, Axis );
-    // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
-    Min = XMVectorMin( p0, p1 );
-    Max = XMVectorMax( p0, p1 );
-    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
-
-    // Axis == (0,1,0) x e0 = (e0.z, 0, -e0.x)
-    Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( e0, -e0 );
-    p0 = XMVector3Dot( TV0, Axis );
-    // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
-    p2 = XMVector3Dot( TV2, Axis );
-    Min = XMVectorMin( p0, p2 );
-    Max = XMVectorMax( p0, p2 );
-    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
-
-    // Axis == (0,1,0) x e1 = (e1.z, 0, -e1.x)
-    Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( e1, -e1 );
-    p0 = XMVector3Dot( TV0, Axis );
-    p1 = XMVector3Dot( TV1, Axis );
-    // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
-    Min = XMVectorMin( p0, p1 );
-    Max = XMVectorMax( p0, p1 );
-    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
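-    // (Each of the nine edge cross-product axes follows the same pattern:
-    // project the triangle onto the axis, skipping the duplicate projection
-    // since two of p0, p1, p2 always coincide, and compare the interval
-    // [Min, Max] against the box projection Radius = Extents . |Axis|.)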
-    // Axis == (0,1,0) x e2 = (e2.z, 0, -e2.x)
-    Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( e2, -e2 );
-    p0 = XMVector3Dot( TV0, Axis );
-    p1 = XMVector3Dot( TV1, Axis );
-    // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
-    Min = XMVectorMin( p0, p1 );
-    Max = XMVectorMax( p0, p1 );
-    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
-
-    // Axis == (0,0,1) x e0 = (-e0.y, e0.x, 0)
-    Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( e0, -e0 );
-    p0 = XMVector3Dot( TV0, Axis );
-    // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
-    p2 = XMVector3Dot( TV2, Axis );
-    Min = XMVectorMin( p0, p2 );
-    Max = XMVectorMax( p0, p2 );
-    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
-
-    // Axis == (0,0,1) x e1 = (-e1.y, e1.x, 0)
-    Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( e1, -e1 );
-    p0 = XMVector3Dot( TV0, Axis );
-    p1 = XMVector3Dot( TV1, Axis );
-    // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
-    Min = XMVectorMin( p0, p1 );
-    Max = XMVectorMax( p0, p1 );
-    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
-
-    // Axis == (0,0,1) x e2 = (-e2.y, e2.x, 0)
-    Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( e2, -e2 );
-    p0 = XMVector3Dot( TV0, Axis );
-    p1 = XMVector3Dot( TV1, Axis );
-    // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
-    Min = XMVectorMin( p0, p1 );
-    Max = XMVectorMax( p0, p1 );
-    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
-
-    return XMVector4NotEqualInt( NoIntersection, XMVectorTrueInt() );
-}
-
-
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline PlaneIntersectionType XM_CALLCONV BoundingBox::Intersects( FXMVECTOR Plane ) const
-{
-    assert( DirectX::Internal::XMPlaneIsUnit( Plane ) );
-
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vExtents = XMLoadFloat3( &Extents );
-
-    // Set w of the center to one so we can dot4 with a plane.
-    vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
-
-    XMVECTOR Outside, Inside;
-    DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane, Outside, Inside );
-
-    // If the box is outside any plane it is outside.
-    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
-        return FRONT;
-
-    // If the box is inside all planes it is inside.
-    if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
-        return BACK;
-
-    // The box is not inside all planes or outside a plane, so it intersects.
-    return INTERSECTING;
-}
-
-
-//-----------------------------------------------------------------------------
-// Compute the intersection of a ray (Origin, Direction) with an axis aligned
-// box using the slabs method.
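-// For each coordinate axis the box bounds a slab; per component the ray
-// enters at t1 = ( c - e - o ) / d and leaves at t2 = ( c + e - o ) / d.
-// The ray hits the box iff the three [min,max] intervals overlap, i.e.
-// max( t_min ) <= min( t_max ) with min( t_max ) >= 0.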
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool XM_CALLCONV BoundingBox::Intersects( FXMVECTOR Origin, FXMVECTOR Direction, float& Dist ) const
-{
-    assert( DirectX::Internal::XMVector3IsUnit( Direction ) );
-
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vExtents = XMLoadFloat3( &Extents );
-
-    // Adjust ray origin to be relative to center of the box.
-    XMVECTOR TOrigin = vCenter - Origin;
-
-    // Compute the dot product against each axis of the box.
-    // Since the axes are (1,0,0), (0,1,0), (0,0,1) no computation is necessary.
-    XMVECTOR AxisDotOrigin = TOrigin;
-    XMVECTOR AxisDotDirection = Direction;
-
-    // if (fabs(AxisDotDirection) <= Epsilon) the ray is nearly parallel to the slab.
-    XMVECTOR IsParallel = XMVectorLessOrEqual( XMVectorAbs( AxisDotDirection ), g_RayEpsilon );
-
-    // Test against all three axes simultaneously.
-    XMVECTOR InverseAxisDotDirection = XMVectorReciprocal( AxisDotDirection );
-    XMVECTOR t1 = ( AxisDotOrigin - vExtents ) * InverseAxisDotDirection;
-    XMVECTOR t2 = ( AxisDotOrigin + vExtents ) * InverseAxisDotDirection;
-
-    // Compute the max of min(t1,t2) and the min of max(t1,t2) ensuring we don't
-    // use the results from any directions parallel to the slab.
-    XMVECTOR t_min = XMVectorSelect( XMVectorMin( t1, t2 ), g_FltMin, IsParallel );
-    XMVECTOR t_max = XMVectorSelect( XMVectorMax( t1, t2 ), g_FltMax, IsParallel );
-
-    // t_min.x = maximum( t_min.x, t_min.y, t_min.z );
-    // t_max.x = minimum( t_max.x, t_max.y, t_max.z );
-    t_min = XMVectorMax( t_min, XMVectorSplatY( t_min ) );  // x = max(x,y)
-    t_min = XMVectorMax( t_min, XMVectorSplatZ( t_min ) );  // x = max(max(x,y),z)
-    t_max = XMVectorMin( t_max, XMVectorSplatY( t_max ) );  // x = min(x,y)
-    t_max = XMVectorMin( t_max, XMVectorSplatZ( t_max ) );  // x = min(min(x,y),z)
-
-    // if ( t_min > t_max ) return false;
-    XMVECTOR NoIntersection = XMVectorGreater( XMVectorSplatX( t_min ), XMVectorSplatX( t_max ) );
-
-    // if ( t_max < 0.0f ) return false;
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( XMVectorSplatX( t_max ), XMVectorZero() ) );
-
-    // if (IsParallel && (-Extents > AxisDotOrigin || Extents < AxisDotOrigin)) return false;
-    XMVECTOR ParallelOverlap = XMVectorInBounds( AxisDotOrigin, vExtents );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorAndCInt( IsParallel, ParallelOverlap ) );
-
-    if( !DirectX::Internal::XMVector3AnyTrue( NoIntersection ) )
-    {
-        // Store the x-component to Dist
-        XMStoreFloat( &Dist, t_min );
-        return true;
-    }
-
-    Dist = 0.f;
-    return false;
-}
-
-
-//-----------------------------------------------------------------------------
-// Test an axis aligned box vs 6 planes (typically forming a frustum).
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline ContainmentType XM_CALLCONV BoundingBox::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2,
-                                                             GXMVECTOR Plane3, HXMVECTOR Plane4, HXMVECTOR Plane5 ) const
-{
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vExtents = XMLoadFloat3( &Extents );
-
-    // Set w of the center to one so we can dot4 with a plane.
-    vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
-
-    XMVECTOR Outside, Inside;
-
-    // Test against each plane.
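-    // (The six planes are assumed to have outward-facing normals, so a dot4
-    // distance greater than the box "radius" places the box fully in the
-    // outside half-space of that plane.)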
- DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane0, Outside, Inside ); - - XMVECTOR AnyOutside = Outside; - XMVECTOR AllInside = Inside; - - DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane1, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane2, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane3, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane4, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane5, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - // If the box is outside any plane it is outside. - if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) ) - return DISJOINT; - - // If the box is inside all planes it is inside. - if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) ) - return CONTAINS; - - // The box is not inside all planes or outside a plane, it may intersect. - return INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Create axis-aligned box that contains two other bounding boxes -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline void BoundingBox::CreateMerged( BoundingBox& Out, const BoundingBox& b1, const BoundingBox& b2 ) -{ - XMVECTOR b1Center = XMLoadFloat3( &b1.Center ); - XMVECTOR b1Extents = XMLoadFloat3( &b1.Extents ); - - XMVECTOR b2Center = XMLoadFloat3( &b2.Center ); - XMVECTOR b2Extents = XMLoadFloat3( &b2.Extents ); - - XMVECTOR Min = XMVectorSubtract( b1Center, b1Extents ); - Min = XMVectorMin( Min, XMVectorSubtract( b2Center, b2Extents ) ); - - XMVECTOR Max = XMVectorAdd( b1Center, b1Extents ); - Max = XMVectorMax( Max, XMVectorAdd( b2Center, b2Extents ) ); - - assert( XMVector3LessOrEqual( Min, Max ) ); - - XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f ); - XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f ); -} - - -//----------------------------------------------------------------------------- -// Create axis-aligned box that contains a bounding sphere -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline void BoundingBox::CreateFromSphere( BoundingBox& Out, const BoundingSphere& sh ) -{ - XMVECTOR spCenter = XMLoadFloat3( &sh.Center ); - XMVECTOR shRadius = XMVectorReplicatePtr( &sh.Radius ); - - XMVECTOR Min = XMVectorSubtract( spCenter, shRadius ); - XMVECTOR Max = XMVectorAdd( spCenter, shRadius ); - - assert( XMVector3LessOrEqual( Min, Max ) ); - - XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f ); - XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f ); -} - - -//----------------------------------------------------------------------------- -// Create axis-aligned box from min/max points -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline void XM_CALLCONV 
BoundingBox::CreateFromPoints( BoundingBox& Out, FXMVECTOR pt1, FXMVECTOR pt2 )
-{
-    XMVECTOR Min = XMVectorMin( pt1, pt2 );
-    XMVECTOR Max = XMVectorMax( pt1, pt2 );
-
-    // Store center and extents.
-    XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f );
-    XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f );
-}
-
-
-//-----------------------------------------------------------------------------
-// Find the minimum axis aligned bounding box containing a set of points.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void BoundingBox::CreateFromPoints( BoundingBox& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride )
-{
-    assert( Count > 0 );
-    assert( pPoints );
-
-    // Find the minimum and maximum x, y, and z
-    XMVECTOR vMin, vMax;
-
-    vMin = vMax = XMLoadFloat3( pPoints );
-
-    for( size_t i = 1; i < Count; ++i )
-    {
-        XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) );
-
-        vMin = XMVectorMin( vMin, Point );
-        vMax = XMVectorMax( vMax, Point );
-    }
-
-    // Store center and extents.
-    XMStoreFloat3( &Out.Center, ( vMin + vMax ) * 0.5f );
-    XMStoreFloat3( &Out.Extents, ( vMax - vMin ) * 0.5f );
-}
-
-
-/****************************************************************************
- *
- * BoundingOrientedBox
- *
- ****************************************************************************/
-
-//-----------------------------------------------------------------------------
-// Transform an oriented box by an angle preserving transform.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV BoundingOrientedBox::Transform( BoundingOrientedBox& Out, FXMMATRIX M ) const
-{
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vExtents = XMLoadFloat3( &Extents );
-    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
-
-    // Composite the box rotation and the transform rotation.
-    XMMATRIX nM;
-    nM.r[0] = XMVector3Normalize( M.r[0] );
-    nM.r[1] = XMVector3Normalize( M.r[1] );
-    nM.r[2] = XMVector3Normalize( M.r[2] );
-    nM.r[3] = g_XMIdentityR3;
-    XMVECTOR Rotation = XMQuaternionRotationMatrix( nM );
-    vOrientation = XMQuaternionMultiply( vOrientation, Rotation );
-
-    // Transform the center.
-    vCenter = XMVector3Transform( vCenter, M );
-
-    // Scale the box extents.
-    XMVECTOR dX = XMVector3Length( M.r[0] );
-    XMVECTOR dY = XMVector3Length( M.r[1] );
-    XMVECTOR dZ = XMVector3Length( M.r[2] );
-
-    XMVECTOR VectorScale = XMVectorSelect( dY, dX, g_XMSelect1000 );
-    VectorScale = XMVectorSelect( dZ, VectorScale, g_XMSelect1100 );
-    vExtents = vExtents * VectorScale;
-
-    // Store the box.
-    XMStoreFloat3( &Out.Center, vCenter );
-    XMStoreFloat3( &Out.Extents, vExtents );
-    XMStoreFloat4( &Out.Orientation, vOrientation );
-}
-
-_Use_decl_annotations_
-inline void XM_CALLCONV BoundingOrientedBox::Transform( BoundingOrientedBox& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const
-{
-    assert( DirectX::Internal::XMQuaternionIsUnit( Rotation ) );
-
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vExtents = XMLoadFloat3( &Extents );
-    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
-
-    // Composite the box rotation and the transform rotation.
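-    // (XMQuaternionMultiply( Q1, Q2 ) composes "rotate by Q1, then by Q2", so
-    // the box's local orientation is applied before the transform's rotation.)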
- vOrientation = XMQuaternionMultiply( vOrientation, Rotation ); - - // Transform the center. - XMVECTOR VectorScale = XMVectorReplicate( Scale ); - vCenter = XMVector3Rotate( vCenter * VectorScale, Rotation ) + Translation; - - // Scale the box extents. - vExtents = vExtents * VectorScale; - - // Store the box. - XMStoreFloat3( &Out.Center, vCenter ); - XMStoreFloat3( &Out.Extents, vExtents ); - XMStoreFloat4( &Out.Orientation, vOrientation ); -} - - -//----------------------------------------------------------------------------- -// Get the corner points of the box -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline void BoundingOrientedBox::GetCorners( XMFLOAT3* Corners ) const -{ - assert( Corners != 0 ); - - // Load the box - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vExtents = XMLoadFloat3( &Extents ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); - - for( size_t i = 0; i < CORNER_COUNT; ++i ) - { - XMVECTOR C = XMVector3Rotate( vExtents * g_BoxOffset[i], vOrientation ) + vCenter; - XMStoreFloat3( &Corners[i], C ); - } -} - - -//----------------------------------------------------------------------------- -// Point in oriented box test. -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType XM_CALLCONV BoundingOrientedBox::Contains( FXMVECTOR Point ) const -{ - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vExtents = XMLoadFloat3( &Extents ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - // Transform the point to be local to the box. - XMVECTOR TPoint = XMVector3InverseRotate( Point - vCenter, vOrientation ); - - return XMVector3InBounds( TPoint, vExtents ) ? CONTAINS : DISJOINT; -} - - -//----------------------------------------------------------------------------- -// Triangle in oriented bounding box -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType XM_CALLCONV BoundingOrientedBox::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const -{ - // Load the box center & orientation. - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - // Transform the triangle vertices into the space of the box. - XMVECTOR TV0 = XMVector3InverseRotate( V0 - vCenter, vOrientation ); - XMVECTOR TV1 = XMVector3InverseRotate( V1 - vCenter, vOrientation ); - XMVECTOR TV2 = XMVector3InverseRotate( V2 - vCenter, vOrientation ); - - BoundingBox box; - box.Center = XMFLOAT3( 0.0f, 0.0f, 0.0f ); - box.Extents = Extents; - - // Use the triangle vs axis aligned box intersection routine. 
- return box.Contains( TV0, TV1, TV2 ); -} - - -//----------------------------------------------------------------------------- -// Sphere in oriented bounding box -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingOrientedBox::Contains( const BoundingSphere& sh ) const -{ - XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center ); - XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius ); - - XMVECTOR BoxCenter = XMLoadFloat3( &Center ); - XMVECTOR BoxExtents = XMLoadFloat3( &Extents ); - XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) ); - - // Transform the center of the sphere to be local to the box. - // BoxMin = -BoxExtents - // BoxMax = +BoxExtents - SphereCenter = XMVector3InverseRotate( SphereCenter - BoxCenter, BoxOrientation ); - - // Find the distance to the nearest point on the box. - // for each i in (x, y, z) - // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 - // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 - - XMVECTOR d = XMVectorZero(); - - // Compute d for each dimension. - XMVECTOR LessThanMin = XMVectorLess( SphereCenter, -BoxExtents ); - XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxExtents ); - - XMVECTOR MinDelta = SphereCenter + BoxExtents; - XMVECTOR MaxDelta = SphereCenter - BoxExtents; - - // Choose value for each dimension based on the comparison. - d = XMVectorSelect( d, MinDelta, LessThanMin ); - d = XMVectorSelect( d, MaxDelta, GreaterThanMax ); - - // Use a dot-product to square them and sum them together. - XMVECTOR d2 = XMVector3Dot( d, d ); - XMVECTOR SphereRadiusSq = XMVectorMultiply( SphereRadius, SphereRadius ); - - if ( XMVector4Greater( d2, SphereRadiusSq ) ) - return DISJOINT; - - // See if we are completely inside the box - XMVECTOR SMin = SphereCenter - SphereRadius; - XMVECTOR SMax = SphereCenter + SphereRadius; - - return ( XMVector3InBounds( SMin, BoxExtents ) && XMVector3InBounds( SMax, BoxExtents ) ) ? CONTAINS : INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Axis aligned box vs. oriented box. Constructs an oriented box and uses -// the oriented box vs. oriented box test. -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingOrientedBox::Contains( const BoundingBox& box ) const -{ - // Make the axis aligned box oriented and do an OBB vs OBB test. 
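The select-based distance computation in the sphere test above is the vectorized form of the standard per-axis clamp. As a standalone scalar sketch of the same squared-distance test (the function name and layout are illustrative, not part of this header):

    #include <algorithm>
    #include <cmath>

    // Squared distance from point p to an axis-aligned box centered at the
    // origin with half-extents ext. The SIMD code above accumulates the same
    // per-axis overshoot using two compares and two selects per bound.
    inline float DistSqToCenteredBox( const float p[3], const float ext[3] )
    {
        float d2 = 0.f;
        for ( int i = 0; i < 3; ++i )
        {
            float over = std::max( std::fabs( p[i] ) - ext[i], 0.f );
            d2 += over * over;
        }
        return d2; // the sphere and box intersect iff d2 <= radius * radius
    }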
- BoundingOrientedBox obox( box.Center, box.Extents, XMFLOAT4( 0.f, 0.f, 0.f, 1.f ) ); - return Contains( obox ); -} - - -//----------------------------------------------------------------------------- -// Oriented bounding box in oriented bounding box -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingOrientedBox::Contains( const BoundingOrientedBox& box ) const -{ - if ( !Intersects(box) ) - return DISJOINT; - - // Load the boxes - XMVECTOR aCenter = XMLoadFloat3( &Center ); - XMVECTOR aExtents = XMLoadFloat3( &Extents ); - XMVECTOR aOrientation = XMLoadFloat4( &Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( aOrientation ) ); - - XMVECTOR bCenter = XMLoadFloat3( &box.Center ); - XMVECTOR bExtents = XMLoadFloat3( &box.Extents ); - XMVECTOR bOrientation = XMLoadFloat4( &box.Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( bOrientation ) ); - - XMVECTOR offset = bCenter - aCenter; - - for( size_t i = 0; i < CORNER_COUNT; ++i ) - { - // Cb = rotate( bExtents * corneroffset[i], bOrientation ) + bcenter - // Ca = invrotate( Cb - aCenter, aOrientation ) - - XMVECTOR C = XMVector3Rotate( bExtents * g_BoxOffset[i], bOrientation ) + offset; - C = XMVector3InverseRotate( C , aOrientation ); - - if ( !XMVector3InBounds( C, aExtents ) ) - return INTERSECTS; - } - - return CONTAINS; -} - - -//----------------------------------------------------------------------------- -// Frustum in oriented bounding box -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingOrientedBox::Contains( const BoundingFrustum& fr ) const -{ - if ( !fr.Intersects(*this) ) - return DISJOINT; - - XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT]; - fr.GetCorners( Corners ); - - // Load the box - XMVECTOR vCenter = XMLoadFloat3( &Center ); - XMVECTOR vExtents = XMLoadFloat3( &Extents ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); - - for( size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i ) - { - XMVECTOR C = XMVector3InverseRotate( XMLoadFloat3( &Corners[i] ) - vCenter, vOrientation ); - - if ( !XMVector3InBounds( C, vExtents ) ) - return INTERSECTS; - } - - return CONTAINS; -} - - -//----------------------------------------------------------------------------- -// Sphere vs. oriented box test -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline bool BoundingOrientedBox::Intersects( const BoundingSphere& sh ) const -{ - XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center ); - XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius ); - - XMVECTOR BoxCenter = XMLoadFloat3( &Center ); - XMVECTOR BoxExtents = XMLoadFloat3( &Extents ); - XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) ); - - // Transform the center of the sphere to be local to the box. - // BoxMin = -BoxExtents - // BoxMax = +BoxExtents - SphereCenter = XMVector3InverseRotate( SphereCenter - BoxCenter, BoxOrientation ); - - // Find the distance to the nearest point on the box. - // for each i in (x, y, z) - // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 - // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 - - XMVECTOR d = XMVectorZero(); - - // Compute d for each dimension. 
-    XMVECTOR LessThanMin = XMVectorLess( SphereCenter, -BoxExtents );
-    XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxExtents );
-
-    XMVECTOR MinDelta = SphereCenter + BoxExtents;
-    XMVECTOR MaxDelta = SphereCenter - BoxExtents;
-
-    // Choose value for each dimension based on the comparison.
-    d = XMVectorSelect( d, MinDelta, LessThanMin );
-    d = XMVectorSelect( d, MaxDelta, GreaterThanMax );
-
-    // Use a dot-product to square them and sum them together.
-    XMVECTOR d2 = XMVector3Dot( d, d );
-
-    return XMVector4LessOrEqual( d2, XMVectorMultiply( SphereRadius, SphereRadius ) ) ? true : false;
-}
-
-
-//-----------------------------------------------------------------------------
-// Axis aligned box vs. oriented box. Constructs an oriented box and uses
-// the oriented box vs. oriented box test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool BoundingOrientedBox::Intersects( const BoundingBox& box ) const
-{
-    // Make the axis aligned box oriented and do an OBB vs OBB test.
-    BoundingOrientedBox obox( box.Center, box.Extents, XMFLOAT4( 0.f, 0.f, 0.f, 1.f ) );
-    return Intersects( obox );
-}
-
-
-//-----------------------------------------------------------------------------
-// Fast oriented box / oriented box intersection test using the separating axis
-// theorem.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool BoundingOrientedBox::Intersects( const BoundingOrientedBox& box ) const
-{
-    // Build the 3x3 rotation matrix that defines the orientation of B relative to A.
-    XMVECTOR A_quat = XMLoadFloat4( &Orientation );
-    XMVECTOR B_quat = XMLoadFloat4( &box.Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( A_quat ) );
-    assert( DirectX::Internal::XMQuaternionIsUnit( B_quat ) );
-
-    XMVECTOR Q = XMQuaternionMultiply( A_quat, XMQuaternionConjugate( B_quat ) );
-    XMMATRIX R = XMMatrixRotationQuaternion( Q );
-
-    // Compute the translation of B relative to A.
-    XMVECTOR A_cent = XMLoadFloat3( &Center );
-    XMVECTOR B_cent = XMLoadFloat3( &box.Center );
-    XMVECTOR t = XMVector3InverseRotate( B_cent - A_cent, A_quat );
-
-    //
-    // h(A) = extents of A.
-    // h(B) = extents of B.
-    //
-    // a(u) = axes of A = (1,0,0), (0,1,0), (0,0,1)
-    // b(u) = axes of B relative to A = (r00,r10,r20), (r01,r11,r21), (r02,r12,r22)
-    //
-    // For each possible separating axis l:
-    //   d(A) = sum (for i = u,v,w) h(A)(i) * abs( a(i) dot l )
-    //   d(B) = sum (for i = u,v,w) h(B)(i) * abs( b(i) dot l )
-    //   if abs( t dot l ) > d(A) + d(B) then disjoint
-    //
-
-    // Load extents of A and B.
-    XMVECTOR h_A = XMLoadFloat3( &Extents );
-    XMVECTOR h_B = XMLoadFloat3( &box.Extents );
-
-    // Rows. Note R[0,1,2]X.w = 0.
-    XMVECTOR R0X = R.r[0];
-    XMVECTOR R1X = R.r[1];
-    XMVECTOR R2X = R.r[2];
-
-    R = XMMatrixTranspose( R );
-
-    // Columns. Note RX[0,1,2].w = 0.
-    XMVECTOR RX0 = R.r[0];
-    XMVECTOR RX1 = R.r[1];
-    XMVECTOR RX2 = R.r[2];
-
-    // Absolute value of rows.
-    XMVECTOR AR0X = XMVectorAbs( R0X );
-    XMVECTOR AR1X = XMVectorAbs( R1X );
-    XMVECTOR AR2X = XMVectorAbs( R2X );
-
-    // Absolute value of columns.
-    XMVECTOR ARX0 = XMVectorAbs( RX0 );
-    XMVECTOR ARX1 = XMVectorAbs( RX1 );
-    XMVECTOR ARX2 = XMVectorAbs( RX2 );
-
-    // Test each of the 15 possible separating axes.
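Each of the fifteen vectorized blocks that follow instantiates the same scalar comparison for one specific candidate axis l. A standalone sketch of that comparison (names illustrative, not part of the library):

    #include <cmath>

    // One separating-axis test: the boxes are disjoint on axis l when the
    // projected distance between centers exceeds the sum of the projected
    // half-widths d_A and d_B derived in the comments above.
    inline bool SeparatedOnAxis( const float t[3], const float l[3],
                                 float d_A, float d_B )
    {
        float dist = std::fabs( t[0] * l[0] + t[1] * l[1] + t[2] * l[2] );
        return dist > d_A + d_B;
    }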
-    XMVECTOR d, d_A, d_B;
-
-    // l = a(u) = (1, 0, 0)
-    // t dot l = t.x
-    // d(A) = h(A).x
-    // d(B) = h(B) dot abs(r00, r01, r02)
-    d = XMVectorSplatX( t );
-    d_A = XMVectorSplatX( h_A );
-    d_B = XMVector3Dot( h_B, AR0X );
-    XMVECTOR NoIntersection = XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) );
-
-    // l = a(v) = (0, 1, 0)
-    // t dot l = t.y
-    // d(A) = h(A).y
-    // d(B) = h(B) dot abs(r10, r11, r12)
-    d = XMVectorSplatY( t );
-    d_A = XMVectorSplatY( h_A );
-    d_B = XMVector3Dot( h_B, AR1X );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = a(w) = (0, 0, 1)
-    // t dot l = t.z
-    // d(A) = h(A).z
-    // d(B) = h(B) dot abs(r20, r21, r22)
-    d = XMVectorSplatZ( t );
-    d_A = XMVectorSplatZ( h_A );
-    d_B = XMVector3Dot( h_B, AR2X );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = b(u) = (r00, r10, r20)
-    // d(A) = h(A) dot abs(r00, r10, r20)
-    // d(B) = h(B).x
-    d = XMVector3Dot( t, RX0 );
-    d_A = XMVector3Dot( h_A, ARX0 );
-    d_B = XMVectorSplatX( h_B );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = b(v) = (r01, r11, r21)
-    // d(A) = h(A) dot abs(r01, r11, r21)
-    // d(B) = h(B).y
-    d = XMVector3Dot( t, RX1 );
-    d_A = XMVector3Dot( h_A, ARX1 );
-    d_B = XMVectorSplatY( h_B );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = b(w) = (r02, r12, r22)
-    // d(A) = h(A) dot abs(r02, r12, r22)
-    // d(B) = h(B).z
-    d = XMVector3Dot( t, RX2 );
-    d_A = XMVector3Dot( h_A, ARX2 );
-    d_B = XMVectorSplatZ( h_B );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = a(u) x b(u) = (0, -r20, r10)
-    // d(A) = h(A) dot abs(0, r20, r10)
-    // d(B) = h(B) dot abs(0, r02, r01)
-    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( RX0, -RX0 ) );
-    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( ARX0 ) );
-    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( AR0X ) );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = a(u) x b(v) = (0, -r21, r11)
-    // d(A) = h(A) dot abs(0, r21, r11)
-    // d(B) = h(B) dot abs(r02, 0, r00)
-    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( RX1, -RX1 ) );
-    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( ARX1 ) );
-    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( AR0X ) );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = a(u) x b(w) = (0, -r22, r12)
-    // d(A) = h(A) dot abs(0, r22, r12)
-    // d(B) = h(B) dot abs(r01, r00, 0)
-    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( RX2, -RX2 ) );
-    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( ARX2 ) );
-    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( AR0X ) );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = a(v) x b(u) = (r20, 0, -r00)
-    // d(A) = h(A) dot abs(r20, 0, r00)
-    // d(B) = h(B) dot abs(0, r12, r11)
-    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( RX0, -RX0 ) );
-    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( ARX0 ) );
-    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( AR1X ) );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = a(v) x b(v) = (r21, 0, -r01)
-    // d(A) = h(A) dot abs(r21, 0, r01)
-    // d(B) = h(B) dot abs(r12, 0, r10)
-    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( RX1, -RX1 ) );
-    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( ARX1 ) );
-    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( AR1X ) );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = a(v) x b(w) = (r22, 0, -r02)
-    // d(A) = h(A) dot abs(r22, 0, r02)
-    // d(B) = h(B) dot abs(r11, r10, 0)
-    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( RX2, -RX2 ) );
-    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( ARX2 ) );
-    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( AR1X ) );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = a(w) x b(u) = (-r10, r00, 0)
-    // d(A) = h(A) dot abs(r10, r00, 0)
-    // d(B) = h(B) dot abs(0, r22, r21)
-    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( RX0, -RX0 ) );
-    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( ARX0 ) );
-    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( AR2X ) );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = a(w) x b(v) = (-r11, r01, 0)
-    // d(A) = h(A) dot abs(r11, r01, 0)
-    // d(B) = h(B) dot abs(r22, 0, r20)
-    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( RX1, -RX1 ) );
-    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( ARX1 ) );
-    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( AR2X ) );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // l = a(w) x b(w) = (-r12, r02, 0)
-    // d(A) = h(A) dot abs(r12, r02, 0)
-    // d(B) = h(B) dot abs(r21, r20, 0)
-    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( RX2, -RX2 ) );
-    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( ARX2 ) );
-    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( AR2X ) );
-    NoIntersection = XMVectorOrInt( NoIntersection,
-                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
-
-    // No separating axis found; the boxes must intersect.
-    return XMVector4NotEqualInt( NoIntersection, XMVectorTrueInt() ) ? true : false;
-}
-
-
-//-----------------------------------------------------------------------------
-// Frustum vs. oriented box test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool BoundingOrientedBox::Intersects( const BoundingFrustum& fr ) const
-{
-    return fr.Intersects( *this );
-}
-
-
-//-----------------------------------------------------------------------------
-// Triangle vs. oriented box test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool XM_CALLCONV BoundingOrientedBox::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const
-{
-    // Load the box center & orientation.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
-
-    // Transform the triangle vertices into the space of the box.
-    XMVECTOR TV0 = XMVector3InverseRotate( V0 - vCenter, vOrientation );
-    XMVECTOR TV1 = XMVector3InverseRotate( V1 - vCenter, vOrientation );
-    XMVECTOR TV2 = XMVector3InverseRotate( V2 - vCenter, vOrientation );
-
-    BoundingBox box;
-    box.Center = XMFLOAT3( 0.0f, 0.0f, 0.0f );
-    box.Extents = Extents;
-
-    // Use the triangle vs axis aligned box intersection routine.
-    return box.Intersects( TV0, TV1, TV2 );
-}
-
-
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline PlaneIntersectionType XM_CALLCONV BoundingOrientedBox::Intersects( FXMVECTOR Plane ) const
-{
-    assert( DirectX::Internal::XMPlaneIsUnit( Plane ) );
-
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vExtents = XMLoadFloat3( &Extents );
-    XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) );
-
-    // Set w of the center to one so we can dot4 with a plane.
-    vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
-
-    // Build the 3x3 rotation matrix that defines the box axes.
-    XMMATRIX R = XMMatrixRotationQuaternion( BoxOrientation );
-
-    XMVECTOR Outside, Inside;
-    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane, Outside, Inside );
-
-    // If the box is entirely in front of the plane.
-    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
-        return FRONT;
-
-    // If the box is entirely behind the plane.
-    if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
-        return BACK;
-
-    // The box straddles the plane, so it intersects.
-    return INTERSECTING;
-}
-
-
-//-----------------------------------------------------------------------------
-// Compute the intersection of a ray (Origin, Direction) with an oriented box
-// using the slabs method.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool XM_CALLCONV BoundingOrientedBox::Intersects( FXMVECTOR Origin, FXMVECTOR Direction, float& Dist ) const
-{
-    assert( DirectX::Internal::XMVector3IsUnit( Direction ) );
-
-    static const XMVECTORU32 SelectY =
-    {
-        XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0
-    };
-    static const XMVECTORU32 SelectZ =
-    {
-        XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0
-    };
-
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vExtents = XMLoadFloat3( &Extents );
-    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
-
-    // Get the box's normalized side directions.
-    XMMATRIX R = XMMatrixRotationQuaternion( vOrientation );
-
-    // Adjust ray origin to be relative to center of the box.
-    XMVECTOR TOrigin = vCenter - Origin;
-
-    // Compute the dot product against each axis of the box.
-    XMVECTOR AxisDotOrigin = XMVector3Dot( R.r[0], TOrigin );
-    AxisDotOrigin = XMVectorSelect( AxisDotOrigin, XMVector3Dot( R.r[1], TOrigin ), SelectY );
-    AxisDotOrigin = XMVectorSelect( AxisDotOrigin, XMVector3Dot( R.r[2], TOrigin ), SelectZ );
-
-    XMVECTOR AxisDotDirection = XMVector3Dot( R.r[0], Direction );
-    AxisDotDirection = XMVectorSelect( AxisDotDirection, XMVector3Dot( R.r[1], Direction ), SelectY );
-    AxisDotDirection = XMVectorSelect( AxisDotDirection, XMVector3Dot( R.r[2], Direction ), SelectZ );
-
-    // if (fabs(AxisDotDirection) <= Epsilon) the ray is nearly parallel to the slab.
-    XMVECTOR IsParallel = XMVectorLessOrEqual( XMVectorAbs( AxisDotDirection ), g_RayEpsilon );
-
-    // Test against all three axes simultaneously.
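The vector code that follows evaluates the classic slab intervals for all three axes at once, substituting +/-FLT_MAX for slabs the ray is parallel to. A single slab in scalar form (standalone sketch; the epsilon plays the role of g_RayEpsilon, and the caller seeds tmin/tmax with -FLT_MAX/+FLT_MAX):

    #include <algorithm>
    #include <cmath>

    // One slab of the ray/box test: co is (box center - ray origin) projected
    // onto the slab axis, cd is the ray direction projected onto that axis,
    // and ext is the box half-width. Narrows [tmin, tmax] or rejects.
    inline bool ClipSlab( float co, float cd, float ext, float& tmin, float& tmax )
    {
        if ( std::fabs( cd ) <= 1e-20f )     // ray parallel to the slab:
            return std::fabs( co ) <= ext;   // origin must lie inside it
        float inv = 1.f / cd;
        float t1 = ( co - ext ) * inv;       // entry/exit distances
        float t2 = ( co + ext ) * inv;
        tmin = std::max( tmin, std::min( t1, t2 ) );
        tmax = std::min( tmax, std::max( t1, t2 ) );
        return tmin <= tmax;
    }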
-    XMVECTOR InverseAxisDotDirection = XMVectorReciprocal( AxisDotDirection );
-    XMVECTOR t1 = ( AxisDotOrigin - vExtents ) * InverseAxisDotDirection;
-    XMVECTOR t2 = ( AxisDotOrigin + vExtents ) * InverseAxisDotDirection;
-
-    // Compute the max of min(t1,t2) and the min of max(t1,t2) ensuring we don't
-    // use the results from any directions parallel to the slab.
-    XMVECTOR t_min = XMVectorSelect( XMVectorMin( t1, t2 ), g_FltMin, IsParallel );
-    XMVECTOR t_max = XMVectorSelect( XMVectorMax( t1, t2 ), g_FltMax, IsParallel );
-
-    // t_min.x = maximum( t_min.x, t_min.y, t_min.z );
-    // t_max.x = minimum( t_max.x, t_max.y, t_max.z );
-    t_min = XMVectorMax( t_min, XMVectorSplatY( t_min ) );  // x = max(x,y)
-    t_min = XMVectorMax( t_min, XMVectorSplatZ( t_min ) );  // x = max(max(x,y),z)
-    t_max = XMVectorMin( t_max, XMVectorSplatY( t_max ) );  // x = min(x,y)
-    t_max = XMVectorMin( t_max, XMVectorSplatZ( t_max ) );  // x = min(min(x,y),z)
-
-    // if ( t_min > t_max ) return false;
-    XMVECTOR NoIntersection = XMVectorGreater( XMVectorSplatX( t_min ), XMVectorSplatX( t_max ) );
-
-    // if ( t_max < 0.0f ) return false;
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( XMVectorSplatX( t_max ), XMVectorZero() ) );
-
-    // if (IsParallel && (-Extents > AxisDotOrigin || Extents < AxisDotOrigin)) return false;
-    XMVECTOR ParallelOverlap = XMVectorInBounds( AxisDotOrigin, vExtents );
-    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorAndCInt( IsParallel, ParallelOverlap ) );
-
-    if( !DirectX::Internal::XMVector3AnyTrue( NoIntersection ) )
-    {
-        // Store the x-component (the nearest hit distance) to Dist.
-        XMStoreFloat( &Dist, t_min );
-        return true;
-    }
-
-    Dist = 0.f;
-    return false;
-}
-
-
-//-----------------------------------------------------------------------------
-// Test an oriented box vs 6 planes (typically forming a frustum).
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline ContainmentType XM_CALLCONV BoundingOrientedBox::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2,
-                                                                     GXMVECTOR Plane3, HXMVECTOR Plane4, HXMVECTOR Plane5 ) const
-{
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3( &Center );
-    XMVECTOR vExtents = XMLoadFloat3( &Extents );
-    XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) );
-
-    // Set w of the center to one so we can dot4 with a plane.
-    vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
-
-    // Build the 3x3 rotation matrix that defines the box axes.
-    XMMATRIX R = XMMatrixRotationQuaternion( BoxOrientation );
-
-    XMVECTOR Outside, Inside;
-
-    // Test against each plane.
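-    // (Each FastIntersectOrientedBoxPlane call below computes Outside/Inside
-    // masks for one plane; OR-ing the Outside masks and AND-ing the Inside
-    // masks across all six planes yields the aggregate DISJOINT / CONTAINS /
-    // INTERSECTS decision at the end.)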
-    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane0, Outside, Inside );
-
-    XMVECTOR AnyOutside = Outside;
-    XMVECTOR AllInside = Inside;
-
-    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane1, Outside, Inside );
-    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
-    AllInside = XMVectorAndInt( AllInside, Inside );
-
-    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane2, Outside, Inside );
-    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
-    AllInside = XMVectorAndInt( AllInside, Inside );
-
-    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane3, Outside, Inside );
-    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
-    AllInside = XMVectorAndInt( AllInside, Inside );
-
-    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane4, Outside, Inside );
-    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
-    AllInside = XMVectorAndInt( AllInside, Inside );
-
-    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane5, Outside, Inside );
-    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
-    AllInside = XMVectorAndInt( AllInside, Inside );
-
-    // If the box is outside any plane it is outside.
-    if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) )
-        return DISJOINT;
-
-    // If the box is inside all planes it is inside.
-    if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) )
-        return CONTAINS;
-
-    // The box is not inside all planes nor outside any plane, so it may intersect.
-    return INTERSECTS;
-}
-
-
-//-----------------------------------------------------------------------------
-// Create oriented bounding box from axis-aligned bounding box
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void BoundingOrientedBox::CreateFromBoundingBox( BoundingOrientedBox& Out, const BoundingBox& box )
-{
-    Out.Center = box.Center;
-    Out.Extents = box.Extents;
-    Out.Orientation = XMFLOAT4( 0.f, 0.f, 0.f, 1.f );
-}
-
-
-//-----------------------------------------------------------------------------
-// Find the approximate minimum oriented bounding box containing a set of
-// points. Exact computation of the minimum oriented bounding box is possible
-// but is slower and requires a more complex algorithm.
-// The algorithm works by computing the inertia tensor of the points and then
-// using the eigenvectors of the inertia tensor as the axes of the box.
-// Computing the inertia tensor of the convex hull of the points will usually
-// result in a better bounding box, but the computation is more complex.
-// Exact computation of the minimum oriented bounding box is possible, but the
-// best known algorithm is O(N^3) and is significantly more complex to implement.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void BoundingOrientedBox::CreateFromPoints( BoundingOrientedBox& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride )
-{
-    assert( Count > 0 );
-    assert( pPoints != 0 );
-
-    XMVECTOR CenterOfMass = XMVectorZero();
-
-    // Compute the center of mass and inertia tensor of the points.
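-    // (The two accumulators below hold the six unique entries of the
-    // symmetric 3x3 covariance matrix about the center of mass:
-    //   XX_YY_ZZ = sum of ( p.x*p.x, p.y*p.y, p.z*p.z )   -- diagonal
-    //   XY_XZ_YZ = sum of ( p.x*p.y, p.x*p.z, p.y*p.z )   -- off-diagonal
-    // with p = point - CenterOfMass; the eigenvectors of that matrix become
-    // the candidate box axes.)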
-    for( size_t i = 0; i < Count; ++i )
-    {
-        XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) );
-
-        CenterOfMass += Point;
-    }
-
-    CenterOfMass *= XMVectorReciprocal( XMVectorReplicate( float( Count ) ) );
-
-    // Compute the inertia tensor of the points around the center of mass.
-    // Using the center of mass is not strictly necessary, but will hopefully
-    // improve the stability of finding the eigenvectors.
-    XMVECTOR XX_YY_ZZ = XMVectorZero();
-    XMVECTOR XY_XZ_YZ = XMVectorZero();
-
-    for( size_t i = 0; i < Count; ++i )
-    {
-        XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) ) - CenterOfMass;
-
-        XX_YY_ZZ += Point * Point;
-
-        XMVECTOR XXY = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_W>( Point );
-        XMVECTOR YZZ = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_W>( Point );
-
-        XY_XZ_YZ += XXY * YZZ;
-    }
-
-    XMVECTOR v1, v2, v3;
-
-    // Compute the eigenvectors of the inertia tensor.
-    DirectX::Internal::CalculateEigenVectorsFromCovarianceMatrix( XMVectorGetX( XX_YY_ZZ ), XMVectorGetY( XX_YY_ZZ ),
-                                                                  XMVectorGetZ( XX_YY_ZZ ),
-                                                                  XMVectorGetX( XY_XZ_YZ ), XMVectorGetY( XY_XZ_YZ ),
-                                                                  XMVectorGetZ( XY_XZ_YZ ),
-                                                                  &v1, &v2, &v3 );
-
-    // Put them in a matrix.
-    XMMATRIX R;
-
-    R.r[0] = XMVectorSetW( v1, 0.f );
-    R.r[1] = XMVectorSetW( v2, 0.f );
-    R.r[2] = XMVectorSetW( v3, 0.f );
-    R.r[3] = g_XMIdentityR3.v;
-
-    // Multiply by -1 to convert the matrix into a right handed coordinate
-    // system (Det ~= 1) in case the eigenvectors form a left handed
-    // coordinate system (Det ~= -1) because XMQuaternionRotationMatrix only
-    // works on right handed matrices.
-    XMVECTOR Det = XMMatrixDeterminant( R );
-
-    if( XMVector4Less( Det, XMVectorZero() ) )
-    {
-        R.r[0] *= g_XMNegativeOne.v;
-        R.r[1] *= g_XMNegativeOne.v;
-        R.r[2] *= g_XMNegativeOne.v;
-    }
-
-    // Get the rotation quaternion from the matrix.
-    XMVECTOR vOrientation = XMQuaternionRotationMatrix( R );
-
-    // Make sure it is normal (in case the vectors are slightly non-orthogonal).
-    vOrientation = XMQuaternionNormalize( vOrientation );
-
-    // Rebuild the rotation matrix from the quaternion.
-    R = XMMatrixRotationQuaternion( vOrientation );
-
-    // Build the rotation into the rotated space.
-    XMMATRIX InverseR = XMMatrixTranspose( R );
-
-    // Find the minimum OBB using the eigenvectors as the axes.
-    XMVECTOR vMin, vMax;
-
-    vMin = vMax = XMVector3TransformNormal( XMLoadFloat3( pPoints ), InverseR );
-
-    for( size_t i = 1; i < Count; ++i )
-    {
-        XMVECTOR Point = XMVector3TransformNormal( XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) ),
-                                                   InverseR );
-
-        vMin = XMVectorMin( vMin, Point );
-        vMax = XMVectorMax( vMax, Point );
-    }
-
-    // Rotate the center into world space.
-    XMVECTOR vCenter = ( vMin + vMax ) * 0.5f;
-    vCenter = XMVector3TransformNormal( vCenter, R );
-
-    // Store center, extents, and orientation.
-    XMStoreFloat3( &Out.Center, vCenter );
-    XMStoreFloat3( &Out.Extents, ( vMax - vMin ) * 0.5f );
-    XMStoreFloat4( &Out.Orientation, vOrientation );
-}
-
-
-/****************************************************************************
- *
- * BoundingFrustum
- *
- ****************************************************************************/
-
-//-----------------------------------------------------------------------------
-// Transform a frustum by an angle preserving transform.
-//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline void XM_CALLCONV BoundingFrustum::Transform( BoundingFrustum& Out, FXMMATRIX M ) const -{ - // Load the frustum. - XMVECTOR vOrigin = XMLoadFloat3( &Origin ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); - - // Composite the frustum rotation and the transform rotation - XMMATRIX nM; - nM.r[0] = XMVector3Normalize( M.r[0] ); - nM.r[1] = XMVector3Normalize( M.r[1] ); - nM.r[2] = XMVector3Normalize( M.r[2] ); - nM.r[3] = g_XMIdentityR3; - XMVECTOR Rotation = XMQuaternionRotationMatrix( nM ); - vOrientation = XMQuaternionMultiply( vOrientation, Rotation ); - - // Transform the center. - vOrigin = XMVector3Transform( vOrigin, M ); - - // Store the frustum. - XMStoreFloat3( &Out.Origin, vOrigin ); - XMStoreFloat4( &Out.Orientation, vOrientation ); - - // Scale the near and far distances (the slopes remain the same). - XMVECTOR dX = XMVector3Dot( M.r[0], M.r[0] ); - XMVECTOR dY = XMVector3Dot( M.r[1], M.r[1] ); - XMVECTOR dZ = XMVector3Dot( M.r[2], M.r[2] ); - - XMVECTOR d = XMVectorMax( dX, XMVectorMax( dY, dZ ) ); - float Scale = sqrtf( XMVectorGetX(d) ); - - Out.Near = Near * Scale; - Out.Far = Far * Scale; - - // Copy the slopes. - Out.RightSlope = RightSlope; - Out.LeftSlope = LeftSlope; - Out.TopSlope = TopSlope; - Out.BottomSlope = BottomSlope; -} - -_Use_decl_annotations_ -inline void XM_CALLCONV BoundingFrustum::Transform( BoundingFrustum& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const -{ - assert( DirectX::Internal::XMQuaternionIsUnit( Rotation ) ); - - // Load the frustum. - XMVECTOR vOrigin = XMLoadFloat3( &Origin ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); - - // Composite the frustum rotation and the transform rotation. - vOrientation = XMQuaternionMultiply( vOrientation, Rotation ); - - // Transform the origin. - vOrigin = XMVector3Rotate( vOrigin * XMVectorReplicate( Scale ), Rotation ) + Translation; - - // Store the frustum. - XMStoreFloat3( &Out.Origin, vOrigin ); - XMStoreFloat4( &Out.Orientation, vOrientation ); - - // Scale the near and far distances (the slopes remain the same). - Out.Near = Near * Scale; - Out.Far = Far * Scale; - - // Copy the slopes. - Out.RightSlope = RightSlope; - Out.LeftSlope = LeftSlope; - Out.TopSlope = TopSlope; - Out.BottomSlope = BottomSlope; -} - - -//----------------------------------------------------------------------------- -// Get the corner points of the frustum -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline void BoundingFrustum::GetCorners( XMFLOAT3* Corners ) const -{ - assert( Corners != 0 ); - - // Load origin and orientation of the frustum. - XMVECTOR vOrigin = XMLoadFloat3( &Origin ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); - - // Build the corners of the frustum. 
-    XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR vNear = XMVectorReplicatePtr( &Near );
-    XMVECTOR vFar = XMVectorReplicatePtr( &Far );
-
-    // Returns the 8 corner positions of the bounding frustum.
-    //     Near    Far
-    //    0----1  4----5
-    //    |    |  |    |
-    //    |    |  |    |
-    //    3----2  7----6
-
-    XMVECTOR vCorners[CORNER_COUNT];
-    vCorners[0] = vLeftTop * vNear;
-    vCorners[1] = vRightTop * vNear;
-    vCorners[2] = vRightBottom * vNear;
-    vCorners[3] = vLeftBottom * vNear;
-    vCorners[4] = vLeftTop * vFar;
-    vCorners[5] = vRightTop * vFar;
-    vCorners[6] = vRightBottom * vFar;
-    vCorners[7] = vLeftBottom * vFar;
-
-    for( size_t i = 0; i < CORNER_COUNT; ++i )
-    {
-        XMVECTOR C = XMVector3Rotate( vCorners[i], vOrientation ) + vOrigin;
-        XMStoreFloat3( &Corners[i], C );
-    }
-}
-
-
-//-----------------------------------------------------------------------------
-// Point in frustum test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline ContainmentType XM_CALLCONV BoundingFrustum::Contains( FXMVECTOR Point ) const
-{
-    // Build frustum planes.
-    XMVECTOR Planes[6];
-    Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
-    Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
-    Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
-    Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
-    Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
-    Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
-
-    // Load origin and orientation.
-    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
-    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
-
-    // Transform point into local space of frustum.
-    XMVECTOR TPoint = XMVector3InverseRotate( Point - vOrigin, vOrientation );
-
-    // Set w to one.
-    TPoint = XMVectorInsert<0, 0, 0, 0, 1>( TPoint, XMVectorSplatOne() );
-
-    XMVECTOR Zero = XMVectorZero();
-    XMVECTOR Outside = Zero;
-
-    // Test point against each plane of the frustum.
-    for( size_t i = 0; i < 6; ++i )
-    {
-        XMVECTOR Dot = XMVector4Dot( TPoint, Planes[i] );
-        Outside = XMVectorOrInt( Outside, XMVectorGreater( Dot, Zero ) );
-    }
-
-    return XMVector4NotEqualInt( Outside, XMVectorTrueInt() ) ? CONTAINS : DISJOINT;
-}
-
-
-//-----------------------------------------------------------------------------
-// Triangle vs frustum test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline ContainmentType XM_CALLCONV BoundingFrustum::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const
-{
-    // Load origin and orientation of the frustum.
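The plane encoding used by the point test above collapses, in frustum-local space, to simple slope comparisons, since all four side planes pass through the origin: for example, the right plane (1, 0, -RightSlope, 0) rejects a point exactly when x > z * RightSlope. A scalar restatement (standalone sketch; names are illustrative):

    // Point-vs-frustum in frustum-local space. ls and bs are negative for a
    // frustum that opens leftward/downward, matching the LeftSlope and
    // BottomSlope fields used above.
    inline bool InsideLocalFrustum( float x, float y, float z,
                                    float nearZ, float farZ,
                                    float rs, float ls, float ts, float bs )
    {
        return z >= nearZ && z <= farZ &&    // between near and far planes
               x <= z * rs && x >= z * ls && // within the right/left planes
               y <= z * ts && y >= z * bs;   // within the top/bottom planes
    }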
- XMVECTOR vOrigin = XMLoadFloat3( &Origin ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - // Create 6 planes (do it inline to encourage use of registers) - XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); - NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); - NearPlane = XMPlaneNormalize( NearPlane ); - - XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); - FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); - FarPlane = XMPlaneNormalize( FarPlane ); - - XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); - RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); - RightPlane = XMPlaneNormalize( RightPlane ); - - XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); - LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); - LeftPlane = XMPlaneNormalize( LeftPlane ); - - XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); - TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); - TopPlane = XMPlaneNormalize( TopPlane ); - - XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); - BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); - BottomPlane = XMPlaneNormalize( BottomPlane ); - - return TriangleTests::ContainedBy( V0, V1, V2, NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); -} - - -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingFrustum::Contains( const BoundingSphere& sh ) const -{ - // Load origin and orientation of the frustum. - XMVECTOR vOrigin = XMLoadFloat3( &Origin ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - // Create 6 planes (do it inline to encourage use of registers) - XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); - NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); - NearPlane = XMPlaneNormalize( NearPlane ); - - XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); - FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); - FarPlane = XMPlaneNormalize( FarPlane ); - - XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); - RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); - RightPlane = XMPlaneNormalize( RightPlane ); - - XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); - LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); - LeftPlane = XMPlaneNormalize( LeftPlane ); - - XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); - TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); - TopPlane = XMPlaneNormalize( TopPlane ); - - XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); - BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); - BottomPlane = XMPlaneNormalize( BottomPlane ); - - return sh.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); -} - - -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingFrustum::Contains( const BoundingBox& box ) const -{ - // Load origin and orientation of the frustum. 
- XMVECTOR vOrigin = XMLoadFloat3( &Origin ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - // Create 6 planes (do it inline to encourage use of registers) - XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); - NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); - NearPlane = XMPlaneNormalize( NearPlane ); - - XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); - FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); - FarPlane = XMPlaneNormalize( FarPlane ); - - XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); - RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); - RightPlane = XMPlaneNormalize( RightPlane ); - - XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); - LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); - LeftPlane = XMPlaneNormalize( LeftPlane ); - - XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); - TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); - TopPlane = XMPlaneNormalize( TopPlane ); - - XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); - BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); - BottomPlane = XMPlaneNormalize( BottomPlane ); - - return box.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); -} - - -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingFrustum::Contains( const BoundingOrientedBox& box ) const -{ - // Load origin and orientation of the frustum. - XMVECTOR vOrigin = XMLoadFloat3( &Origin ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - // Create 6 planes (do it inline to encourage use of registers) - XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); - NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); - NearPlane = XMPlaneNormalize( NearPlane ); - - XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); - FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); - FarPlane = XMPlaneNormalize( FarPlane ); - - XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); - RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); - RightPlane = XMPlaneNormalize( RightPlane ); - - XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); - LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); - LeftPlane = XMPlaneNormalize( LeftPlane ); - - XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); - TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); - TopPlane = XMPlaneNormalize( TopPlane ); - - XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); - BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); - BottomPlane = XMPlaneNormalize( BottomPlane ); - - return box.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); -} - - -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType BoundingFrustum::Contains( const BoundingFrustum& fr ) const -{ - // Load origin and orientation of the frustum. 
-    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
-    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
-
-    // Create 6 planes (do it inline to encourage use of registers)
-    XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
-    NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin );
-    NearPlane = XMPlaneNormalize( NearPlane );
-
-    XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
-    FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin );
-    FarPlane = XMPlaneNormalize( FarPlane );
-
-    XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
-    RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin );
-    RightPlane = XMPlaneNormalize( RightPlane );
-
-    XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
-    LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin );
-    LeftPlane = XMPlaneNormalize( LeftPlane );
-
-    XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
-    TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin );
-    TopPlane = XMPlaneNormalize( TopPlane );
-
-    XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
-    BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin );
-    BottomPlane = XMPlaneNormalize( BottomPlane );
-
-    return fr.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane );
-}
-
-
-//-----------------------------------------------------------------------------
-// Exact sphere vs frustum test. The algorithm first checks the sphere against
-// the planes of the frustum; if the plane checks are indeterminate, it finds
-// the nearest feature (plane, line, or point) on the frustum to the center of
-// the sphere and compares that distance to the radius of the sphere.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool BoundingFrustum::Intersects( const BoundingSphere& sh ) const
-{
-    XMVECTOR Zero = XMVectorZero();
-
-    // Build the frustum planes.
-    XMVECTOR Planes[6];
-    Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
-    Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
-    Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
-    Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
-    Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
-    Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
-
-    // Normalize the planes so we can compare to the sphere radius.
-    Planes[2] = XMVector3Normalize( Planes[2] );
-    Planes[3] = XMVector3Normalize( Planes[3] );
-    Planes[4] = XMVector3Normalize( Planes[4] );
-    Planes[5] = XMVector3Normalize( Planes[5] );
-
-    // Load origin and orientation of the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
-    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
-
-    // Load the sphere.
-    XMVECTOR vCenter = XMLoadFloat3( &sh.Center );
-    XMVECTOR vRadius = XMVectorReplicatePtr( &sh.Radius );
-
-    // Transform the center of the sphere into the local space of frustum.
-    vCenter = XMVector3InverseRotate( vCenter - vOrigin, vOrientation );
-
-    // Set w of the center to one so we can dot4 with the plane.
-    vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
-
-    // Check against each plane of the frustum.
- XMVECTOR Outside = XMVectorFalseInt(); - XMVECTOR InsideAll = XMVectorTrueInt(); - XMVECTOR CenterInsideAll = XMVectorTrueInt(); - - XMVECTOR Dist[6]; - - for( size_t i = 0; i < 6; ++i ) - { - Dist[i] = XMVector4Dot( vCenter, Planes[i] ); - - // Outside the plane? - Outside = XMVectorOrInt( Outside, XMVectorGreater( Dist[i], vRadius ) ); - - // Fully inside the plane? - InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( Dist[i], -vRadius ) ); - - // Check if the center is inside the plane. - CenterInsideAll = XMVectorAndInt( CenterInsideAll, XMVectorLessOrEqual( Dist[i], Zero ) ); - } - - // If the sphere is outside any of the planes it is outside. - if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) - return false; - - // If the sphere is inside all planes it is fully inside. - if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) ) - return true; - - // If the center of the sphere is inside all planes and the sphere intersects - // one or more planes then it must intersect. - if ( XMVector4EqualInt( CenterInsideAll, XMVectorTrueInt() ) ) - return true; - - // The sphere may be outside the frustum or intersecting the frustum. - // Find the nearest feature (face, edge, or corner) on the frustum - // to the sphere. - - // The faces adjacent to each face are: - static const size_t adjacent_faces[6][4] = - { - { 2, 3, 4, 5 }, // 0 - { 2, 3, 4, 5 }, // 1 - { 0, 1, 4, 5 }, // 2 - { 0, 1, 4, 5 }, // 3 - { 0, 1, 2, 3 }, // 4 - { 0, 1, 2, 3 } - }; // 5 - - XMVECTOR Intersects = XMVectorFalseInt(); - - // Check to see if the nearest feature is one of the planes. - for( size_t i = 0; i < 6; ++i ) - { - // Find the nearest point on the plane to the center of the sphere. - XMVECTOR Point = vCenter - (Planes[i] * Dist[i]); - - // Set w of the point to one. - Point = XMVectorInsert<0, 0, 0, 0, 1>( Point, XMVectorSplatOne() ); - - // If the point is inside the face (inside the adjacent planes) then - // this plane is the nearest feature. - XMVECTOR InsideFace = XMVectorTrueInt(); - - for ( size_t j = 0; j < 4; j++ ) - { - size_t plane_index = adjacent_faces[i][j]; - - InsideFace = XMVectorAndInt( InsideFace, - XMVectorLessOrEqual( XMVector4Dot( Point, Planes[plane_index] ), Zero ) ); - } - - // Since we have already checked distance from the plane we know that the - // sphere must intersect if this plane is the nearest feature. - Intersects = XMVectorOrInt( Intersects, - XMVectorAndInt( XMVectorGreater( Dist[i], Zero ), InsideFace ) ); - } - - if ( XMVector4EqualInt( Intersects, XMVectorTrueInt() ) ) - return true; - - // Build the corners of the frustum. 
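The edge loop further below leans on DirectX::Internal::PointOnLineSegmentNearestPoint. The standard clamp-the-projection computation it corresponds to can be sketched as follows (an assumption-level sketch, not necessarily the library's exact implementation):

    #include <DirectXMath.h>
    using namespace DirectX;

    // Closest point on segment [s1, s2] to point p: project p onto the
    // segment direction and clamp the parameter to [0, 1].
    inline XMVECTOR XM_CALLCONV ClosestPointOnSegment( FXMVECTOR s1, FXMVECTOR s2, FXMVECTOR p )
    {
        XMVECTOR dir = XMVectorSubtract( s2, s1 );
        XMVECTOR t = XMVectorDivide( XMVector3Dot( XMVectorSubtract( p, s1 ), dir ),
                                     XMVector3Dot( dir, dir ) );
        t = XMVectorClamp( t, XMVectorZero(), XMVectorSplatOne() );
        return XMVectorMultiplyAdd( t, dir, s1 );
    }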
-    XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR vNear = XMVectorReplicatePtr( &Near );
-    XMVECTOR vFar = XMVectorReplicatePtr( &Far );
-
-    XMVECTOR Corners[CORNER_COUNT];
-    Corners[0] = vRightTop * vNear;
-    Corners[1] = vRightBottom * vNear;
-    Corners[2] = vLeftTop * vNear;
-    Corners[3] = vLeftBottom * vNear;
-    Corners[4] = vRightTop * vFar;
-    Corners[5] = vRightBottom * vFar;
-    Corners[6] = vLeftTop * vFar;
-    Corners[7] = vLeftBottom * vFar;
-
-    // The Edges are:
-    static const size_t edges[12][2] =
-    {
-        { 0, 1 }, { 2, 3 }, { 0, 2 }, { 1, 3 },    // Near plane
-        { 4, 5 }, { 6, 7 }, { 4, 6 }, { 5, 7 },    // Far plane
-        { 0, 4 }, { 1, 5 }, { 2, 6 }, { 3, 7 },
-    };    // Near to far
-
-    XMVECTOR RadiusSq = vRadius * vRadius;
-
-    // Check to see if the nearest feature is one of the edges (or corners).
-    for( size_t i = 0; i < 12; ++i )
-    {
-        size_t ei0 = edges[i][0];
-        size_t ei1 = edges[i][1];
-
-        // Find the nearest point on the edge to the center of the sphere.
-        // The corners of the frustum are included as the endpoints of the edges.
-        XMVECTOR Point = DirectX::Internal::PointOnLineSegmentNearestPoint( Corners[ei0], Corners[ei1], vCenter );
-
-        XMVECTOR Delta = vCenter - Point;
-
-        XMVECTOR DistSq = XMVector3Dot( Delta, Delta );
-
-        // If the distance from the center of the sphere to the point is less
-        // than the radius of the sphere then they must intersect.
-        Intersects = XMVectorOrInt( Intersects, XMVectorLessOrEqual( DistSq, RadiusSq ) );
-    }
-
-    if ( XMVector4EqualInt( Intersects, XMVectorTrueInt() ) )
-        return true;
-
-    // The sphere must be outside the frustum.
-    return false;
-}
-
-
-//-----------------------------------------------------------------------------
-// Exact axis aligned box vs frustum test. Constructs an oriented box and uses
-// the oriented box vs frustum test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool BoundingFrustum::Intersects( const BoundingBox& box ) const
-{
-    // Make the axis aligned box oriented and do an OBB vs frustum test.
-    BoundingOrientedBox obox( box.Center, box.Extents, XMFLOAT4( 0.f, 0.f, 0.f, 1.f ) );
-    return Intersects( obox );
-}
-
-
-//-----------------------------------------------------------------------------
-// Exact oriented box vs frustum test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool BoundingFrustum::Intersects( const BoundingOrientedBox& box ) const
-{
-    static const XMVECTORU32 SelectY =
-    {
-        XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0
-    };
-    static const XMVECTORU32 SelectZ =
-    {
-        XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0
-    };
-
-    XMVECTOR Zero = XMVectorZero();
-
-    // Build the frustum planes.
-    XMVECTOR Planes[6];
-    Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
-    Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
-    Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
-    Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
-    Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
-    Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
-
-    // Load origin and orientation of the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
-    XMVECTOR FrustumOrientation = XMLoadFloat4( &Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( FrustumOrientation ) );
-
-    // Load the box.
-    XMVECTOR Center = XMLoadFloat3( &box.Center );
-    XMVECTOR Extents = XMLoadFloat3( &box.Extents );
-    XMVECTOR BoxOrientation = XMLoadFloat4( &box.Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) );
-
-    // Transform the oriented box into the space of the frustum in order to
-    // minimize the number of transforms we have to do.
-    Center = XMVector3InverseRotate( Center - vOrigin, FrustumOrientation );
-    BoxOrientation = XMQuaternionMultiply( BoxOrientation, XMQuaternionConjugate( FrustumOrientation ) );
-
-    // Set w of the center to one so we can dot4 with the plane.
-    Center = XMVectorInsert<0, 0, 0, 0, 1>( Center, XMVectorSplatOne() );
-
-    // Build the 3x3 rotation matrix that defines the box axes.
-    XMMATRIX R = XMMatrixRotationQuaternion( BoxOrientation );
-
-    // Check against each plane of the frustum.
-    XMVECTOR Outside = XMVectorFalseInt();
-    XMVECTOR InsideAll = XMVectorTrueInt();
-    XMVECTOR CenterInsideAll = XMVectorTrueInt();
-
-    for( size_t i = 0; i < 6; ++i )
-    {
-        // Compute the distance to the center of the box.
-        XMVECTOR Dist = XMVector4Dot( Center, Planes[i] );
-
-        // Project the axes of the box onto the normal of the plane. Half the
-        // length of the projection (sometimes called the "radius") is equal to
-        // h(u) * abs(n dot b(u)) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w))
-        // where h(i) are the extents of the box, n is the plane normal, and
-        // b(i) are the axes of the box.
-        XMVECTOR Radius = XMVector3Dot( Planes[i], R.r[0] );
-        Radius = XMVectorSelect( Radius, XMVector3Dot( Planes[i], R.r[1] ), SelectY );
-        Radius = XMVectorSelect( Radius, XMVector3Dot( Planes[i], R.r[2] ), SelectZ );
-        Radius = XMVector3Dot( Extents, XMVectorAbs( Radius ) );
-
-        // Outside the plane?
-        Outside = XMVectorOrInt( Outside, XMVectorGreater( Dist, Radius ) );
-
-        // Fully inside the plane?
-        InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( Dist, -Radius ) );
-
-        // Check if the center is inside the plane.
-        CenterInsideAll = XMVectorAndInt( CenterInsideAll, XMVectorLessOrEqual( Dist, Zero ) );
-    }
-
-    // If the box is outside any of the planes it is outside.
-    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
-        return false;
-
-    // If the box is inside all planes it is fully inside.
-    if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) )
-        return true;
-
-    // If the center of the box is inside all planes and the box intersects
-    // one or more planes then it must intersect.
-    if ( XMVector4EqualInt( CenterInsideAll, XMVectorTrueInt() ) )
-        return true;
-
-    // Build the corners of the frustum.
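The projected "radius" described in the plane loop above can be restated as a standalone scalar helper (sketch only; names are illustrative, and the same quantity reappears in the edge-axis tests below):

    #include <cmath>
    #include <DirectXMath.h>
    using namespace DirectX;

    // Half-length of an oriented box's projection onto direction n, where
    // ext holds the half-extents and axis[0..2] are the box axes; this is
    // h(u)*abs(n dot b(u)) + h(v)*abs(n dot b(v)) + h(w)*abs(n dot b(w)).
    inline float ProjectedRadius( const XMFLOAT3& ext, const XMVECTOR axis[3], FXMVECTOR n )
    {
        return ext.x * std::fabs( XMVectorGetX( XMVector3Dot( n, axis[0] ) ) )
             + ext.y * std::fabs( XMVectorGetX( XMVector3Dot( n, axis[1] ) ) )
             + ext.z * std::fabs( XMVectorGetX( XMVector3Dot( n, axis[2] ) ) );
    }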
- XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f ); - XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f ); - XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f ); - XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f ); - XMVECTOR vNear = XMVectorReplicatePtr( &Near ); - XMVECTOR vFar = XMVectorReplicatePtr( &Far ); - - XMVECTOR Corners[CORNER_COUNT]; - Corners[0] = vRightTop * vNear; - Corners[1] = vRightBottom * vNear; - Corners[2] = vLeftTop * vNear; - Corners[3] = vLeftBottom * vNear; - Corners[4] = vRightTop * vFar; - Corners[5] = vRightBottom * vFar; - Corners[6] = vLeftTop * vFar; - Corners[7] = vLeftBottom * vFar; - - // Test against box axes (3) - { - // Find the min/max values of the projection of the frustum onto each axis. - XMVECTOR FrustumMin, FrustumMax; - - FrustumMin = XMVector3Dot( Corners[0], R.r[0] ); - FrustumMin = XMVectorSelect( FrustumMin, XMVector3Dot( Corners[0], R.r[1] ), SelectY ); - FrustumMin = XMVectorSelect( FrustumMin, XMVector3Dot( Corners[0], R.r[2] ), SelectZ ); - FrustumMax = FrustumMin; - - for( size_t i = 1; i < BoundingOrientedBox::CORNER_COUNT; ++i ) - { - XMVECTOR Temp = XMVector3Dot( Corners[i], R.r[0] ); - Temp = XMVectorSelect( Temp, XMVector3Dot( Corners[i], R.r[1] ), SelectY ); - Temp = XMVectorSelect( Temp, XMVector3Dot( Corners[i], R.r[2] ), SelectZ ); - - FrustumMin = XMVectorMin( FrustumMin, Temp ); - FrustumMax = XMVectorMax( FrustumMax, Temp ); - } - - // Project the center of the box onto the axes. - XMVECTOR BoxDist = XMVector3Dot( Center, R.r[0] ); - BoxDist = XMVectorSelect( BoxDist, XMVector3Dot( Center, R.r[1] ), SelectY ); - BoxDist = XMVectorSelect( BoxDist, XMVector3Dot( Center, R.r[2] ), SelectZ ); - - // The projection of the box onto the axis is just its Center and Extents. - // if (min > box_max || max < box_min) reject; - XMVECTOR Result = XMVectorOrInt( XMVectorGreater( FrustumMin, BoxDist + Extents ), - XMVectorLess( FrustumMax, BoxDist - Extents ) ); - - if( DirectX::Internal::XMVector3AnyTrue( Result ) ) - return false; - } - - // Test against edge/edge axes (3*6). - XMVECTOR FrustumEdgeAxis[6]; - - FrustumEdgeAxis[0] = vRightTop; - FrustumEdgeAxis[1] = vRightBottom; - FrustumEdgeAxis[2] = vLeftTop; - FrustumEdgeAxis[3] = vLeftBottom; - FrustumEdgeAxis[4] = vRightTop - vLeftTop; - FrustumEdgeAxis[5] = vLeftBottom - vLeftTop; - - for( size_t i = 0; i < 3; ++i ) - { - for( size_t j = 0; j < 6; j++ ) - { - // Compute the axis we are going to test. - XMVECTOR Axis = XMVector3Cross( R.r[i], FrustumEdgeAxis[j] ); - - // Find the min/max values of the projection of the frustum onto the axis. - XMVECTOR FrustumMin, FrustumMax; - - FrustumMin = FrustumMax = XMVector3Dot( Axis, Corners[0] ); - - for( size_t k = 1; k < CORNER_COUNT; k++ ) - { - XMVECTOR Temp = XMVector3Dot( Axis, Corners[k] ); - FrustumMin = XMVectorMin( FrustumMin, Temp ); - FrustumMax = XMVectorMax( FrustumMax, Temp ); - } - - // Project the center of the box onto the axis. - XMVECTOR Dist = XMVector3Dot( Center, Axis ); - - // Project the axes of the box onto the axis to find the "radius" of the box. 
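-            // The "radius" is the half-length of the box's projection onto the axis:
-            //   r = h(u) * abs(Axis dot b(u)) + h(v) * abs(Axis dot b(v)) + h(w) * abs(Axis dot b(w)).
-            // The three dot products are packed into the x, y and z lanes of one
-            // vector (via SelectY/SelectZ) so a single dot with Extents sums them.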
- XMVECTOR Radius = XMVector3Dot( Axis, R.r[0] ); - Radius = XMVectorSelect( Radius, XMVector3Dot( Axis, R.r[1] ), SelectY ); - Radius = XMVectorSelect( Radius, XMVector3Dot( Axis, R.r[2] ), SelectZ ); - Radius = XMVector3Dot( Extents, XMVectorAbs( Radius ) ); - - // if (center > max + radius || center < min - radius) reject; - Outside = XMVectorOrInt( Outside, XMVectorGreater( Dist, FrustumMax + Radius ) ); - Outside = XMVectorOrInt( Outside, XMVectorLess( Dist, FrustumMin - Radius ) ); - } - } - - if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) - return false; - - // If we did not find a separating plane then the box must intersect the frustum. - return true; -} - - -//----------------------------------------------------------------------------- -// Exact frustum vs frustum test. -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline bool BoundingFrustum::Intersects( const BoundingFrustum& fr ) const -{ - // Load origin and orientation of frustum B. - XMVECTOR OriginB = XMLoadFloat3( &Origin ); - XMVECTOR OrientationB = XMLoadFloat4( &Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( OrientationB ) ); - - // Build the planes of frustum B. - XMVECTOR AxisB[6]; - AxisB[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, 0.0f ); - AxisB[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, 0.0f ); - AxisB[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); - AxisB[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); - AxisB[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); - AxisB[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); - - XMVECTOR PlaneDistB[6]; - PlaneDistB[0] = -XMVectorReplicatePtr( &Near ); - PlaneDistB[1] = XMVectorReplicatePtr( &Far ); - PlaneDistB[2] = XMVectorZero(); - PlaneDistB[3] = XMVectorZero(); - PlaneDistB[4] = XMVectorZero(); - PlaneDistB[5] = XMVectorZero(); - - // Load origin and orientation of frustum A. - XMVECTOR OriginA = XMLoadFloat3( &fr.Origin ); - XMVECTOR OrientationA = XMLoadFloat4( &fr.Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( OrientationA ) ); - - // Transform frustum A into the space of the frustum B in order to - // minimize the number of transforms we have to do. - OriginA = XMVector3InverseRotate( OriginA - OriginB, OrientationB ); - OrientationA = XMQuaternionMultiply( OrientationA, XMQuaternionConjugate( OrientationB ) ); - - // Build the corners of frustum A (in the local space of B). 
-    XMVECTOR RightTopA = XMVectorSet( fr.RightSlope, fr.TopSlope, 1.0f, 0.0f );
-    XMVECTOR RightBottomA = XMVectorSet( fr.RightSlope, fr.BottomSlope, 1.0f, 0.0f );
-    XMVECTOR LeftTopA = XMVectorSet( fr.LeftSlope, fr.TopSlope, 1.0f, 0.0f );
-    XMVECTOR LeftBottomA = XMVectorSet( fr.LeftSlope, fr.BottomSlope, 1.0f, 0.0f );
-    XMVECTOR NearA = XMVectorReplicatePtr( &fr.Near );
-    XMVECTOR FarA = XMVectorReplicatePtr( &fr.Far );
-
-    RightTopA = XMVector3Rotate( RightTopA, OrientationA );
-    RightBottomA = XMVector3Rotate( RightBottomA, OrientationA );
-    LeftTopA = XMVector3Rotate( LeftTopA, OrientationA );
-    LeftBottomA = XMVector3Rotate( LeftBottomA, OrientationA );
-
-    XMVECTOR CornersA[CORNER_COUNT];
-    CornersA[0] = OriginA + RightTopA * NearA;
-    CornersA[1] = OriginA + RightBottomA * NearA;
-    CornersA[2] = OriginA + LeftTopA * NearA;
-    CornersA[3] = OriginA + LeftBottomA * NearA;
-    CornersA[4] = OriginA + RightTopA * FarA;
-    CornersA[5] = OriginA + RightBottomA * FarA;
-    CornersA[6] = OriginA + LeftTopA * FarA;
-    CornersA[7] = OriginA + LeftBottomA * FarA;
-
-    // Check frustum A against each plane of frustum B.
-    XMVECTOR Outside = XMVectorFalseInt();
-    XMVECTOR InsideAll = XMVectorTrueInt();
-
-    for( size_t i = 0; i < 6; ++i )
-    {
-        // Find the min/max projection of the frustum onto the plane normal.
-        XMVECTOR Min, Max;
-
-        Min = Max = XMVector3Dot( AxisB[i], CornersA[0] );
-
-        for( size_t j = 1; j < CORNER_COUNT; j++ )
-        {
-            XMVECTOR Temp = XMVector3Dot( AxisB[i], CornersA[j] );
-            Min = XMVectorMin( Min, Temp );
-            Max = XMVectorMax( Max, Temp );
-        }
-
-        // Outside the plane?
-        Outside = XMVectorOrInt( Outside, XMVectorGreater( Min, PlaneDistB[i] ) );
-
-        // Fully inside the plane?
-        InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( Max, PlaneDistB[i] ) );
-    }
-
-    // If frustum A is outside any of the planes of frustum B it is outside.
-    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
-        return false;
-
-    // If frustum A is inside all planes of frustum B it is fully inside.
-    if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) )
-        return true;
-
-    // Build the corners of frustum B.
-    XMVECTOR RightTopB = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR RightBottomB = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR LeftTopB = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR LeftBottomB = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR NearB = XMVectorReplicatePtr( &Near );
-    XMVECTOR FarB = XMVectorReplicatePtr( &Far );
-
-    XMVECTOR CornersB[BoundingFrustum::CORNER_COUNT];
-    CornersB[0] = RightTopB * NearB;
-    CornersB[1] = RightBottomB * NearB;
-    CornersB[2] = LeftTopB * NearB;
-    CornersB[3] = LeftBottomB * NearB;
-    CornersB[4] = RightTopB * FarB;
-    CornersB[5] = RightBottomB * FarB;
-    CornersB[6] = LeftTopB * FarB;
-    CornersB[7] = LeftBottomB * FarB;
-
-    // Build the planes of frustum A (in the local space of B).
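-    // Each plane is stored as a normal and a distance (n dot x = d), where
-    // d = n dot p for any point p on the plane. The near and far planes reuse
-    // corners already computed; the four side planes all pass through the apex,
-    // so OriginA supplies their points.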
-    XMVECTOR AxisA[6];
-    XMVECTOR PlaneDistA[6];
-
-    AxisA[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, 0.0f );
-    AxisA[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, 0.0f );
-    AxisA[2] = XMVectorSet( 1.0f, 0.0f, -fr.RightSlope, 0.0f );
-    AxisA[3] = XMVectorSet( -1.0f, 0.0f, fr.LeftSlope, 0.0f );
-    AxisA[4] = XMVectorSet( 0.0f, 1.0f, -fr.TopSlope, 0.0f );
-    AxisA[5] = XMVectorSet( 0.0f, -1.0f, fr.BottomSlope, 0.0f );
-
-    AxisA[0] = XMVector3Rotate( AxisA[0], OrientationA );
-    AxisA[1] = -AxisA[0];
-    AxisA[2] = XMVector3Rotate( AxisA[2], OrientationA );
-    AxisA[3] = XMVector3Rotate( AxisA[3], OrientationA );
-    AxisA[4] = XMVector3Rotate( AxisA[4], OrientationA );
-    AxisA[5] = XMVector3Rotate( AxisA[5], OrientationA );
-
-    PlaneDistA[0] = XMVector3Dot( AxisA[0], CornersA[0] ); // Re-use corner on near plane.
-    PlaneDistA[1] = XMVector3Dot( AxisA[1], CornersA[4] ); // Re-use corner on far plane.
-    PlaneDistA[2] = XMVector3Dot( AxisA[2], OriginA );
-    PlaneDistA[3] = XMVector3Dot( AxisA[3], OriginA );
-    PlaneDistA[4] = XMVector3Dot( AxisA[4], OriginA );
-    PlaneDistA[5] = XMVector3Dot( AxisA[5], OriginA );
-
-    // Check each axis of frustum A for a separating plane (5).
-    for( size_t i = 0; i < 6; ++i )
-    {
-        // Find the minimum projection of the frustum onto the plane normal.
-        XMVECTOR Min;
-
-        Min = XMVector3Dot( AxisA[i], CornersB[0] );
-
-        for( size_t j = 1; j < CORNER_COUNT; j++ )
-        {
-            XMVECTOR Temp = XMVector3Dot( AxisA[i], CornersB[j] );
-            Min = XMVectorMin( Min, Temp );
-        }
-
-        // Outside the plane?
-        Outside = XMVectorOrInt( Outside, XMVectorGreater( Min, PlaneDistA[i] ) );
-    }
-
-    // If frustum B is outside any of the planes of frustum A it is outside.
-    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
-        return false;
-
-    // Check edge/edge axes (6 * 6).
-    XMVECTOR FrustumEdgeAxisA[6];
-    FrustumEdgeAxisA[0] = RightTopA;
-    FrustumEdgeAxisA[1] = RightBottomA;
-    FrustumEdgeAxisA[2] = LeftTopA;
-    FrustumEdgeAxisA[3] = LeftBottomA;
-    FrustumEdgeAxisA[4] = RightTopA - LeftTopA;
-    FrustumEdgeAxisA[5] = LeftBottomA - LeftTopA;
-
-    XMVECTOR FrustumEdgeAxisB[6];
-    FrustumEdgeAxisB[0] = RightTopB;
-    FrustumEdgeAxisB[1] = RightBottomB;
-    FrustumEdgeAxisB[2] = LeftTopB;
-    FrustumEdgeAxisB[3] = LeftBottomB;
-    FrustumEdgeAxisB[4] = RightTopB - LeftTopB;
-    FrustumEdgeAxisB[5] = LeftBottomB - LeftTopB;
-
-    for( size_t i = 0; i < 6; ++i )
-    {
-        for( size_t j = 0; j < 6; j++ )
-        {
-            // Compute the axis we are going to test.
-            XMVECTOR Axis = XMVector3Cross( FrustumEdgeAxisA[i], FrustumEdgeAxisB[j] );
-
-            // Find the min/max values of the projection of both frustums onto the axis.
-            XMVECTOR MinA, MaxA;
-            XMVECTOR MinB, MaxB;
-
-            MinA = MaxA = XMVector3Dot( Axis, CornersA[0] );
-            MinB = MaxB = XMVector3Dot( Axis, CornersB[0] );
-
-            for( size_t k = 1; k < CORNER_COUNT; k++ )
-            {
-                XMVECTOR TempA = XMVector3Dot( Axis, CornersA[k] );
-                MinA = XMVectorMin( MinA, TempA );
-                MaxA = XMVectorMax( MaxA, TempA );
-
-                XMVECTOR TempB = XMVector3Dot( Axis, CornersB[k] );
-                MinB = XMVectorMin( MinB, TempB );
-                MaxB = XMVectorMax( MaxB, TempB );
-            }
-
-            // if (MinA > MaxB || MinB > MaxA) reject
-            Outside = XMVectorOrInt( Outside, XMVectorGreater( MinA, MaxB ) );
-            Outside = XMVectorOrInt( Outside, XMVectorGreater( MinB, MaxA ) );
-        }
-    }
-
-    // If there is a separating plane, then the frustums do not intersect.
-    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
-        return false;
-
-    // If we did not find a separating plane then the frustums intersect.
-    return true;
-}
-
-
-//-----------------------------------------------------------------------------
-// Triangle vs frustum test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool XM_CALLCONV BoundingFrustum::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const
-{
-    // Build the frustum planes (NOTE: D is negated from the usual).
-    XMVECTOR Planes[6];
-    Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, -Near );
-    Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, Far );
-    Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
-    Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
-    Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
-    Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
-
-    // Load origin and orientation of the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
-    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
-
-    // Transform the triangle into the local space of the frustum.
-    XMVECTOR TV0 = XMVector3InverseRotate( V0 - vOrigin, vOrientation );
-    XMVECTOR TV1 = XMVector3InverseRotate( V1 - vOrigin, vOrientation );
-    XMVECTOR TV2 = XMVector3InverseRotate( V2 - vOrigin, vOrientation );
-
-    // Test each vertex of the triangle against the frustum planes.
-    XMVECTOR Outside = XMVectorFalseInt();
-    XMVECTOR InsideAll = XMVectorTrueInt();
-
-    for( size_t i = 0; i < 6; ++i )
-    {
-        XMVECTOR Dist0 = XMVector3Dot( TV0, Planes[i] );
-        XMVECTOR Dist1 = XMVector3Dot( TV1, Planes[i] );
-        XMVECTOR Dist2 = XMVector3Dot( TV2, Planes[i] );
-
-        XMVECTOR MinDist = XMVectorMin( Dist0, Dist1 );
-        MinDist = XMVectorMin( MinDist, Dist2 );
-        XMVECTOR MaxDist = XMVectorMax( Dist0, Dist1 );
-        MaxDist = XMVectorMax( MaxDist, Dist2 );
-
-        XMVECTOR PlaneDist = XMVectorSplatW( Planes[i] );
-
-        // Outside the plane?
-        Outside = XMVectorOrInt( Outside, XMVectorGreater( MinDist, PlaneDist ) );
-
-        // Fully inside the plane?
-        InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( MaxDist, PlaneDist ) );
-    }
-
-    // If the triangle is outside any of the planes it is outside.
-    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
-        return false;
-
-    // If the triangle is inside all planes it is fully inside.
-    if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) )
-        return true;
-
-    // Build the corners of the frustum.
-    XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR vNear = XMVectorReplicatePtr( &Near );
-    XMVECTOR vFar = XMVectorReplicatePtr( &Far );
-
-    XMVECTOR Corners[CORNER_COUNT];
-    Corners[0] = vRightTop * vNear;
-    Corners[1] = vRightBottom * vNear;
-    Corners[2] = vLeftTop * vNear;
-    Corners[3] = vLeftBottom * vNear;
-    Corners[4] = vRightTop * vFar;
-    Corners[5] = vRightBottom * vFar;
-    Corners[6] = vLeftTop * vFar;
-    Corners[7] = vLeftBottom * vFar;
-
-    // Test the plane of the triangle.
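-    // Project all eight frustum corners onto the triangle normal. If the
-    // triangle's plane distance (Normal dot V0) falls outside the [min, max]
-    // range of those projections, the triangle's plane is a separating plane.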
- XMVECTOR Normal = XMVector3Cross( V1 - V0, V2 - V0 ); - XMVECTOR Dist = XMVector3Dot( Normal, V0 ); - - XMVECTOR MinDist, MaxDist; - MinDist = MaxDist = XMVector3Dot( Corners[0], Normal ); - for( size_t i = 1; i < CORNER_COUNT; ++i ) - { - XMVECTOR Temp = XMVector3Dot( Corners[i], Normal ); - MinDist = XMVectorMin( MinDist, Temp ); - MaxDist = XMVectorMax( MaxDist, Temp ); - } - - Outside = XMVectorOrInt( XMVectorGreater( MinDist, Dist ), XMVectorLess( MaxDist, Dist ) ); - if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) - return false; - - // Check the edge/edge axes (3*6). - XMVECTOR TriangleEdgeAxis[3]; - TriangleEdgeAxis[0] = V1 - V0; - TriangleEdgeAxis[1] = V2 - V1; - TriangleEdgeAxis[2] = V0 - V2; - - XMVECTOR FrustumEdgeAxis[6]; - FrustumEdgeAxis[0] = vRightTop; - FrustumEdgeAxis[1] = vRightBottom; - FrustumEdgeAxis[2] = vLeftTop; - FrustumEdgeAxis[3] = vLeftBottom; - FrustumEdgeAxis[4] = vRightTop - vLeftTop; - FrustumEdgeAxis[5] = vLeftBottom - vLeftTop; - - for( size_t i = 0; i < 3; ++i ) - { - for( size_t j = 0; j < 6; j++ ) - { - // Compute the axis we are going to test. - XMVECTOR Axis = XMVector3Cross( TriangleEdgeAxis[i], FrustumEdgeAxis[j] ); - - // Find the min/max of the projection of the triangle onto the axis. - XMVECTOR MinA, MaxA; - - XMVECTOR Dist0 = XMVector3Dot( V0, Axis ); - XMVECTOR Dist1 = XMVector3Dot( V1, Axis ); - XMVECTOR Dist2 = XMVector3Dot( V2, Axis ); - - MinA = XMVectorMin( Dist0, Dist1 ); - MinA = XMVectorMin( MinA, Dist2 ); - MaxA = XMVectorMax( Dist0, Dist1 ); - MaxA = XMVectorMax( MaxA, Dist2 ); - - // Find the min/max of the projection of the frustum onto the axis. - XMVECTOR MinB, MaxB; - - MinB = MaxB = XMVector3Dot( Axis, Corners[0] ); - - for( size_t k = 1; k < CORNER_COUNT; k++ ) - { - XMVECTOR Temp = XMVector3Dot( Axis, Corners[k] ); - MinB = XMVectorMin( MinB, Temp ); - MaxB = XMVectorMax( MaxB, Temp ); - } - - // if (MinA > MaxB || MinB > MaxA) reject; - Outside = XMVectorOrInt( Outside, XMVectorGreater( MinA, MaxB ) ); - Outside = XMVectorOrInt( Outside, XMVectorGreater( MinB, MaxA ) ); - } - } - - if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) - return false; - - // If we did not find a separating plane then the triangle must intersect the frustum. - return true; -} - - -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline PlaneIntersectionType XM_CALLCONV BoundingFrustum::Intersects( FXMVECTOR Plane ) const -{ - assert( DirectX::Internal::XMPlaneIsUnit( Plane ) ); - - // Load origin and orientation of the frustum. - XMVECTOR vOrigin = XMLoadFloat3( &Origin ); - XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); - - assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); - - // Set w of the origin to one so we can dot4 with a plane. - vOrigin = XMVectorInsert<0, 0, 0, 0, 1>( vOrigin, XMVectorSplatOne() ); - - // Build the corners of the frustum (in world space). 
-    XMVECTOR RightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR RightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR LeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR LeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR vNear = XMVectorReplicatePtr( &Near );
-    XMVECTOR vFar = XMVectorReplicatePtr( &Far );
-
-    RightTop = XMVector3Rotate( RightTop, vOrientation );
-    RightBottom = XMVector3Rotate( RightBottom, vOrientation );
-    LeftTop = XMVector3Rotate( LeftTop, vOrientation );
-    LeftBottom = XMVector3Rotate( LeftBottom, vOrientation );
-
-    XMVECTOR Corners0 = vOrigin + RightTop * vNear;
-    XMVECTOR Corners1 = vOrigin + RightBottom * vNear;
-    XMVECTOR Corners2 = vOrigin + LeftTop * vNear;
-    XMVECTOR Corners3 = vOrigin + LeftBottom * vNear;
-    XMVECTOR Corners4 = vOrigin + RightTop * vFar;
-    XMVECTOR Corners5 = vOrigin + RightBottom * vFar;
-    XMVECTOR Corners6 = vOrigin + LeftTop * vFar;
-    XMVECTOR Corners7 = vOrigin + LeftBottom * vFar;
-
-    XMVECTOR Outside, Inside;
-    DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3,
-                                                  Corners4, Corners5, Corners6, Corners7,
-                                                  Plane, Outside, Inside );
-
-    // If the frustum is outside any plane it is outside.
-    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
-        return FRONT;
-
-    // If the frustum is inside all planes it is inside.
-    if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
-        return BACK;
-
-    // The frustum is not inside all planes or outside a plane, so it intersects.
-    return INTERSECTING;
-}
-
-
-//-----------------------------------------------------------------------------
-// Ray vs. frustum test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool XM_CALLCONV BoundingFrustum::Intersects( FXMVECTOR rayOrigin, FXMVECTOR Direction, float& Dist ) const
-{
-    // If the ray starts inside the frustum, return a distance of 0 for the hit.
-    if ( Contains(rayOrigin) == CONTAINS )
-    {
-        Dist = 0.0f;
-        return true;
-    }
-
-    // Build the frustum planes.
-    XMVECTOR Planes[6];
-    Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
-    Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
-    Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
-    Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
-    Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
-    Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
-
-    // Load origin and orientation of the frustum.
-    XMVECTOR frOrigin = XMLoadFloat3( &Origin );
-    XMVECTOR frOrientation = XMLoadFloat4( &Orientation );
-
-    // This algorithm is based on "Fast Ray-Convex Polyhedron Intersection" in James Arvo, ed., Graphics Gems II, pp. 247-250.
-    float tnear = -FLT_MAX;
-    float tfar = FLT_MAX;
-
-    for( size_t i=0; i < 6; ++i )
-    {
-        XMVECTOR Plane = DirectX::Internal::XMPlaneTransform( Planes[i], frOrientation, frOrigin );
-        Plane = XMPlaneNormalize( Plane );
-
-        XMVECTOR AxisDotOrigin = XMPlaneDotCoord( Plane, rayOrigin );
-        XMVECTOR AxisDotDirection = XMVector3Dot( Plane, Direction );
-
-        if ( XMVector3LessOrEqual( XMVectorAbs( AxisDotDirection ), g_RayEpsilon ) )
-        {
-            // Ray is parallel to plane - check if the ray origin is inside the plane's half-space.
-            if ( XMVector3Greater( AxisDotOrigin, g_XMZero ) )
-            {
-                // Ray origin is outside the half-space.
-                Dist = 0.f;
-                return false;
-            }
-        }
-        else
-        {
-            // Ray not parallel - get distance to plane.
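-            // For a plane (n, d), the hit point satisfies n dot (O + t*D) + d = 0,
-            // so t = -(n dot O + d) / (n dot D) = -vn / vd. The sign of vd tells us
-            // whether the ray crosses into (front face) or out of (back face) the
-            // plane's half-space.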
-            float vd = XMVectorGetX( AxisDotDirection );
-            float vn = XMVectorGetX( AxisDotOrigin );
-            float t = -vn / vd;
-            if (vd < 0.0f)
-            {
-                // Front face - T is a near point.
-                if (t > tfar)
-                {
-                    Dist = 0.f;
-                    return false;
-                }
-                if (t > tnear)
-                {
-                    // Hit near face.
-                    tnear = t;
-                }
-            }
-            else
-            {
-                // Back face - T is a far point.
-                if (t < tnear)
-                {
-                    Dist = 0.f;
-                    return false;
-                }
-                if (t < tfar)
-                {
-                    // Hit far face.
-                    tfar = t;
-                }
-            }
-        }
-    }
-
-    // Survived all tests.
-    // Note: if the ray originates on the polyhedron, you may want to change 0.0f to
-    // some epsilon to avoid intersecting the originating face.
-    float distance = ( tnear >= 0.0f ) ? tnear : tfar;
-    if (distance >= 0.0f)
-    {
-        Dist = distance;
-        return true;
-    }
-
-    Dist = 0.f;
-    return false;
-}
-
-
-//-----------------------------------------------------------------------------
-// Test a frustum vs 6 planes (typically forming another frustum).
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline ContainmentType XM_CALLCONV BoundingFrustum::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2,
-                                                                 GXMVECTOR Plane3, HXMVECTOR Plane4, HXMVECTOR Plane5 ) const
-{
-    // Load origin and orientation of the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
-    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
-
-    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
-
-    // Set w of the origin to one so we can dot4 with a plane.
-    vOrigin = XMVectorInsert<0, 0, 0, 0, 1>( vOrigin, XMVectorSplatOne() );
-
-    // Build the corners of the frustum (in world space).
-    XMVECTOR RightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR RightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR LeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
-    XMVECTOR LeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
-    XMVECTOR vNear = XMVectorReplicatePtr( &Near );
-    XMVECTOR vFar = XMVectorReplicatePtr( &Far );
-
-    RightTop = XMVector3Rotate( RightTop, vOrientation );
-    RightBottom = XMVector3Rotate( RightBottom, vOrientation );
-    LeftTop = XMVector3Rotate( LeftTop, vOrientation );
-    LeftBottom = XMVector3Rotate( LeftBottom, vOrientation );
-
-    XMVECTOR Corners0 = vOrigin + RightTop * vNear;
-    XMVECTOR Corners1 = vOrigin + RightBottom * vNear;
-    XMVECTOR Corners2 = vOrigin + LeftTop * vNear;
-    XMVECTOR Corners3 = vOrigin + LeftBottom * vNear;
-    XMVECTOR Corners4 = vOrigin + RightTop * vFar;
-    XMVECTOR Corners5 = vOrigin + RightBottom * vFar;
-    XMVECTOR Corners6 = vOrigin + LeftTop * vFar;
-    XMVECTOR Corners7 = vOrigin + LeftBottom * vFar;
-
-    XMVECTOR Outside, Inside;
-
-    // Test against each plane.
- DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, - Corners4, Corners5, Corners6, Corners7, - Plane0, Outside, Inside ); - - XMVECTOR AnyOutside = Outside; - XMVECTOR AllInside = Inside; - - DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, - Corners4, Corners5, Corners6, Corners7, - Plane1, Outside, Inside ); - - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, - Corners4, Corners5, Corners6, Corners7, - Plane2, Outside, Inside ); - - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, - Corners4, Corners5, Corners6, Corners7, - Plane3, Outside, Inside ); - - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, - Corners4, Corners5, Corners6, Corners7, - Plane4, Outside, Inside ); - - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, - Corners4, Corners5, Corners6, Corners7, - Plane5, Outside, Inside ); - - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - // If the frustum is outside any plane it is outside. - if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) ) - return DISJOINT; - - // If the frustum is inside all planes it is inside. - if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) ) - return CONTAINS; - - // The frustum is not inside all planes or outside a plane, it may intersect. - return INTERSECTS; -} - - -//----------------------------------------------------------------------------- -// Build the 6 frustum planes from a frustum. -// -// The intended use for these routines is for fast culling to a view frustum. -// When the volume being tested against a view frustum is small relative to the -// view frustum it is usually either inside all six planes of the frustum -// (CONTAINS) or outside one of the planes of the frustum (DISJOINT). If neither -// of these cases is true then it may or may not be intersecting the frustum -// (INTERSECTS) -//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline void BoundingFrustum::GetPlanes( XMVECTOR* NearPlane, XMVECTOR* FarPlane, XMVECTOR* RightPlane, - XMVECTOR* LeftPlane, XMVECTOR* TopPlane, XMVECTOR* BottomPlane ) const -{ - // Load origin and orientation of the frustum. 
-    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
-    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
-
-    if (NearPlane)
-    {
-        XMVECTOR vNearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
-        vNearPlane = DirectX::Internal::XMPlaneTransform( vNearPlane, vOrientation, vOrigin );
-        *NearPlane = XMPlaneNormalize( vNearPlane );
-    }
-
-    if (FarPlane)
-    {
-        XMVECTOR vFarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
-        vFarPlane = DirectX::Internal::XMPlaneTransform( vFarPlane, vOrientation, vOrigin );
-        *FarPlane = XMPlaneNormalize( vFarPlane );
-    }
-
-    if (RightPlane)
-    {
-        XMVECTOR vRightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
-        vRightPlane = DirectX::Internal::XMPlaneTransform( vRightPlane, vOrientation, vOrigin );
-        *RightPlane = XMPlaneNormalize( vRightPlane );
-    }
-
-    if (LeftPlane)
-    {
-        XMVECTOR vLeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
-        vLeftPlane = DirectX::Internal::XMPlaneTransform( vLeftPlane, vOrientation, vOrigin );
-        *LeftPlane = XMPlaneNormalize( vLeftPlane );
-    }
-
-    if (TopPlane)
-    {
-        XMVECTOR vTopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
-        vTopPlane = DirectX::Internal::XMPlaneTransform( vTopPlane, vOrientation, vOrigin );
-        *TopPlane = XMPlaneNormalize( vTopPlane );
-    }
-
-    if (BottomPlane)
-    {
-        XMVECTOR vBottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
-        vBottomPlane = DirectX::Internal::XMPlaneTransform( vBottomPlane, vOrientation, vOrigin );
-        *BottomPlane = XMPlaneNormalize( vBottomPlane );
-    }
-}
-
-
-//-----------------------------------------------------------------------------
-// Build a frustum from a perspective projection matrix. The matrix may only
-// contain a projection; any rotation, translation or scale will cause the
-// constructed frustum to be incorrect.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV BoundingFrustum::CreateFromMatrix( BoundingFrustum& Out, FXMMATRIX Projection )
-{
-    // Corners of the projection frustum in homogeneous space.
-    static XMVECTORF32 HomogenousPoints[6] =
-    {
-        { 1.0f, 0.0f, 1.0f, 1.0f },  // right (at far plane)
-        { -1.0f, 0.0f, 1.0f, 1.0f }, // left
-        { 0.0f, 1.0f, 1.0f, 1.0f },  // top
-        { 0.0f, -1.0f, 1.0f, 1.0f }, // bottom
-
-        { 0.0f, 0.0f, 0.0f, 1.0f },  // near
-        { 0.0f, 0.0f, 1.0f, 1.0f }   // far
-    };
-
-    XMVECTOR Determinant;
-    XMMATRIX matInverse = XMMatrixInverse( &Determinant, Projection );
-
-    // Compute the frustum corners in view space.
-    XMVECTOR Points[6];
-
-    for( size_t i = 0; i < 6; ++i )
-    {
-        // Transform point.
-        Points[i] = XMVector4Transform( HomogenousPoints[i], matInverse );
-    }
-
-    Out.Origin = XMFLOAT3( 0.0f, 0.0f, 0.0f );
-    Out.Orientation = XMFLOAT4( 0.0f, 0.0f, 0.0f, 1.0f );
-
-    // Compute the slopes.
-    Points[0] = Points[0] * XMVectorReciprocal( XMVectorSplatZ( Points[0] ) );
-    Points[1] = Points[1] * XMVectorReciprocal( XMVectorSplatZ( Points[1] ) );
-    Points[2] = Points[2] * XMVectorReciprocal( XMVectorSplatZ( Points[2] ) );
-    Points[3] = Points[3] * XMVectorReciprocal( XMVectorSplatZ( Points[3] ) );
-
-    Out.RightSlope = XMVectorGetX( Points[0] );
-    Out.LeftSlope = XMVectorGetX( Points[1] );
-    Out.TopSlope = XMVectorGetY( Points[2] );
-    Out.BottomSlope = XMVectorGetY( Points[3] );
-
-    // Compute near and far.
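-    // The near and far test points were taken at z = 0 and z = 1 in projection
-    // space; after the inverse transform they are dehomogenized by dividing by w,
-    // and their resulting z values are the near and far plane distances.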
-    Points[4] = Points[4] * XMVectorReciprocal( XMVectorSplatW( Points[4] ) );
-    Points[5] = Points[5] * XMVectorReciprocal( XMVectorSplatW( Points[5] ) );
-
-    Out.Near = XMVectorGetZ( Points[4] );
-    Out.Far = XMVectorGetZ( Points[5] );
-}
-
-
-/****************************************************************************
- *
- * TriangleTests
- *
- ****************************************************************************/
-
-namespace TriangleTests
-{
-
-//-----------------------------------------------------------------------------
-// Compute the intersection of a ray (Origin, Direction) with a triangle
-// (V0, V1, V2). Return true if there is an intersection and also set Dist
-// to the distance along the ray to the intersection.
-//
-// The algorithm is based on Moller, Tomas and Trumbore, "Fast, Minimum Storage
-// Ray-Triangle Intersection", Journal of Graphics Tools, vol. 2, no. 1,
-// pp 21-28, 1997.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool XM_CALLCONV Intersects( FXMVECTOR Origin, FXMVECTOR Direction, FXMVECTOR V0, GXMVECTOR V1, HXMVECTOR V2, float& Dist )
-{
-    assert( DirectX::Internal::XMVector3IsUnit( Direction ) );
-
-    XMVECTOR Zero = XMVectorZero();
-
-    XMVECTOR e1 = V1 - V0;
-    XMVECTOR e2 = V2 - V0;
-
-    // p = Direction ^ e2;
-    XMVECTOR p = XMVector3Cross( Direction, e2 );
-
-    // det = e1 * p;
-    XMVECTOR det = XMVector3Dot( e1, p );
-
-    XMVECTOR u, v, t;
-
-    if( XMVector3GreaterOrEqual( det, g_RayEpsilon ) )
-    {
-        // Determinant is positive (front side of the triangle).
-        XMVECTOR s = Origin - V0;
-
-        // u = s * p;
-        u = XMVector3Dot( s, p );
-
-        XMVECTOR NoIntersection = XMVectorLess( u, Zero );
-        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( u, det ) );
-
-        // q = s ^ e1;
-        XMVECTOR q = XMVector3Cross( s, e1 );
-
-        // v = Direction * q;
-        v = XMVector3Dot( Direction, q );
-
-        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( v, Zero ) );
-        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( u + v, det ) );
-
-        // t = e2 * q;
-        t = XMVector3Dot( e2, q );
-
-        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( t, Zero ) );
-
-        if( XMVector4EqualInt( NoIntersection, XMVectorTrueInt() ) )
-        {
-            Dist = 0.f;
-            return false;
-        }
-    }
-    else if( XMVector3LessOrEqual( det, g_RayNegEpsilon ) )
-    {
-        // Determinant is negative (back side of the triangle).
-        XMVECTOR s = Origin - V0;
-
-        // u = s * p;
-        u = XMVector3Dot( s, p );
-
-        XMVECTOR NoIntersection = XMVectorGreater( u, Zero );
-        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( u, det ) );
-
-        // q = s ^ e1;
-        XMVECTOR q = XMVector3Cross( s, e1 );
-
-        // v = Direction * q;
-        v = XMVector3Dot( Direction, q );
-
-        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( v, Zero ) );
-        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( u + v, det ) );
-
-        // t = e2 * q;
-        t = XMVector3Dot( e2, q );
-
-        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( t, Zero ) );
-
-        if ( XMVector4EqualInt( NoIntersection, XMVectorTrueInt() ) )
-        {
-            Dist = 0.f;
-            return false;
-        }
-    }
-    else
-    {
-        // Parallel ray.
-        Dist = 0.f;
-        return false;
-    }
-
-    t = XMVectorDivide( t, det );
-
-    // (u / det) and (v / det) are the barycentric coordinates of the intersection.
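-    // Because Direction is asserted to be unit length, t (now scaled by 1/det)
-    // is directly the distance along the ray to the intersection point.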
-
-    // Store the x-component to Dist.
-    XMStoreFloat( &Dist, t );
-
-    return true;
-}
-
-
-//-----------------------------------------------------------------------------
-// Test if two triangles intersect.
-//
-// The final test of the algorithm is based on Shen, Heng, and Tang, "A Fast
-// Triangle-Triangle Overlap Test Using Signed Distances", Journal of Graphics
-// Tools, vol. 8, no. 1, pp 17-23, 2003 and Guigue and Devillers, "Fast and
-// Robust Triangle-Triangle Overlap Test Using Orientation Predicates", Journal
-// of Graphics Tools, vol. 8, no. 1, pp 25-32, 2003.
-//
-// The final test could be considered an edge-edge separating plane test with
-// the 9 possible cases narrowed down to the only two pairs of edges that can
-// actually result in a separation.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline bool XM_CALLCONV Intersects( FXMVECTOR A0, FXMVECTOR A1, FXMVECTOR A2, GXMVECTOR B0, HXMVECTOR B1, HXMVECTOR B2 )
-{
-    static const XMVECTORU32 SelectY =
-    {
-        XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0
-    };
-    static const XMVECTORU32 SelectZ =
-    {
-        XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0
-    };
-    static const XMVECTORU32 Select0111 =
-    {
-        XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_1
-    };
-    static const XMVECTORU32 Select1011 =
-    {
-        XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1
-    };
-    static const XMVECTORU32 Select1101 =
-    {
-        XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1
-    };
-
-    XMVECTOR Zero = XMVectorZero();
-
-    // Compute the normal of triangle A.
-    XMVECTOR N1 = XMVector3Cross( A1 - A0, A2 - A0 );
-
-    // Assert that the triangle is not degenerate.
-    assert( !XMVector3Equal( N1, Zero ) );
-
-    // Test points of B against the plane of A.
-    XMVECTOR BDist = XMVector3Dot( N1, B0 - A0 );
-    BDist = XMVectorSelect( BDist, XMVector3Dot( N1, B1 - A0 ), SelectY );
-    BDist = XMVectorSelect( BDist, XMVector3Dot( N1, B2 - A0 ), SelectZ );
-
-    // Ensure robustness with co-planar triangles by zeroing small distances.
-    uint32_t BDistIsZeroCR;
-    XMVECTOR BDistIsZero = XMVectorGreaterR( &BDistIsZeroCR, g_RayEpsilon, XMVectorAbs( BDist ) );
-    BDist = XMVectorSelect( BDist, Zero, BDistIsZero );
-
-    uint32_t BDistIsLessCR;
-    XMVECTOR BDistIsLess = XMVectorGreaterR( &BDistIsLessCR, Zero, BDist );
-
-    uint32_t BDistIsGreaterCR;
-    XMVECTOR BDistIsGreater = XMVectorGreaterR( &BDistIsGreaterCR, BDist, Zero );
-
-    // If all the points are on the same side we don't intersect.
-    if( XMComparisonAllTrue( BDistIsLessCR ) || XMComparisonAllTrue( BDistIsGreaterCR ) )
-        return false;
-
-    // Compute the normal of triangle B.
-    XMVECTOR N2 = XMVector3Cross( B1 - B0, B2 - B0 );
-
-    // Assert that the triangle is not degenerate.
-    assert( !XMVector3Equal( N2, Zero ) );
-
-    // Test points of A against the plane of B.
-    XMVECTOR ADist = XMVector3Dot( N2, A0 - B0 );
-    ADist = XMVectorSelect( ADist, XMVector3Dot( N2, A1 - B0 ), SelectY );
-    ADist = XMVectorSelect( ADist, XMVector3Dot( N2, A2 - B0 ), SelectZ );
-
-    // Ensure robustness with co-planar triangles by zeroing small distances.
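-    // Distances within g_RayEpsilon of zero are snapped to zero so vertices that
-    // lie numerically on the other triangle's plane are classified consistently
-    // as co-planar instead of flip-flopping between sides.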
-    uint32_t ADistIsZeroCR;
-    XMVECTOR ADistIsZero = XMVectorGreaterR( &ADistIsZeroCR, g_RayEpsilon, XMVectorAbs( ADist ) );
-    ADist = XMVectorSelect( ADist, Zero, ADistIsZero );
-
-    uint32_t ADistIsLessCR;
-    XMVECTOR ADistIsLess = XMVectorGreaterR( &ADistIsLessCR, Zero, ADist );
-
-    uint32_t ADistIsGreaterCR;
-    XMVECTOR ADistIsGreater = XMVectorGreaterR( &ADistIsGreaterCR, ADist, Zero );
-
-    // If all the points are on the same side we don't intersect.
-    if( XMComparisonAllTrue( ADistIsLessCR ) || XMComparisonAllTrue( ADistIsGreaterCR ) )
-        return false;
-
-    // Special case for co-planar triangles.
-    if( XMComparisonAllTrue( ADistIsZeroCR ) || XMComparisonAllTrue( BDistIsZeroCR ) )
-    {
-        XMVECTOR Axis, Dist, MinDist;
-
-        // Compute an axis perpendicular to the edge (points out).
-        Axis = XMVector3Cross( N1, A1 - A0 );
-        Dist = XMVector3Dot( Axis, A0 );
-
-        // Test points of B against the axis.
-        MinDist = XMVector3Dot( B0, Axis );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( B1, Axis ) );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( B2, Axis ) );
-        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
-            return false;
-
-        // Edge (A1, A2)
-        Axis = XMVector3Cross( N1, A2 - A1 );
-        Dist = XMVector3Dot( Axis, A1 );
-
-        MinDist = XMVector3Dot( B0, Axis );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( B1, Axis ) );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( B2, Axis ) );
-        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
-            return false;
-
-        // Edge (A2, A0)
-        Axis = XMVector3Cross( N1, A0 - A2 );
-        Dist = XMVector3Dot( Axis, A2 );
-
-        MinDist = XMVector3Dot( B0, Axis );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( B1, Axis ) );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( B2, Axis ) );
-        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
-            return false;
-
-        // Edge (B0, B1)
-        Axis = XMVector3Cross( N2, B1 - B0 );
-        Dist = XMVector3Dot( Axis, B0 );
-
-        MinDist = XMVector3Dot( A0, Axis );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( A1, Axis ) );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( A2, Axis ) );
-        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
-            return false;
-
-        // Edge (B1, B2)
-        Axis = XMVector3Cross( N2, B2 - B1 );
-        Dist = XMVector3Dot( Axis, B1 );
-
-        MinDist = XMVector3Dot( A0, Axis );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( A1, Axis ) );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( A2, Axis ) );
-        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
-            return false;
-
-        // Edge (B2, B0)
-        Axis = XMVector3Cross( N2, B0 - B2 );
-        Dist = XMVector3Dot( Axis, B2 );
-
-        MinDist = XMVector3Dot( A0, Axis );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( A1, Axis ) );
-        MinDist = XMVectorMin( MinDist, XMVector3Dot( A2, Axis ) );
-        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
-            return false;
-
-        return true;
-    }
-
-    //
-    // Find the single vertex of A and B (i.e. the vertex on the opposite side
-    // of the plane from the other two) and reorder the edges so we can compute
-    // the signed edge/edge distances.
-    //
-    // if ( (V0 >= 0 && V1 < 0 && V2 < 0) ||
-    //      (V0 > 0 && V1 <= 0 && V2 <= 0) ||
-    //      (V0 <= 0 && V1 > 0 && V2 > 0) ||
-    //      (V0 < 0 && V1 >= 0 && V2 >= 0) ) then V0 is singular;
-    //
-    // If our singular vertex is not on the positive side of the plane we reverse
-    // the triangle winding so that the overlap comparisons will compare the
-    // correct edges with the correct signs.
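-    // For example, if A0 is the singular vertex, then edges (A0, A1) and
-    // (A0, A2) are the two edges of A that cross the plane of B, and they are
-    // the only edges that can take part in the signed distance tests below.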
- // - XMVECTOR ADistIsLessEqual = XMVectorOrInt( ADistIsLess, ADistIsZero ); - XMVECTOR ADistIsGreaterEqual = XMVectorOrInt( ADistIsGreater, ADistIsZero ); - - XMVECTOR AA0, AA1, AA2; - bool bPositiveA; - - if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreaterEqual, ADistIsLess, Select0111 ) ) || - DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreater, ADistIsLessEqual, Select0111 ) ) ) - { - // A0 is singular, crossing from positive to negative. - AA0 = A0; AA1 = A1; AA2 = A2; - bPositiveA = true; - } - else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLessEqual, ADistIsGreater, Select0111 ) ) || - DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLess, ADistIsGreaterEqual, Select0111 ) ) ) - { - // A0 is singular, crossing from negative to positive. - AA0 = A0; AA1 = A2; AA2 = A1; - bPositiveA = false; - } - else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreaterEqual, ADistIsLess, Select1011 ) ) || - DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreater, ADistIsLessEqual, Select1011 ) ) ) - { - // A1 is singular, crossing from positive to negative. - AA0 = A1; AA1 = A2; AA2 = A0; - bPositiveA = true; - } - else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLessEqual, ADistIsGreater, Select1011 ) ) || - DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLess, ADistIsGreaterEqual, Select1011 ) ) ) - { - // A1 is singular, crossing from negative to positive. - AA0 = A1; AA1 = A0; AA2 = A2; - bPositiveA = false; - } - else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreaterEqual, ADistIsLess, Select1101 ) ) || - DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreater, ADistIsLessEqual, Select1101 ) ) ) - { - // A2 is singular, crossing from positive to negative. - AA0 = A2; AA1 = A0; AA2 = A1; - bPositiveA = true; - } - else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLessEqual, ADistIsGreater, Select1101 ) ) || - DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLess, ADistIsGreaterEqual, Select1101 ) ) ) - { - // A2 is singular, crossing from negative to positive. - AA0 = A2; AA1 = A1; AA2 = A0; - bPositiveA = false; - } - else - { - assert( false ); - return false; - } - - XMVECTOR BDistIsLessEqual = XMVectorOrInt( BDistIsLess, BDistIsZero ); - XMVECTOR BDistIsGreaterEqual = XMVectorOrInt( BDistIsGreater, BDistIsZero ); - - XMVECTOR BB0, BB1, BB2; - bool bPositiveB; - - if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreaterEqual, BDistIsLess, Select0111 ) ) || - DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreater, BDistIsLessEqual, Select0111 ) ) ) - { - // B0 is singular, crossing from positive to negative. - BB0 = B0; BB1 = B1; BB2 = B2; - bPositiveB = true; - } - else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLessEqual, BDistIsGreater, Select0111 ) ) || - DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLess, BDistIsGreaterEqual, Select0111 ) ) ) - { - // B0 is singular, crossing from negative to positive. - BB0 = B0; BB1 = B2; BB2 = B1; - bPositiveB = false; - } - else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreaterEqual, BDistIsLess, Select1011 ) ) || - DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreater, BDistIsLessEqual, Select1011 ) ) ) - { - // B1 is singular, crossing from positive to negative. 
-        BB0 = B1; BB1 = B2; BB2 = B0;
-        bPositiveB = true;
-    }
-    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLessEqual, BDistIsGreater, Select1011 ) ) ||
-             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLess, BDistIsGreaterEqual, Select1011 ) ) )
-    {
-        // B1 is singular, crossing from negative to positive.
-        BB0 = B1; BB1 = B0; BB2 = B2;
-        bPositiveB = false;
-    }
-    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreaterEqual, BDistIsLess, Select1101 ) ) ||
-             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreater, BDistIsLessEqual, Select1101 ) ) )
-    {
-        // B2 is singular, crossing from positive to negative.
-        BB0 = B2; BB1 = B0; BB2 = B1;
-        bPositiveB = true;
-    }
-    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLessEqual, BDistIsGreater, Select1101 ) ) ||
-             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLess, BDistIsGreaterEqual, Select1101 ) ) )
-    {
-        // B2 is singular, crossing from negative to positive.
-        BB0 = B2; BB1 = B1; BB2 = B0;
-        bPositiveB = false;
-    }
-    else
-    {
-        assert( false );
-        return false;
-    }
-
-    XMVECTOR Delta0, Delta1;
-
-    // Reverse the direction of the test depending on whether the singular vertices are
-    // the same sign or different signs.
-    if( bPositiveA ^ bPositiveB )
-    {
-        Delta0 = ( BB0 - AA0 );
-        Delta1 = ( AA0 - BB0 );
-    }
-    else
-    {
-        Delta0 = ( AA0 - BB0 );
-        Delta1 = ( BB0 - AA0 );
-    }
-
-    // Check if the triangles overlap on the line of intersection between the
-    // planes of the two triangles by finding the signed line distances.
-    XMVECTOR Dist0 = XMVector3Dot( Delta0, XMVector3Cross( ( BB2 - BB0 ), ( AA2 - AA0 ) ) );
-    if( XMVector4Greater( Dist0, Zero ) )
-        return false;
-
-    XMVECTOR Dist1 = XMVector3Dot( Delta1, XMVector3Cross( ( BB1 - BB0 ), ( AA1 - AA0 ) ) );
-    if( XMVector4Greater( Dist1, Zero ) )
-        return false;
-
-    return true;
-}
-
-
-//-----------------------------------------------------------------------------
-// Plane-triangle test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_
-inline PlaneIntersectionType XM_CALLCONV Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, GXMVECTOR Plane )
-{
-    XMVECTOR One = XMVectorSplatOne();
-
-    assert( DirectX::Internal::XMPlaneIsUnit( Plane ) );
-
-    // Set w of the points to one so we can dot4 with a plane.
-    XMVECTOR TV0 = XMVectorInsert<0, 0, 0, 0, 1>(V0, One);
-    XMVECTOR TV1 = XMVectorInsert<0, 0, 0, 0, 1>(V1, One);
-    XMVECTOR TV2 = XMVectorInsert<0, 0, 0, 0, 1>(V2, One);
-
-    XMVECTOR Outside, Inside;
-    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane, Outside, Inside );
-
-    // If the triangle is outside any plane it is outside.
-    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
-        return FRONT;
-
-    // If the triangle is inside all planes it is inside.
-    if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
-        return BACK;
-
-    // The triangle is not inside all planes or outside a plane, so it intersects.
-    return INTERSECTING;
-}
-
-
-//-----------------------------------------------------------------------------
-// Test a triangle vs 6 planes (typically forming a frustum).
-//----------------------------------------------------------------------------- -_Use_decl_annotations_ -inline ContainmentType XM_CALLCONV ContainedBy( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, - GXMVECTOR Plane0, HXMVECTOR Plane1, HXMVECTOR Plane2, - CXMVECTOR Plane3, CXMVECTOR Plane4, CXMVECTOR Plane5 ) -{ - XMVECTOR One = XMVectorSplatOne(); - - // Set w of the points to one so we can dot4 with a plane. - XMVECTOR TV0 = XMVectorInsert<0, 0, 0, 0, 1>(V0, One); - XMVECTOR TV1 = XMVectorInsert<0, 0, 0, 0, 1>(V1, One); - XMVECTOR TV2 = XMVectorInsert<0, 0, 0, 0, 1>(V2, One); - - XMVECTOR Outside, Inside; - - // Test against each plane. - DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane0, Outside, Inside ); - - XMVECTOR AnyOutside = Outside; - XMVECTOR AllInside = Inside; - - DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane1, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane2, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane3, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane4, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane5, Outside, Inside ); - AnyOutside = XMVectorOrInt( AnyOutside, Outside ); - AllInside = XMVectorAndInt( AllInside, Inside ); - - // If the triangle is outside any plane it is outside. - if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) ) - return DISJOINT; - - // If the triangle is inside all planes it is inside. - if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) ) - return CONTAINS; - - // The triangle is not inside all planes or outside a plane, it may intersect. - return INTERSECTS; -} - -}; // namespace TriangleTests - +//------------------------------------------------------------------------------------- +// DirectXCollision.inl -- C++ Collision Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+XMGLOBALCONST XMVECTORF32 g_BoxOffset[8] =
+{
+    { -1.0f, -1.0f, 1.0f, 0.0f },
+    { 1.0f, -1.0f, 1.0f, 0.0f },
+    { 1.0f, 1.0f, 1.0f, 0.0f },
+    { -1.0f, 1.0f, 1.0f, 0.0f },
+    { -1.0f, -1.0f, -1.0f, 0.0f },
+    { 1.0f, -1.0f, -1.0f, 0.0f },
+    { 1.0f, 1.0f, -1.0f, 0.0f },
+    { -1.0f, 1.0f, -1.0f, 0.0f },
+};
+
+XMGLOBALCONST XMVECTORF32 g_RayEpsilon = { 1e-20f, 1e-20f, 1e-20f, 1e-20f };
+XMGLOBALCONST XMVECTORF32 g_RayNegEpsilon = { -1e-20f, -1e-20f, -1e-20f, -1e-20f };
+XMGLOBALCONST XMVECTORF32 g_FltMin = { -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX };
+XMGLOBALCONST XMVECTORF32 g_FltMax = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX };
+
+namespace Internal
+{
+
+//-----------------------------------------------------------------------------
+// Return true if any of the elements of a 3 vector are equal to 0xffffffff.
+// Slightly more efficient than using XMVector3EqualInt.
+//-----------------------------------------------------------------------------
+inline bool XMVector3AnyTrue( _In_ FXMVECTOR V )
+{
+    // Duplicate the fourth element from the first element.
+    XMVECTOR C = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X>(V);
+
+    return XMComparisonAnyTrue( XMVector4EqualIntR( C, XMVectorTrueInt() ) );
+}
+
+
+//-----------------------------------------------------------------------------
+// Return true if all of the elements of a 3 vector are equal to 0xffffffff.
+// Slightly more efficient than using XMVector3EqualInt.
+//-----------------------------------------------------------------------------
+inline bool XMVector3AllTrue( _In_ FXMVECTOR V )
+{
+    // Duplicate the fourth element from the first element.
+    XMVECTOR C = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X>( V );
+
+    return XMComparisonAllTrue( XMVector4EqualIntR( C, XMVectorTrueInt() ) );
+}
+
+#if defined(_PREFAST_) || !defined(NDEBUG)
+
+XMGLOBALCONST XMVECTORF32 g_UnitVectorEpsilon = { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f };
+XMGLOBALCONST XMVECTORF32 g_UnitQuaternionEpsilon = { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f };
+XMGLOBALCONST XMVECTORF32 g_UnitPlaneEpsilon = { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f };
+
+//-----------------------------------------------------------------------------
+// Return true if the vector is a unit vector (length == 1).
+//-----------------------------------------------------------------------------
+inline bool XMVector3IsUnit( _In_ FXMVECTOR V )
+{
+    XMVECTOR Difference = XMVector3Length( V ) - XMVectorSplatOne();
+    return XMVector4Less( XMVectorAbs( Difference ), g_UnitVectorEpsilon );
+}
+
+//-----------------------------------------------------------------------------
+// Return true if the quaternion is a unit quaternion.
+//-----------------------------------------------------------------------------
+inline bool XMQuaternionIsUnit( _In_ FXMVECTOR Q )
+{
+    XMVECTOR Difference = XMVector4Length( Q ) - XMVectorSplatOne();
+    return XMVector4Less( XMVectorAbs( Difference ), g_UnitQuaternionEpsilon );
+}
+
+//-----------------------------------------------------------------------------
+// Return true if the plane is a unit plane.
+//-----------------------------------------------------------------------------
+inline bool XMPlaneIsUnit( _In_ FXMVECTOR Plane )
+{
+    XMVECTOR Difference = XMVector3Length( Plane ) - XMVectorSplatOne();
+    return XMVector4Less( XMVectorAbs( Difference ), g_UnitPlaneEpsilon );
+}
+
+#endif // _PREFAST_ || !NDEBUG
+
+//-----------------------------------------------------------------------------
+inline XMVECTOR XMPlaneTransform( _In_ FXMVECTOR Plane, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation )
+{
+    XMVECTOR vNormal = XMVector3Rotate( Plane, Rotation );
+    XMVECTOR vD = XMVectorSplatW( Plane ) - XMVector3Dot( vNormal, Translation );
+
+    return XMVectorInsert<0, 0, 0, 0, 1>( vNormal, vD );
+}
+
+//-----------------------------------------------------------------------------
+// Return the point on the line segment (S1, S2) nearest the point P.
+//-----------------------------------------------------------------------------
+inline XMVECTOR PointOnLineSegmentNearestPoint( _In_ FXMVECTOR S1, _In_ FXMVECTOR S2, _In_ FXMVECTOR P )
+{
+    XMVECTOR Dir = S2 - S1;
+    XMVECTOR Projection = ( XMVector3Dot( P, Dir ) - XMVector3Dot( S1, Dir ) );
+    XMVECTOR LengthSq = XMVector3Dot( Dir, Dir );
+
+    XMVECTOR t = Projection * XMVectorReciprocal( LengthSq );
+    XMVECTOR Point = S1 + t * Dir;
+
+    // t < 0
+    XMVECTOR SelectS1 = XMVectorLess( Projection, XMVectorZero() );
+    Point = XMVectorSelect( Point, S1, SelectS1 );
+
+    // t > 1
+    XMVECTOR SelectS2 = XMVectorGreater( Projection, LengthSq );
+    Point = XMVectorSelect( Point, S2, SelectS2 );
+
+    return Point;
+}
+
+//-----------------------------------------------------------------------------
+// Test if the point (P) on the plane of the triangle is inside the triangle
+// (V0, V1, V2).
+//-----------------------------------------------------------------------------
+inline XMVECTOR XM_CALLCONV PointOnPlaneInsideTriangle( _In_ FXMVECTOR P, _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ GXMVECTOR V2 )
+{
+    // Compute the triangle normal.
+    XMVECTOR N = XMVector3Cross( V2 - V0, V1 - V0 );
+
+    // Compute the cross products of the vector from the base of each edge to
+    // the point with each edge vector.
+    XMVECTOR C0 = XMVector3Cross( P - V0, V1 - V0 );
+    XMVECTOR C1 = XMVector3Cross( P - V1, V2 - V1 );
+    XMVECTOR C2 = XMVector3Cross( P - V2, V0 - V2 );
+
+    // If the cross product points in the same direction as the normal then the
+    // point is inside the edge (it is zero if the point is on the edge).
+    XMVECTOR Zero = XMVectorZero();
+    XMVECTOR Inside0 = XMVectorGreaterOrEqual( XMVector3Dot( C0, N ), Zero );
+    XMVECTOR Inside1 = XMVectorGreaterOrEqual( XMVector3Dot( C1, N ), Zero );
+    XMVECTOR Inside2 = XMVectorGreaterOrEqual( XMVector3Dot( C2, N ), Zero );
+
+    // If the point is inside all of the edges it is inside.
+ return XMVectorAndInt( XMVectorAndInt( Inside0, Inside1 ), Inside2 ); +} + +//----------------------------------------------------------------------------- +inline bool SolveCubic( _In_ float e, _In_ float f, _In_ float g, _Out_ float* t, _Out_ float* u, _Out_ float* v ) +{ + float p, q, h, rc, d, theta, costh3, sinth3; + + p = f - e * e / 3.0f; + q = g - e * f / 3.0f + e * e * e * 2.0f / 27.0f; + h = q * q / 4.0f + p * p * p / 27.0f; + + if( h > 0.0 ) + { + *t = *u = *v = 0.f; + return false; // only one real root + } + + if( ( h == 0.0 ) && ( q == 0.0 ) ) // all the same root + { + *t = - e / 3; + *u = - e / 3; + *v = - e / 3; + + return true; + } + + d = sqrtf( q * q / 4.0f - h ); + if( d < 0 ) + rc = -powf( -d, 1.0f / 3.0f ); + else + rc = powf( d, 1.0f / 3.0f ); + + theta = XMScalarACos( -q / ( 2.0f * d ) ); + costh3 = XMScalarCos( theta / 3.0f ); + sinth3 = sqrtf( 3.0f ) * XMScalarSin( theta / 3.0f ); + *t = 2.0f * rc * costh3 - e / 3.0f; + *u = -rc * ( costh3 + sinth3 ) - e / 3.0f; + *v = -rc * ( costh3 - sinth3 ) - e / 3.0f; + + return true; +} + +//----------------------------------------------------------------------------- +inline XMVECTOR CalculateEigenVector( _In_ float m11, _In_ float m12, _In_ float m13, + _In_ float m22, _In_ float m23, _In_ float m33, _In_ float e ) +{ + float fTmp[3]; + fTmp[0] = ( float )( m12 * m23 - m13 * ( m22 - e ) ); + fTmp[1] = ( float )( m13 * m12 - m23 * ( m11 - e ) ); + fTmp[2] = ( float )( ( m11 - e ) * ( m22 - e ) - m12 * m12 ); + + XMVECTOR vTmp = XMLoadFloat3( (XMFLOAT3*)fTmp ); + + if( XMVector3Equal( vTmp, XMVectorZero() ) ) // planar or linear + { + float f1, f2, f3; + + // we only have one equation - find a valid one + if( ( m11 - e != 0.0 ) || ( m12 != 0.0 ) || ( m13 != 0.0 ) ) + { + f1 = m11 - e; f2 = m12; f3 = m13; + } + else if( ( m12 != 0.0 ) || ( m22 - e != 0.0 ) || ( m23 != 0.0 ) ) + { + f1 = m12; f2 = m22 - e; f3 = m23; + } + else if( ( m13 != 0.0 ) || ( m23 != 0.0 ) || ( m33 - e != 0.0 ) ) + { + f1 = m13; f2 = m23; f3 = m33 - e; + } + else + { + // error, we'll just make something up - we have NO context + f1 = 1.0; f2 = 0.0; f3 = 0.0; + } + + if( f1 == 0.0 ) + vTmp = XMVectorSetX( vTmp, 0.0f ); + else + vTmp = XMVectorSetX( vTmp, 1.0f ); + + if( f2 == 0.0 ) + vTmp = XMVectorSetY( vTmp, 0.0f ); + else + vTmp = XMVectorSetY( vTmp, 1.0f ); + + if( f3 == 0.0 ) + { + vTmp = XMVectorSetZ( vTmp, 0.0f ); + // recalculate y to make equation work + if( m12 != 0.0 ) + vTmp = XMVectorSetY( vTmp, ( float )( -f1 / f2 ) ); + } + else + { + vTmp = XMVectorSetZ( vTmp, ( float )( ( f2 - f1 ) / f3 ) ); + } + } + + if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) > 1e-5f ) + { + return XMVector3Normalize( vTmp ); + } + else + { + // Multiply by a value large enough to make the vector non-zero. 
+ vTmp *= 1e5f;
+ return XMVector3Normalize( vTmp );
+ }
+}
+
+//-----------------------------------------------------------------------------
+inline bool CalculateEigenVectors( _In_ float m11, _In_ float m12, _In_ float m13,
+ _In_ float m22, _In_ float m23, _In_ float m33,
+ _In_ float e1, _In_ float e2, _In_ float e3,
+ _Out_ XMVECTOR* pV1, _Out_ XMVECTOR* pV2, _Out_ XMVECTOR* pV3 )
+{
+ *pV1 = DirectX::Internal::CalculateEigenVector( m11, m12, m13, m22, m23, m33, e1 );
+ *pV2 = DirectX::Internal::CalculateEigenVector( m11, m12, m13, m22, m23, m33, e2 );
+ *pV3 = DirectX::Internal::CalculateEigenVector( m11, m12, m13, m22, m23, m33, e3 );
+
+ bool v1z = false;
+ bool v2z = false;
+ bool v3z = false;
+
+ XMVECTOR Zero = XMVectorZero();
+
+ if ( XMVector3Equal( *pV1, Zero ) )
+ v1z = true;
+
+ if ( XMVector3Equal( *pV2, Zero ) )
+ v2z = true;
+
+ if ( XMVector3Equal( *pV3, Zero ) )
+ v3z = true;
+
+ bool e12 = ( fabsf( XMVectorGetX( XMVector3Dot( *pV1, *pV2 ) ) ) > 0.1f ); // check for non-orthogonal vectors
+ bool e13 = ( fabsf( XMVectorGetX( XMVector3Dot( *pV1, *pV3 ) ) ) > 0.1f );
+ bool e23 = ( fabsf( XMVectorGetX( XMVector3Dot( *pV2, *pV3 ) ) ) > 0.1f );
+
+ if( ( v1z && v2z && v3z ) || ( e12 && e13 && e23 ) ||
+ ( e12 && v3z ) || ( e13 && v2z ) || ( e23 && v1z ) ) // degenerate eigenvectors - fall back to an arbitrary orthonormal basis
+ {
+ *pV1 = g_XMIdentityR0.v;
+ *pV2 = g_XMIdentityR1.v;
+ *pV3 = g_XMIdentityR2.v;
+ return true;
+ }
+
+ if( v1z && v2z )
+ {
+ XMVECTOR vTmp = XMVector3Cross( g_XMIdentityR1, *pV3 );
+ if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) < 1e-5f )
+ {
+ vTmp = XMVector3Cross( g_XMIdentityR0, *pV3 );
+ }
+ *pV1 = XMVector3Normalize( vTmp );
+ *pV2 = XMVector3Cross( *pV3, *pV1 );
+ return true;
+ }
+
+ if( v3z && v1z )
+ {
+ XMVECTOR vTmp = XMVector3Cross( g_XMIdentityR1, *pV2 );
+ if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) < 1e-5f )
+ {
+ vTmp = XMVector3Cross( g_XMIdentityR0, *pV2 );
+ }
+ *pV3 = XMVector3Normalize( vTmp );
+ *pV1 = XMVector3Cross( *pV2, *pV3 );
+ return true;
+ }
+
+ if( v2z && v3z )
+ {
+ XMVECTOR vTmp = XMVector3Cross( g_XMIdentityR1, *pV1 );
+ if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) < 1e-5f )
+ {
+ vTmp = XMVector3Cross( g_XMIdentityR0, *pV1 );
+ }
+ *pV2 = XMVector3Normalize( vTmp );
+ *pV3 = XMVector3Cross( *pV1, *pV2 );
+ return true;
+ }
+
+ if( ( v1z ) || e12 )
+ {
+ *pV1 = XMVector3Cross( *pV2, *pV3 );
+ return true;
+ }
+
+ if( ( v2z ) || e23 )
+ {
+ *pV2 = XMVector3Cross( *pV3, *pV1 );
+ return true;
+ }
+
+ if( ( v3z ) || e13 )
+ {
+ *pV3 = XMVector3Cross( *pV1, *pV2 );
+ return true;
+ }
+
+ return true;
+}
+
+//-----------------------------------------------------------------------------
+inline bool CalculateEigenVectorsFromCovarianceMatrix( _In_ float Cxx, _In_ float Cyy, _In_ float Czz,
+ _In_ float Cxy, _In_ float Cxz, _In_ float Cyz,
+ _Out_ XMVECTOR* pV1, _Out_ XMVECTOR* pV2, _Out_ XMVECTOR* pV3 )
+{
+ // Calculate the eigenvalues by solving a cubic equation.
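+ // For the symmetric covariance matrix C this is the characteristic
+ // polynomial det( C - lambda*I ) = 0, expanded into the monic cubic
+ // lambda^3 + e*lambda^2 + f*lambda + g = 0 with the coefficients below.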
+ float e = -( Cxx + Cyy + Czz );
+ float f = Cxx * Cyy + Cyy * Czz + Czz * Cxx - Cxy * Cxy - Cxz * Cxz - Cyz * Cyz;
+ float g = Cxy * Cxy * Czz + Cxz * Cxz * Cyy + Cyz * Cyz * Cxx - Cxy * Cyz * Cxz * 2.0f - Cxx * Cyy * Czz;
+
+ float ev1, ev2, ev3;
+ if( !DirectX::Internal::SolveCubic( e, f, g, &ev1, &ev2, &ev3 ) )
+ {
+ // fall back to an arbitrary orthonormal basis
+ *pV1 = g_XMIdentityR0.v;
+ *pV2 = g_XMIdentityR1.v;
+ *pV3 = g_XMIdentityR2.v;
+ return false;
+ }
+
+ return DirectX::Internal::CalculateEigenVectors( Cxx, Cxy, Cxz, Cyy, Cyz, Czz, ev1, ev2, ev3, pV1, pV2, pV3 );
+}
+
+//-----------------------------------------------------------------------------
+inline void XM_CALLCONV FastIntersectTrianglePlane( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, GXMVECTOR Plane,
+ XMVECTOR& Outside, XMVECTOR& Inside )
+{
+ // Compute the signed distance of each vertex to the plane.
+ XMVECTOR Dist0 = XMVector4Dot( V0, Plane );
+ XMVECTOR Dist1 = XMVector4Dot( V1, Plane );
+ XMVECTOR Dist2 = XMVector4Dot( V2, Plane );
+
+ XMVECTOR MinDist = XMVectorMin( Dist0, Dist1 );
+ MinDist = XMVectorMin( MinDist, Dist2 );
+
+ XMVECTOR MaxDist = XMVectorMax( Dist0, Dist1 );
+ MaxDist = XMVectorMax( MaxDist, Dist2 );
+
+ XMVECTOR Zero = XMVectorZero();
+
+ // Outside the plane?
+ Outside = XMVectorGreater( MinDist, Zero );
+
+ // Fully inside the plane?
+ Inside = XMVectorLess( MaxDist, Zero );
+}
+
+//-----------------------------------------------------------------------------
+inline void FastIntersectSpherePlane( _In_ FXMVECTOR Center, _In_ FXMVECTOR Radius, _In_ FXMVECTOR Plane,
+ _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside )
+{
+ XMVECTOR Dist = XMVector4Dot( Center, Plane );
+
+ // Outside the plane?
+ Outside = XMVectorGreater( Dist, Radius );
+
+ // Fully inside the plane?
+ Inside = XMVectorLess( Dist, -Radius );
+}
+
+//-----------------------------------------------------------------------------
+inline void FastIntersectAxisAlignedBoxPlane( _In_ FXMVECTOR Center, _In_ FXMVECTOR Extents, _In_ FXMVECTOR Plane,
+ _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside )
+{
+ // Compute the distance to the center of the box.
+ XMVECTOR Dist = XMVector4Dot( Center, Plane );
+
+ // Project the axes of the box onto the normal of the plane. Half the
+ // length of the projection (sometimes called the "radius") is equal to
+ // h(u) * abs(n dot b(u)) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w))
+ // where h(i) are extents of the box, n is the plane normal, and b(i) are the
+ // axes of the box. In this case b(i) = [(1,0,0), (0,1,0), (0,0,1)].
+ XMVECTOR Radius = XMVector3Dot( Extents, XMVectorAbs( Plane ) );
+
+ // Outside the plane?
+ Outside = XMVectorGreater( Dist, Radius );
+
+ // Fully inside the plane?
+ Inside = XMVectorLess( Dist, -Radius );
+}
+
+//-----------------------------------------------------------------------------
+inline void XM_CALLCONV FastIntersectOrientedBoxPlane( _In_ FXMVECTOR Center, _In_ FXMVECTOR Extents, _In_ FXMVECTOR Axis0, _In_ GXMVECTOR Axis1,
+ _In_ HXMVECTOR Axis2, _In_ HXMVECTOR Plane, _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside )
+{
+ // Compute the distance to the center of the box.
+ XMVECTOR Dist = XMVector4Dot( Center, Plane );
+
+ // Project the axes of the box onto the normal of the plane. Half the
+ // length of the projection (sometimes called the "radius") is equal to
+ // h(u) * abs(n dot b(u)) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w))
+ // where h(i) are extents of the box, n is the plane normal, and b(i) are the
+ // axes of the box.
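+ // Gather n dot b(u), n dot b(v) and n dot b(w) into the x, y and z lanes,
+ // then take one more dot product against the extents to sum the projection.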
+ XMVECTOR Radius = XMVector3Dot( Plane, Axis0 );
+ Radius = XMVectorInsert<0, 0, 1, 0, 0>( Radius, XMVector3Dot( Plane, Axis1 ) );
+ Radius = XMVectorInsert<0, 0, 0, 1, 0>( Radius, XMVector3Dot( Plane, Axis2 ) );
+ Radius = XMVector3Dot( Extents, XMVectorAbs( Radius ) );
+
+ // Outside the plane?
+ Outside = XMVectorGreater( Dist, Radius );
+
+ // Fully inside the plane?
+ Inside = XMVectorLess( Dist, -Radius );
+}
+
+//-----------------------------------------------------------------------------
+inline void XM_CALLCONV FastIntersectFrustumPlane( _In_ FXMVECTOR Point0, _In_ FXMVECTOR Point1, _In_ FXMVECTOR Point2, _In_ GXMVECTOR Point3,
+ _In_ HXMVECTOR Point4, _In_ HXMVECTOR Point5, _In_ CXMVECTOR Point6, _In_ CXMVECTOR Point7,
+ _In_ CXMVECTOR Plane, _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside )
+{
+ // Find the min/max projection of the frustum onto the plane normal.
+ XMVECTOR Min, Max, Dist;
+
+ Min = Max = XMVector3Dot( Plane, Point0 );
+
+ Dist = XMVector3Dot( Plane, Point1 );
+ Min = XMVectorMin( Min, Dist );
+ Max = XMVectorMax( Max, Dist );
+
+ Dist = XMVector3Dot( Plane, Point2 );
+ Min = XMVectorMin( Min, Dist );
+ Max = XMVectorMax( Max, Dist );
+
+ Dist = XMVector3Dot( Plane, Point3 );
+ Min = XMVectorMin( Min, Dist );
+ Max = XMVectorMax( Max, Dist );
+
+ Dist = XMVector3Dot( Plane, Point4 );
+ Min = XMVectorMin( Min, Dist );
+ Max = XMVectorMax( Max, Dist );
+
+ Dist = XMVector3Dot( Plane, Point5 );
+ Min = XMVectorMin( Min, Dist );
+ Max = XMVectorMax( Max, Dist );
+
+ Dist = XMVector3Dot( Plane, Point6 );
+ Min = XMVectorMin( Min, Dist );
+ Max = XMVectorMax( Max, Dist );
+
+ Dist = XMVector3Dot( Plane, Point7 );
+ Min = XMVectorMin( Min, Dist );
+ Max = XMVectorMax( Max, Dist );
+
+ XMVECTOR PlaneDist = -XMVectorSplatW( Plane );
+
+ // Outside the plane?
+ Outside = XMVectorGreater( Min, PlaneDist );
+
+ // Fully inside the plane?
+ Inside = XMVectorLess( Max, PlaneDist );
+}
+
+}; // namespace Internal
+
+
+/****************************************************************************
+ *
+ * BoundingSphere
+ *
+ ****************************************************************************/
+
+//-----------------------------------------------------------------------------
+// Transform a sphere by an angle preserving transform.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingSphere::Transform( BoundingSphere& Out, FXMMATRIX M ) const
+{
+ // Load the center of the sphere.
+ XMVECTOR vCenter = XMLoadFloat3( &Center );
+
+ // Transform the center of the sphere.
+ XMVECTOR C = XMVector3Transform( vCenter, M );
+
+ XMVECTOR dX = XMVector3Dot( M.r[0], M.r[0] );
+ XMVECTOR dY = XMVector3Dot( M.r[1], M.r[1] );
+ XMVECTOR dZ = XMVector3Dot( M.r[2], M.r[2] );
+
+ XMVECTOR d = XMVectorMax( dX, XMVectorMax( dY, dZ ) );
+
+ // Store the center of the sphere.
+ XMStoreFloat3( &Out.Center, C );
+
+ // Scale the radius of the sphere.
+ float Scale = sqrtf( XMVectorGetX(d) );
+ Out.Radius = Radius * Scale;
+}
+
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingSphere::Transform( BoundingSphere& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const
+{
+ // Load the center of the sphere.
+ XMVECTOR vCenter = XMLoadFloat3( &Center );
+
+ // Transform the center of the sphere.
+ vCenter = XMVector3Rotate( vCenter * XMVectorReplicate( Scale ), Rotation ) + Translation;
+
+ // Store the center of the sphere.
+ XMStoreFloat3( &Out.Center, vCenter );
+
+ // Scale the radius of the sphere.
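+ // (An angle-preserving transform with uniform scale multiplies every
+ // distance by the same factor, so scaling the radius by Scale is exact.)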
+ Out.Radius = Radius * Scale; +} + + +//----------------------------------------------------------------------------- +// Point in sphere test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingSphere::Contains( FXMVECTOR Point ) const +{ + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + + XMVECTOR DistanceSquared = XMVector3LengthSq( Point - vCenter ); + XMVECTOR RadiusSquared = XMVectorMultiply( vRadius, vRadius ); + + return XMVector3LessOrEqual( DistanceSquared, RadiusSquared ) ? CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingSphere::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + if ( !Intersects(V0,V1,V2) ) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + XMVECTOR RadiusSquared = XMVectorMultiply( vRadius, vRadius ); + + XMVECTOR DistanceSquared = XMVector3LengthSq( V0 - vCenter ); + XMVECTOR Inside = XMVectorLessOrEqual(DistanceSquared, RadiusSquared); + + DistanceSquared = XMVector3LengthSq( V1 - vCenter ); + Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual(DistanceSquared, RadiusSquared) ); + + DistanceSquared = XMVector3LengthSq( V2 - vCenter ); + Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual(DistanceSquared, RadiusSquared) ); + + return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere in sphere test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains( const BoundingSphere& sh ) const +{ + XMVECTOR Center1 = XMLoadFloat3( &Center ); + float r1 = Radius; + + XMVECTOR Center2 = XMLoadFloat3( &sh.Center ); + float r2 = sh.Radius; + + XMVECTOR V = XMVectorSubtract( Center2, Center1 ); + + XMVECTOR Dist = XMVector3Length( V ); + + float d = XMVectorGetX( Dist ); + + return (r1 + r2 >= d) ? ((r1 - r2 >= d) ? CONTAINS : INTERSECTS) : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Axis-aligned box in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains( const BoundingBox& box ) const +{ + if ( !box.Intersects(*this) ) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + XMVECTOR RadiusSq = vRadius * vRadius; + + XMVECTOR boxCenter = XMLoadFloat3( &box.Center ); + XMVECTOR boxExtents = XMLoadFloat3( &box.Extents ); + + XMVECTOR InsideAll = XMVectorTrueInt(); + + XMVECTOR offset = boxCenter - vCenter; + + for( size_t i = 0; i < BoundingBox::CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVectorMultiplyAdd( boxExtents, g_BoxOffset[i], offset ); + XMVECTOR d = XMVector3LengthSq( C ); + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( d, RadiusSq ) ); + } + + return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? 
CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Oriented box in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains( const BoundingOrientedBox& box ) const +{ + if ( !box.Intersects(*this) ) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + XMVECTOR RadiusSq = vRadius * vRadius; + + XMVECTOR boxCenter = XMLoadFloat3( &box.Center ); + XMVECTOR boxExtents = XMLoadFloat3( &box.Extents ); + XMVECTOR boxOrientation = XMLoadFloat4( &box.Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( boxOrientation ) ); + + XMVECTOR InsideAll = XMVectorTrueInt(); + + for( size_t i = 0; i < BoundingOrientedBox::CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVector3Rotate( boxExtents * g_BoxOffset[i], boxOrientation ) + boxCenter; + XMVECTOR d = XMVector3LengthSq( XMVectorSubtract( vCenter, C ) ); + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( d, RadiusSq ) ); + } + + return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; + +} + + +//----------------------------------------------------------------------------- +// Frustum in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains( const BoundingFrustum& fr ) const +{ + if ( !fr.Intersects(*this) ) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + XMVECTOR RadiusSq = vRadius * vRadius; + + XMVECTOR vOrigin = XMLoadFloat3( &fr.Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &fr.Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Build the corners of the frustum. + XMVECTOR vRightTop = XMVectorSet( fr.RightSlope, fr.TopSlope, 1.0f, 0.0f ); + XMVECTOR vRightBottom = XMVectorSet( fr.RightSlope, fr.BottomSlope, 1.0f, 0.0f ); + XMVECTOR vLeftTop = XMVectorSet( fr.LeftSlope, fr.TopSlope, 1.0f, 0.0f ); + XMVECTOR vLeftBottom = XMVectorSet( fr.LeftSlope, fr.BottomSlope, 1.0f, 0.0f ); + XMVECTOR vNear = XMVectorReplicatePtr( &fr.Near ); + XMVECTOR vFar = XMVectorReplicatePtr( &fr.Far ); + + XMVECTOR Corners[BoundingFrustum::CORNER_COUNT]; + Corners[0] = vRightTop * vNear; + Corners[1] = vRightBottom * vNear; + Corners[2] = vLeftTop * vNear; + Corners[3] = vLeftBottom * vNear; + Corners[4] = vRightTop * vFar; + Corners[5] = vRightBottom * vFar; + Corners[6] = vLeftTop * vFar; + Corners[7] = vLeftBottom * vFar; + + XMVECTOR InsideAll = XMVectorTrueInt(); + for( size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVector3Rotate( Corners[i], vOrientation ) + vOrigin; + XMVECTOR d = XMVector3LengthSq( XMVectorSubtract( vCenter, C ) ); + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( d, RadiusSq ) ); + } + + return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere vs. sphere test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingSphere::Intersects( const BoundingSphere& sh ) const +{ + // Load A. + XMVECTOR vCenterA = XMLoadFloat3( &Center ); + XMVECTOR vRadiusA = XMVectorReplicatePtr( &Radius ); + + // Load B. 
+ XMVECTOR vCenterB = XMLoadFloat3( &sh.Center );
+ XMVECTOR vRadiusB = XMVectorReplicatePtr( &sh.Radius );
+
+ // Distance squared between centers.
+ XMVECTOR Delta = vCenterB - vCenterA;
+ XMVECTOR DistanceSquared = XMVector3LengthSq( Delta );
+
+ // Sum of the radii, squared.
+ XMVECTOR RadiusSquared = XMVectorAdd( vRadiusA, vRadiusB );
+ RadiusSquared = XMVectorMultiply( RadiusSquared, RadiusSquared );
+
+ return XMVector3LessOrEqual( DistanceSquared, RadiusSquared );
+}
+
+
+//-----------------------------------------------------------------------------
+// Box vs. sphere test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingSphere::Intersects( const BoundingBox& box ) const
+{
+ return box.Intersects( *this );
+}
+
+_Use_decl_annotations_
+inline bool BoundingSphere::Intersects( const BoundingOrientedBox& box ) const
+{
+ return box.Intersects( *this );
+}
+
+
+//-----------------------------------------------------------------------------
+// Frustum vs. sphere test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingSphere::Intersects( const BoundingFrustum& fr ) const
+{
+ return fr.Intersects( *this );
+}
+
+
+//-----------------------------------------------------------------------------
+// Triangle vs sphere test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV BoundingSphere::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const
+{
+ // Load the sphere.
+ XMVECTOR vCenter = XMLoadFloat3( &Center );
+ XMVECTOR vRadius = XMVectorReplicatePtr( &Radius );
+
+ // Compute the plane of the triangle (has to be normalized).
+ XMVECTOR N = XMVector3Normalize( XMVector3Cross( V1 - V0, V2 - V0 ) );
+
+ // Assert that the triangle is not degenerate.
+ assert( !XMVector3Equal( N, XMVectorZero() ) );
+
+ // Find the nearest feature on the triangle to the sphere.
+ XMVECTOR Dist = XMVector3Dot( vCenter - V0, N );
+
+ // If the center of the sphere is farther from the plane of the triangle than
+ // the radius of the sphere, then there cannot be an intersection.
+ XMVECTOR NoIntersection = XMVectorLess( Dist, -vRadius );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Dist, vRadius ) );
+
+ // Project the center of the sphere onto the plane of the triangle.
+ XMVECTOR Point = vCenter - ( N * Dist );
+
+ // Is it inside all the edges? If so we intersect because the distance
+ // to the plane is less than the radius.
+ XMVECTOR Intersection = DirectX::Internal::PointOnPlaneInsideTriangle( Point, V0, V1, V2 );
+
+ // Find the nearest point on each edge.
+ XMVECTOR RadiusSq = vRadius * vRadius;
+
+ // Edge 0,1
+ Point = DirectX::Internal::PointOnLineSegmentNearestPoint( V0, V1, vCenter );
+
+ // If the distance from the center of the sphere to the point is less than
+ // the radius of the sphere then it must intersect.
+ Intersection = XMVectorOrInt( Intersection, XMVectorLessOrEqual( XMVector3LengthSq( vCenter - Point ), RadiusSq ) );
+
+ // Edge 1,2
+ Point = DirectX::Internal::PointOnLineSegmentNearestPoint( V1, V2, vCenter );
+
+ // If the distance from the center of the sphere to the point is less than
+ // the radius of the sphere then it must intersect.
+ Intersection = XMVectorOrInt( Intersection, XMVectorLessOrEqual( XMVector3LengthSq( vCenter - Point ), RadiusSq ) );
+
+ // Edge 2,0
+ Point = DirectX::Internal::PointOnLineSegmentNearestPoint( V2, V0, vCenter );
+
+ // If the distance from the center of the sphere to the point is less than
+ // the radius of the sphere then it must intersect.
+ Intersection = XMVectorOrInt( Intersection, XMVectorLessOrEqual( XMVector3LengthSq( vCenter - Point ), RadiusSq ) );
+
+ return XMVector4EqualInt( XMVectorAndCInt( Intersection, NoIntersection ), XMVectorTrueInt() );
+}
+
+
+//-----------------------------------------------------------------------------
+// Sphere-plane intersection
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PlaneIntersectionType XM_CALLCONV BoundingSphere::Intersects( FXMVECTOR Plane ) const
+{
+ assert( DirectX::Internal::XMPlaneIsUnit( Plane ) );
+
+ // Load the sphere.
+ XMVECTOR vCenter = XMLoadFloat3( &Center );
+ XMVECTOR vRadius = XMVectorReplicatePtr( &Radius );
+
+ // Set w of the center to one so we can dot4 with a plane.
+ vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
+
+ XMVECTOR Outside, Inside;
+ DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane, Outside, Inside );
+
+ // If the sphere is fully in front of the plane it is in front.
+ if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+ return FRONT;
+
+ // If the sphere is fully behind the plane it is behind.
+ if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
+ return BACK;
+
+ // The sphere straddles the plane, so it intersects.
+ return INTERSECTING;
+}
+
+
+//-----------------------------------------------------------------------------
+// Compute the intersection of a ray (Origin, Direction) with a sphere.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV BoundingSphere::Intersects( FXMVECTOR Origin, FXMVECTOR Direction, float& Dist ) const
+{
+ assert( DirectX::Internal::XMVector3IsUnit( Direction ) );
+
+ XMVECTOR vCenter = XMLoadFloat3( &Center );
+ XMVECTOR vRadius = XMVectorReplicatePtr( &Radius );
+
+ // l is the vector from the ray origin to the center of the sphere.
+ XMVECTOR l = vCenter - Origin;
+
+ // s is the projection of l onto the ray direction.
+ XMVECTOR s = XMVector3Dot( l, Direction );
+
+ XMVECTOR l2 = XMVector3Dot( l, l );
+
+ XMVECTOR r2 = vRadius * vRadius;
+
+ // m2 is the squared distance from the center of the sphere to the projection.
+ XMVECTOR m2 = l2 - s * s;
+
+ XMVECTOR NoIntersection;
+
+ // If the ray origin is outside the sphere and the center of the sphere is
+ // behind the ray origin there is no intersection.
+ NoIntersection = XMVectorAndInt( XMVectorLess( s, XMVectorZero() ), XMVectorGreater( l2, r2 ) );
+
+ // If the squared distance from the center of the sphere to the projection
+ // is greater than the radius squared the ray will miss the sphere.
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( m2, r2 ) );
+
+ // The ray hits the sphere, compute the nearest intersection point.
+ XMVECTOR q = XMVectorSqrt( r2 - m2 );
+ XMVECTOR t1 = s - q;
+ XMVECTOR t2 = s + q;
+
+ XMVECTOR OriginInside = XMVectorLessOrEqual( l2, r2 );
+ XMVECTOR t = XMVectorSelect( t1, t2, OriginInside );
+
+ if( XMVector4NotEqualInt( NoIntersection, XMVectorTrueInt() ) )
+ {
+ // Store the x-component to Dist.
+ XMStoreFloat( &Dist, t ); + return true; + } + + Dist = 0.f; + return false; +} + + +//----------------------------------------------------------------------------- +// Test a sphere vs 6 planes (typically forming a frustum). +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingSphere::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2, + GXMVECTOR Plane3, HXMVECTOR Plane4, HXMVECTOR Plane5 ) const +{ + // Load the sphere. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + + // Set w of the center to one so we can dot4 with a plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() ); + + XMVECTOR Outside, Inside; + + // Test against each plane. + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane0, Outside, Inside ); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane1, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane2, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane3, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane4, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane5, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + // If the sphere is outside any plane it is outside. + if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) ) + return DISJOINT; + + // If the sphere is inside all planes it is inside. + if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) ) + return CONTAINS; + + // The sphere is not inside all planes or outside a plane, it may intersect. 
+ return INTERSECTS;
+}
+
+
+//-----------------------------------------------------------------------------
+// Creates a bounding sphere that contains two other bounding spheres
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingSphere::CreateMerged( BoundingSphere& Out, const BoundingSphere& S1, const BoundingSphere& S2 )
+{
+ XMVECTOR Center1 = XMLoadFloat3( &S1.Center );
+ float r1 = S1.Radius;
+
+ XMVECTOR Center2 = XMLoadFloat3( &S2.Center );
+ float r2 = S2.Radius;
+
+ XMVECTOR V = XMVectorSubtract( Center2, Center1 );
+
+ XMVECTOR Dist = XMVector3Length( V );
+
+ float d = XMVectorGetX(Dist);
+
+ if ( r1 + r2 >= d )
+ {
+ if ( r1 - r2 >= d )
+ {
+ Out = S1;
+ return;
+ }
+ else if ( r2 - r1 >= d )
+ {
+ Out = S2;
+ return;
+ }
+ }
+
+ XMVECTOR N = XMVectorDivide( V, Dist );
+
+ float t1 = XMMin( -r1, d-r2 );
+ float t2 = XMMax( r1, d+r2 );
+ float t_5 = (t2 - t1) * 0.5f;
+
+ XMVECTOR NCenter = XMVectorAdd( Center1, XMVectorMultiply( N, XMVectorReplicate( t_5 + t1 ) ) );
+
+ XMStoreFloat3( &Out.Center, NCenter );
+ Out.Radius = t_5;
+}
+
+
+//-----------------------------------------------------------------------------
+// Create sphere circumscribing the bounding box
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingSphere::CreateFromBoundingBox( BoundingSphere& Out, const BoundingBox& box )
+{
+ Out.Center = box.Center;
+ XMVECTOR vExtents = XMLoadFloat3( &box.Extents );
+ Out.Radius = XMVectorGetX( XMVector3Length( vExtents ) );
+}
+
+_Use_decl_annotations_
+inline void BoundingSphere::CreateFromBoundingBox( BoundingSphere& Out, const BoundingOrientedBox& box )
+{
+ // Bounding box orientation is irrelevant because a sphere is rotationally invariant
+ Out.Center = box.Center;
+ XMVECTOR vExtents = XMLoadFloat3( &box.Extents );
+ Out.Radius = XMVectorGetX( XMVector3Length( vExtents ) );
+}
+
+
+//-----------------------------------------------------------------------------
+// Find the approximate smallest enclosing bounding sphere for a set of
+// points. Exact computation of the smallest enclosing bounding sphere is
+// possible but is slower and requires a more complex algorithm.
+// The algorithm is based on Jack Ritter, "An Efficient Bounding Sphere",
+// Graphics Gems.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingSphere::CreateFromPoints( BoundingSphere& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride )
+{
+ assert( Count > 0 );
+ assert( pPoints );
+
+ // Find the points with minimum and maximum x, y, and z
+ XMVECTOR MinX, MaxX, MinY, MaxY, MinZ, MaxZ;
+
+ MinX = MaxX = MinY = MaxY = MinZ = MaxZ = XMLoadFloat3( pPoints );
+
+ for( size_t i = 1; i < Count; ++i )
+ {
+ XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) );
+
+ float px = XMVectorGetX( Point );
+ float py = XMVectorGetY( Point );
+ float pz = XMVectorGetZ( Point );
+
+ if( px < XMVectorGetX( MinX ) )
+ MinX = Point;
+
+ if( px > XMVectorGetX( MaxX ) )
+ MaxX = Point;
+
+ if( py < XMVectorGetY( MinY ) )
+ MinY = Point;
+
+ if( py > XMVectorGetY( MaxY ) )
+ MaxY = Point;
+
+ if( pz < XMVectorGetZ( MinZ ) )
+ MinZ = Point;
+
+ if( pz > XMVectorGetZ( MaxZ ) )
+ MaxZ = Point;
+ }
+
+ // Use the min/max pair that are farthest apart to form the initial sphere.
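+ // (Ritter's algorithm: this initial guess is then grown in the second
+ // pass below to enclose any point that falls outside the sphere.)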
+ XMVECTOR DeltaX = MaxX - MinX;
+ XMVECTOR DistX = XMVector3Length( DeltaX );
+
+ XMVECTOR DeltaY = MaxY - MinY;
+ XMVECTOR DistY = XMVector3Length( DeltaY );
+
+ XMVECTOR DeltaZ = MaxZ - MinZ;
+ XMVECTOR DistZ = XMVector3Length( DeltaZ );
+
+ XMVECTOR vCenter;
+ XMVECTOR vRadius;
+
+ if( XMVector3Greater( DistX, DistY ) )
+ {
+ if( XMVector3Greater( DistX, DistZ ) )
+ {
+ // Use min/max x.
+ vCenter = XMVectorLerp(MaxX,MinX,0.5f);
+ vRadius = DistX * 0.5f;
+ }
+ else
+ {
+ // Use min/max z.
+ vCenter = XMVectorLerp(MaxZ,MinZ,0.5f);
+ vRadius = DistZ * 0.5f;
+ }
+ }
+ else // Y >= X
+ {
+ if( XMVector3Greater( DistY, DistZ ) )
+ {
+ // Use min/max y.
+ vCenter = XMVectorLerp(MaxY,MinY,0.5f);
+ vRadius = DistY * 0.5f;
+ }
+ else
+ {
+ // Use min/max z.
+ vCenter = XMVectorLerp(MaxZ,MinZ,0.5f);
+ vRadius = DistZ * 0.5f;
+ }
+ }
+
+ // Add any points not inside the sphere.
+ for( size_t i = 0; i < Count; ++i )
+ {
+ XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) );
+
+ XMVECTOR Delta = Point - vCenter;
+
+ XMVECTOR Dist = XMVector3Length( Delta );
+
+ if( XMVector3Greater( Dist, vRadius ) )
+ {
+ // Adjust sphere to include the new point.
+ vRadius = ( vRadius + Dist ) * 0.5f;
+ vCenter += ( XMVectorReplicate( 1.0f ) - XMVectorDivide(vRadius,Dist) ) * Delta;
+ }
+ }
+
+ XMStoreFloat3( &Out.Center, vCenter );
+ XMStoreFloat( &Out.Radius, vRadius );
+}
+
+
+//-----------------------------------------------------------------------------
+// Create sphere containing frustum
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingSphere::CreateFromFrustum( BoundingSphere& Out, const BoundingFrustum& fr )
+{
+ XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT];
+ fr.GetCorners( Corners );
+ CreateFromPoints( Out, BoundingFrustum::CORNER_COUNT, Corners, sizeof(XMFLOAT3) );
+}
+
+
+/****************************************************************************
+ *
+ * BoundingBox
+ *
+ ****************************************************************************/
+
+//-----------------------------------------------------------------------------
+// Transform an axis aligned box by an angle preserving transform.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingBox::Transform( BoundingBox& Out, FXMMATRIX M ) const
+{
+ // Load center and extents.
+ XMVECTOR vCenter = XMLoadFloat3( &Center );
+ XMVECTOR vExtents = XMLoadFloat3( &Extents );
+
+ // Compute and transform the corners and find new min/max bounds.
+ XMVECTOR Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[0], vCenter );
+ Corner = XMVector3Transform( Corner, M );
+
+ XMVECTOR Min, Max;
+ Min = Max = Corner;
+
+ for( size_t i = 1; i < CORNER_COUNT; ++i )
+ {
+ Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[i], vCenter );
+ Corner = XMVector3Transform( Corner, M );
+
+ Min = XMVectorMin( Min, Corner );
+ Max = XMVectorMax( Max, Corner );
+ }
+
+ // Store center and extents.
+ XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f );
+ XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f );
+}
+
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingBox::Transform( BoundingBox& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const
+{
+ assert( DirectX::Internal::XMQuaternionIsUnit( Rotation ) );
+
+ // Load center and extents.
+ XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + XMVECTOR VectorScale = XMVectorReplicate( Scale ); + + // Compute and transform the corners and find new min/max bounds. + XMVECTOR Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[0], vCenter ); + Corner = XMVector3Rotate( Corner * VectorScale, Rotation ) + Translation; + + XMVECTOR Min, Max; + Min = Max = Corner; + + for( size_t i = 1; i < CORNER_COUNT; ++i ) + { + Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[i], vCenter ); + Corner = XMVector3Rotate( Corner * VectorScale, Rotation ) + Translation; + + Min = XMVectorMin( Min, Corner ); + Max = XMVectorMax( Max, Corner ); + } + + // Store center and extents. + XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f ); + XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f ); +} + + +//----------------------------------------------------------------------------- +// Get the corner points of the box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingBox::GetCorners( XMFLOAT3* Corners ) const +{ + assert( Corners != nullptr ); + + // Load the box + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + for( size_t i = 0; i < CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVectorMultiplyAdd( vExtents, g_BoxOffset[i], vCenter ); + XMStoreFloat3( &Corners[i], C ); + } +} + + +//----------------------------------------------------------------------------- +// Point in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingBox::Contains( FXMVECTOR Point ) const +{ + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + return XMVector3InBounds( Point - vCenter, vExtents ) ? CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingBox::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + if ( !Intersects(V0,V1,V2) ) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + XMVECTOR d = XMVectorAbs( V0 - vCenter ); + XMVECTOR Inside = XMVectorLessOrEqual( d, vExtents ); + + d = XMVectorAbs( V1 - vCenter ); + Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); + + d = XMVectorAbs( V2 - vCenter ); + Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); + + return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains( const BoundingSphere& sh ) const +{ + XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center ); + XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius ); + + XMVECTOR BoxCenter = XMLoadFloat3( &Center ); + XMVECTOR BoxExtents = XMLoadFloat3( &Extents ); + + XMVECTOR BoxMin = BoxCenter - BoxExtents; + XMVECTOR BoxMax = BoxCenter + BoxExtents; + + // Find the distance to the nearest point on the box. 
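+ // (This is essentially Arvo's box-sphere test: clamp the sphere center to
+ // the box per axis and compare the squared distance to the squared radius.)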
+ // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. + XMVECTOR LessThanMin = XMVectorLess( SphereCenter, BoxMin ); + XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxMax ); + + XMVECTOR MinDelta = SphereCenter - BoxMin; + XMVECTOR MaxDelta = SphereCenter - BoxMax; + + // Choose value for each dimension based on the comparison. + d = XMVectorSelect( d, MinDelta, LessThanMin ); + d = XMVectorSelect( d, MaxDelta, GreaterThanMax ); + + // Use a dot-product to square them and sum them together. + XMVECTOR d2 = XMVector3Dot( d, d ); + + if ( XMVector3Greater( d2, XMVectorMultiply( SphereRadius, SphereRadius ) ) ) + return DISJOINT; + + XMVECTOR InsideAll = XMVectorLessOrEqual( BoxMin + SphereRadius, SphereCenter ); + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( SphereCenter, BoxMax - SphereRadius ) ); + InsideAll = XMVectorAndInt( InsideAll, XMVectorGreater( BoxMax - BoxMin, SphereRadius ) ); + + return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Axis-aligned box in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains( const BoundingBox& box ) const +{ + XMVECTOR CenterA = XMLoadFloat3( &Center ); + XMVECTOR ExtentsA = XMLoadFloat3( &Extents ); + + XMVECTOR CenterB = XMLoadFloat3( &box.Center ); + XMVECTOR ExtentsB = XMLoadFloat3( &box.Extents ); + + XMVECTOR MinA = CenterA - ExtentsA; + XMVECTOR MaxA = CenterA + ExtentsA; + + XMVECTOR MinB = CenterB - ExtentsB; + XMVECTOR MaxB = CenterB + ExtentsB; + + // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then return false + XMVECTOR Disjoint = XMVectorOrInt( XMVectorGreater( MinA, MaxB ), XMVectorGreater( MinB, MaxA ) ); + + if ( DirectX::Internal::XMVector3AnyTrue( Disjoint ) ) + return DISJOINT; + + // for each i in (x, y, z) if a_min(i) <= b_min(i) and b_max(i) <= a_max(i) then A contains B + XMVECTOR Inside = XMVectorAndInt( XMVectorLessOrEqual( MinA, MinB ), XMVectorLessOrEqual( MaxB, MaxA ) ); + + return DirectX::Internal::XMVector3AllTrue( Inside ) ? 
CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Oriented box in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains( const BoundingOrientedBox& box ) const +{ + if ( !box.Intersects( *this ) ) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + // Subtract off the AABB center to remove a subtract below + XMVECTOR oCenter = XMLoadFloat3( &box.Center ) - vCenter; + + XMVECTOR oExtents = XMLoadFloat3( &box.Extents ); + XMVECTOR oOrientation = XMLoadFloat4( &box.Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( oOrientation ) ); + + XMVECTOR Inside = XMVectorTrueInt(); + + for( size_t i=0; i < BoundingOrientedBox::CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVector3Rotate( oExtents * g_BoxOffset[i], oOrientation ) + oCenter; + XMVECTOR d = XMVectorAbs(C); + Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); + } + + return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Frustum in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains( const BoundingFrustum& fr ) const +{ + if ( !fr.Intersects( *this ) ) + return DISJOINT; + + XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT]; + fr.GetCorners( Corners ); + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + XMVECTOR Inside = XMVectorTrueInt(); + + for( size_t i=0; i < BoundingFrustum::CORNER_COUNT; ++i ) + { + XMVECTOR Point = XMLoadFloat3( &Corners[i] ); + XMVECTOR d = XMVectorAbs( Point - vCenter ); + Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); + } + + return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere vs axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects( const BoundingSphere& sh ) const +{ + XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center ); + XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius ); + + XMVECTOR BoxCenter = XMLoadFloat3( &Center ); + XMVECTOR BoxExtents = XMLoadFloat3( &Extents ); + + XMVECTOR BoxMin = BoxCenter - BoxExtents; + XMVECTOR BoxMax = BoxCenter + BoxExtents; + + // Find the distance to the nearest point on the box. + // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. + XMVECTOR LessThanMin = XMVectorLess( SphereCenter, BoxMin ); + XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxMax ); + + XMVECTOR MinDelta = SphereCenter - BoxMin; + XMVECTOR MaxDelta = SphereCenter - BoxMax; + + // Choose value for each dimension based on the comparison. + d = XMVectorSelect( d, MinDelta, LessThanMin ); + d = XMVectorSelect( d, MaxDelta, GreaterThanMax ); + + // Use a dot-product to square them and sum them together. 
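+ // d2 = d.x^2 + d.y^2 + d.z^2, the squared distance from the sphere center
+ // to the nearest point on the box, computed in a single dot product.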
+ XMVECTOR d2 = XMVector3Dot( d, d ); + + return XMVector3LessOrEqual( d2, XMVectorMultiply( SphereRadius, SphereRadius ) ); +} + + +//----------------------------------------------------------------------------- +// Axis-aligned box vs. axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects( const BoundingBox& box ) const +{ + XMVECTOR CenterA = XMLoadFloat3( &Center ); + XMVECTOR ExtentsA = XMLoadFloat3( &Extents ); + + XMVECTOR CenterB = XMLoadFloat3( &box.Center ); + XMVECTOR ExtentsB = XMLoadFloat3( &box.Extents ); + + XMVECTOR MinA = CenterA - ExtentsA; + XMVECTOR MaxA = CenterA + ExtentsA; + + XMVECTOR MinB = CenterB - ExtentsB; + XMVECTOR MaxB = CenterB + ExtentsB; + + // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then return false + XMVECTOR Disjoint = XMVectorOrInt( XMVectorGreater( MinA, MaxB ), XMVectorGreater( MinB, MaxA ) ); + + return !DirectX::Internal::XMVector3AnyTrue( Disjoint ); +} + + +//----------------------------------------------------------------------------- +// Oriented box vs. axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects( const BoundingOrientedBox& box ) const +{ + return box.Intersects( *this ); +} + + +//----------------------------------------------------------------------------- +// Frustum vs. axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingBox::Intersects( const BoundingFrustum& fr ) const +{ + return fr.Intersects( *this ); +} + + +//----------------------------------------------------------------------------- +// Triangle vs. axis aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingBox::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + XMVECTOR Zero = XMVectorZero(); + + // Load the box. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + XMVECTOR BoxMin = vCenter - vExtents; + XMVECTOR BoxMax = vCenter + vExtents; + + // Test the axes of the box (in effect test the AAB against the minimal AAB + // around the triangle). + XMVECTOR TriMin = XMVectorMin( XMVectorMin( V0, V1 ), V2 ); + XMVECTOR TriMax = XMVectorMax( XMVectorMax( V0, V1 ), V2 ); + + // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then disjoint + XMVECTOR Disjoint = XMVectorOrInt( XMVectorGreater( TriMin, BoxMax ), XMVectorGreater( BoxMin, TriMax ) ); + if( DirectX::Internal::XMVector3AnyTrue( Disjoint ) ) + return false; + + // Test the plane of the triangle. + XMVECTOR Normal = XMVector3Cross( V1 - V0, V2 - V0 ); + XMVECTOR Dist = XMVector3Dot( Normal, V0 ); + + // Assert that the triangle is not degenerate. 
+ assert( !XMVector3Equal( Normal, Zero ) );
+
+ // for each i in (x, y, z) if n(i) >= 0 then v_min(i)=b_min(i), v_max(i)=b_max(i)
+ // else v_min(i)=b_max(i), v_max(i)=b_min(i)
+ XMVECTOR NormalSelect = XMVectorGreater( Normal, Zero );
+ XMVECTOR V_Min = XMVectorSelect( BoxMax, BoxMin, NormalSelect );
+ XMVECTOR V_Max = XMVectorSelect( BoxMin, BoxMax, NormalSelect );
+
+ // if n dot v_min + d > 0 || n dot v_max + d < 0 then disjoint
+ XMVECTOR MinDist = XMVector3Dot( V_Min, Normal );
+ XMVECTOR MaxDist = XMVector3Dot( V_Max, Normal );
+
+ XMVECTOR NoIntersection = XMVectorGreater( MinDist, Dist );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( MaxDist, Dist ) );
+
+ // Move the box center to zero to simplify the following tests.
+ XMVECTOR TV0 = V0 - vCenter;
+ XMVECTOR TV1 = V1 - vCenter;
+ XMVECTOR TV2 = V2 - vCenter;
+
+ // Test the edge/edge axes (3*3).
+ XMVECTOR e0 = TV1 - TV0;
+ XMVECTOR e1 = TV2 - TV1;
+ XMVECTOR e2 = TV0 - TV2;
+
+ // Make w zero.
+ e0 = XMVectorInsert<0, 0, 0, 0, 1>( e0, Zero );
+ e1 = XMVectorInsert<0, 0, 0, 0, 1>( e1, Zero );
+ e2 = XMVectorInsert<0, 0, 0, 0, 1>( e2, Zero );
+
+ XMVECTOR Axis;
+ XMVECTOR p0, p1, p2;
+ XMVECTOR Min, Max;
+ XMVECTOR Radius;
+
+ // Axis == (1,0,0) x e0 = (0, -e0.z, e0.y)
+ Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( e0, -e0 );
+ p0 = XMVector3Dot( TV0, Axis );
+ // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
+ p2 = XMVector3Dot( TV2, Axis );
+ Min = XMVectorMin( p0, p2 );
+ Max = XMVectorMax( p0, p2 );
+ Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+ // Axis == (1,0,0) x e1 = (0, -e1.z, e1.y)
+ Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( e1, -e1 );
+ p0 = XMVector3Dot( TV0, Axis );
+ p1 = XMVector3Dot( TV1, Axis );
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
+ Min = XMVectorMin( p0, p1 );
+ Max = XMVectorMax( p0, p1 );
+ Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+ // Axis == (1,0,0) x e2 = (0, -e2.z, e2.y)
+ Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( e2, -e2 );
+ p0 = XMVector3Dot( TV0, Axis );
+ p1 = XMVector3Dot( TV1, Axis );
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
+ Min = XMVectorMin( p0, p1 );
+ Max = XMVectorMax( p0, p1 );
+ Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+ // Axis == (0,1,0) x e0 = (e0.z, 0, -e0.x)
+ Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( e0, -e0 );
+ p0 = XMVector3Dot( TV0, Axis );
+ // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
+ p2 = XMVector3Dot( TV2, Axis );
+ Min = XMVectorMin( p0, p2 );
+ Max = XMVectorMax( p0, p2 );
+ Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+ // Axis == (0,1,0) x e1 = (e1.z, 0, -e1.x)
+ Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( e1, -e1 );
+ p0 = XMVector3Dot( TV0, Axis );
+ p1 = XMVector3Dot( TV1, Axis );
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
+ Min = XMVectorMin( p0, p1 );
+ Max = XMVectorMax( p0, p1 );
+ Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+ // Axis == (0,1,0) x e2 = (e2.z, 0, -e2.x)
+ Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( e2, -e2 );
+ p0 = XMVector3Dot( TV0, Axis );
+ p1 = XMVector3Dot( TV1, Axis );
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
+ Min = XMVectorMin( p0, p1 );
+ Max = XMVectorMax( p0, p1 );
+ Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+ // Axis == (0,0,1) x e0 = (-e0.y, e0.x, 0)
+ Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( e0, -e0 );
+ p0 = XMVector3Dot( TV0, Axis );
+ // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
+ p2 = XMVector3Dot( TV2, Axis );
+ Min = XMVectorMin( p0, p2 );
+ Max = XMVectorMax( p0, p2 );
+ Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+ // Axis == (0,0,1) x e1 = (-e1.y, e1.x, 0)
+ Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( e1, -e1 );
+ p0 = XMVector3Dot( TV0, Axis );
+ p1 = XMVector3Dot( TV1, Axis );
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
+ Min = XMVectorMin( p0, p1 );
+ Max = XMVectorMax( p0, p1 );
+ Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+ // Axis == (0,0,1) x e2 = (-e2.y, e2.x, 0)
+ Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( e2, -e2 );
+ p0 = XMVector3Dot( TV0, Axis );
+ p1 = XMVector3Dot( TV1, Axis );
+ // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
+ Min = XMVectorMin( p0, p1 );
+ Max = XMVectorMax( p0, p1 );
+ Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+ return XMVector4NotEqualInt( NoIntersection, XMVectorTrueInt() );
+}
+
+
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PlaneIntersectionType XM_CALLCONV BoundingBox::Intersects( FXMVECTOR Plane ) const
+{
+ assert( DirectX::Internal::XMPlaneIsUnit( Plane ) );
+
+ // Load the box.
+ XMVECTOR vCenter = XMLoadFloat3( &Center );
+ XMVECTOR vExtents = XMLoadFloat3( &Extents );
+
+ // Set w of the center to one so we can dot4 with a plane.
+ vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
+
+ XMVECTOR Outside, Inside;
+ DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane, Outside, Inside );
+
+ // If the box is fully in front of the plane it is in front.
+ if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+ return FRONT;
+
+ // If the box is fully behind the plane it is behind.
+ if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
+ return BACK;
+
+ // The box straddles the plane, so it intersects.
+ return INTERSECTING;
+}
+
+
+//-----------------------------------------------------------------------------
+// Compute the intersection of a ray (Origin, Direction) with an axis aligned
+// box using the slabs method.
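+// For each slab the ray enters at one of t1/t2 and exits at the other; the
+// ray hits the box iff the largest entry time is not greater than the
+// smallest exit time and the exit is not behind the ray origin (slabs the
+// ray is parallel to are handled separately).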
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV BoundingBox::Intersects( FXMVECTOR Origin, FXMVECTOR Direction, float& Dist ) const
+{
+ assert( DirectX::Internal::XMVector3IsUnit( Direction ) );
+
+ // Load the box.
+ XMVECTOR vCenter = XMLoadFloat3( &Center );
+ XMVECTOR vExtents = XMLoadFloat3( &Extents );
+
+ // Adjust ray origin to be relative to center of the box.
+ XMVECTOR TOrigin = vCenter - Origin;
+
+ // Compute the dot product against each axis of the box.
+ // Since the axes are (1,0,0), (0,1,0), (0,0,1) no computation is necessary.
+ XMVECTOR AxisDotOrigin = TOrigin;
+ XMVECTOR AxisDotDirection = Direction;
+
+ // if (fabs(AxisDotDirection) <= Epsilon) the ray is nearly parallel to the slab.
+ XMVECTOR IsParallel = XMVectorLessOrEqual( XMVectorAbs( AxisDotDirection ), g_RayEpsilon );
+
+ // Test against all three axes simultaneously.
+ XMVECTOR InverseAxisDotDirection = XMVectorReciprocal( AxisDotDirection );
+ XMVECTOR t1 = ( AxisDotOrigin - vExtents ) * InverseAxisDotDirection;
+ XMVECTOR t2 = ( AxisDotOrigin + vExtents ) * InverseAxisDotDirection;
+
+ // Compute the max of min(t1,t2) and the min of max(t1,t2) ensuring we don't
+ // use the results from any directions parallel to the slab.
+ XMVECTOR t_min = XMVectorSelect( XMVectorMin( t1, t2 ), g_FltMin, IsParallel );
+ XMVECTOR t_max = XMVectorSelect( XMVectorMax( t1, t2 ), g_FltMax, IsParallel );
+
+ // t_min.x = maximum( t_min.x, t_min.y, t_min.z );
+ // t_max.x = minimum( t_max.x, t_max.y, t_max.z );
+ t_min = XMVectorMax( t_min, XMVectorSplatY( t_min ) ); // x = max(x,y)
+ t_min = XMVectorMax( t_min, XMVectorSplatZ( t_min ) ); // x = max(max(x,y),z)
+ t_max = XMVectorMin( t_max, XMVectorSplatY( t_max ) ); // x = min(x,y)
+ t_max = XMVectorMin( t_max, XMVectorSplatZ( t_max ) ); // x = min(min(x,y),z)
+
+ // if ( t_min > t_max ) return false;
+ XMVECTOR NoIntersection = XMVectorGreater( XMVectorSplatX( t_min ), XMVectorSplatX( t_max ) );
+
+ // if ( t_max < 0.0f ) return false;
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( XMVectorSplatX( t_max ), XMVectorZero() ) );
+
+ // if (IsParallel && (-Extents > AxisDotOrigin || Extents < AxisDotOrigin)) return false;
+ XMVECTOR ParallelOverlap = XMVectorInBounds( AxisDotOrigin, vExtents );
+ NoIntersection = XMVectorOrInt( NoIntersection, XMVectorAndCInt( IsParallel, ParallelOverlap ) );
+
+ if( !DirectX::Internal::XMVector3AnyTrue( NoIntersection ) )
+ {
+ // Store the x-component to Dist
+ XMStoreFloat( &Dist, t_min );
+ return true;
+ }
+
+ Dist = 0.f;
+ return false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Test an axis-aligned box vs 6 planes (typically forming a frustum).
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType XM_CALLCONV BoundingBox::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2,
+ GXMVECTOR Plane3, HXMVECTOR Plane4, HXMVECTOR Plane5 ) const
+{
+ // Load the box.
+ XMVECTOR vCenter = XMLoadFloat3( &Center );
+ XMVECTOR vExtents = XMLoadFloat3( &Extents );
+
+ // Set w of the center to one so we can dot4 with a plane.
+ vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
+
+ XMVECTOR Outside, Inside;
+
+ // Test against each plane.
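+ // (AnyOutside accumulates with OR: outside any single plane means disjoint.
+ // AllInside accumulates with AND: inside every plane means fully contained.)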
+ DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane0, Outside, Inside ); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane1, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane2, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane3, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane4, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane5, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + // If the box is outside any plane it is outside. + if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) ) + return DISJOINT; + + // If the box is inside all planes it is inside. + if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) ) + return CONTAINS; + + // The box is not inside all planes or outside a plane, it may intersect. + return INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Create axis-aligned box that contains two other bounding boxes +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingBox::CreateMerged( BoundingBox& Out, const BoundingBox& b1, const BoundingBox& b2 ) +{ + XMVECTOR b1Center = XMLoadFloat3( &b1.Center ); + XMVECTOR b1Extents = XMLoadFloat3( &b1.Extents ); + + XMVECTOR b2Center = XMLoadFloat3( &b2.Center ); + XMVECTOR b2Extents = XMLoadFloat3( &b2.Extents ); + + XMVECTOR Min = XMVectorSubtract( b1Center, b1Extents ); + Min = XMVectorMin( Min, XMVectorSubtract( b2Center, b2Extents ) ); + + XMVECTOR Max = XMVectorAdd( b1Center, b1Extents ); + Max = XMVectorMax( Max, XMVectorAdd( b2Center, b2Extents ) ); + + assert( XMVector3LessOrEqual( Min, Max ) ); + + XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f ); + XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f ); +} + + +//----------------------------------------------------------------------------- +// Create axis-aligned box that contains a bounding sphere +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingBox::CreateFromSphere( BoundingBox& Out, const BoundingSphere& sh ) +{ + XMVECTOR spCenter = XMLoadFloat3( &sh.Center ); + XMVECTOR shRadius = XMVectorReplicatePtr( &sh.Radius ); + + XMVECTOR Min = XMVectorSubtract( spCenter, shRadius ); + XMVECTOR Max = XMVectorAdd( spCenter, shRadius ); + + assert( XMVector3LessOrEqual( Min, Max ) ); + + XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f ); + XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f ); +} + + +//----------------------------------------------------------------------------- +// Create axis-aligned box from min/max points +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void XM_CALLCONV 
+{
+    XMVECTOR Min = XMVectorMin( pt1, pt2 );
+    XMVECTOR Max = XMVectorMax( pt1, pt2 );
+
+    // Store center and extents.
+    XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f );
+    XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f );
+}
+
+
+//-----------------------------------------------------------------------------
+// Find the minimum axis aligned bounding box containing a set of points.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingBox::CreateFromPoints( BoundingBox& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride )
+{
+    assert( Count > 0 );
+    assert( pPoints );
+
+    // Find the minimum and maximum x, y, and z
+    XMVECTOR vMin, vMax;
+
+    vMin = vMax = XMLoadFloat3( pPoints );
+
+    for( size_t i = 1; i < Count; ++i )
+    {
+        XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) );
+
+        vMin = XMVectorMin( vMin, Point );
+        vMax = XMVectorMax( vMax, Point );
+    }
+
+    // Store center and extents.
+    XMStoreFloat3( &Out.Center, ( vMin + vMax ) * 0.5f );
+    XMStoreFloat3( &Out.Extents, ( vMax - vMin ) * 0.5f );
+}
+
+
+/****************************************************************************
+ *
+ * BoundingOrientedBox
+ *
+ ****************************************************************************/
+
+//-----------------------------------------------------------------------------
+// Transform an oriented box by an angle preserving transform.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingOrientedBox::Transform( BoundingOrientedBox& Out, FXMMATRIX M ) const
+{
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vExtents = XMLoadFloat3( &Extents );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
+
+    // Composite the box rotation and the transform rotation.
+    XMMATRIX nM;
+    nM.r[0] = XMVector3Normalize( M.r[0] );
+    nM.r[1] = XMVector3Normalize( M.r[1] );
+    nM.r[2] = XMVector3Normalize( M.r[2] );
+    nM.r[3] = g_XMIdentityR3;
+    XMVECTOR Rotation = XMQuaternionRotationMatrix( nM );
+    vOrientation = XMQuaternionMultiply( vOrientation, Rotation );
+
+    // Transform the center.
+    vCenter = XMVector3Transform( vCenter, M );
+
+    // Scale the box extents.
+    XMVECTOR dX = XMVector3Length( M.r[0] );
+    XMVECTOR dY = XMVector3Length( M.r[1] );
+    XMVECTOR dZ = XMVector3Length( M.r[2] );
+
+    XMVECTOR VectorScale = XMVectorSelect( dY, dX, g_XMSelect1000 );
+    VectorScale = XMVectorSelect( dZ, VectorScale, g_XMSelect1100 );
+    vExtents = vExtents * VectorScale;
+
+    // Store the box.
+    XMStoreFloat3( &Out.Center, vCenter );
+    XMStoreFloat3( &Out.Extents, vExtents );
+    XMStoreFloat4( &Out.Orientation, vOrientation );
+}
+
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingOrientedBox::Transform( BoundingOrientedBox& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const
+{
+    assert( DirectX::Internal::XMQuaternionIsUnit( Rotation ) );
+
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vExtents = XMLoadFloat3( &Extents );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
+
+    // Composite the box rotation and the transform rotation.
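+    // Note XMQuaternionMultiply( Q1, Q2 ) composes the rotation Q1 followed by
+    // Q2, so the box's own orientation is applied first and the transform's
+    // rotation second.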
+ vOrientation = XMQuaternionMultiply( vOrientation, Rotation ); + + // Transform the center. + XMVECTOR VectorScale = XMVectorReplicate( Scale ); + vCenter = XMVector3Rotate( vCenter * VectorScale, Rotation ) + Translation; + + // Scale the box extents. + vExtents = vExtents * VectorScale; + + // Store the box. + XMStoreFloat3( &Out.Center, vCenter ); + XMStoreFloat3( &Out.Extents, vExtents ); + XMStoreFloat4( &Out.Orientation, vOrientation ); +} + + +//----------------------------------------------------------------------------- +// Get the corner points of the box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingOrientedBox::GetCorners( XMFLOAT3* Corners ) const +{ + assert( Corners != 0 ); + + // Load the box + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + for( size_t i = 0; i < CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVector3Rotate( vExtents * g_BoxOffset[i], vOrientation ) + vCenter; + XMStoreFloat3( &Corners[i], C ); + } +} + + +//----------------------------------------------------------------------------- +// Point in oriented box test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingOrientedBox::Contains( FXMVECTOR Point ) const +{ + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Transform the point to be local to the box. + XMVECTOR TPoint = XMVector3InverseRotate( Point - vCenter, vOrientation ); + + return XMVector3InBounds( TPoint, vExtents ) ? CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType XM_CALLCONV BoundingOrientedBox::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + // Load the box center & orientation. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Transform the triangle vertices into the space of the box. + XMVECTOR TV0 = XMVector3InverseRotate( V0 - vCenter, vOrientation ); + XMVECTOR TV1 = XMVector3InverseRotate( V1 - vCenter, vOrientation ); + XMVECTOR TV2 = XMVector3InverseRotate( V2 - vCenter, vOrientation ); + + BoundingBox box; + box.Center = XMFLOAT3( 0.0f, 0.0f, 0.0f ); + box.Extents = Extents; + + // Use the triangle vs axis aligned box intersection routine. 
+ return box.Contains( TV0, TV1, TV2 ); +} + + +//----------------------------------------------------------------------------- +// Sphere in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains( const BoundingSphere& sh ) const +{ + XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center ); + XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius ); + + XMVECTOR BoxCenter = XMLoadFloat3( &Center ); + XMVECTOR BoxExtents = XMLoadFloat3( &Extents ); + XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) ); + + // Transform the center of the sphere to be local to the box. + // BoxMin = -BoxExtents + // BoxMax = +BoxExtents + SphereCenter = XMVector3InverseRotate( SphereCenter - BoxCenter, BoxOrientation ); + + // Find the distance to the nearest point on the box. + // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. + XMVECTOR LessThanMin = XMVectorLess( SphereCenter, -BoxExtents ); + XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxExtents ); + + XMVECTOR MinDelta = SphereCenter + BoxExtents; + XMVECTOR MaxDelta = SphereCenter - BoxExtents; + + // Choose value for each dimension based on the comparison. + d = XMVectorSelect( d, MinDelta, LessThanMin ); + d = XMVectorSelect( d, MaxDelta, GreaterThanMax ); + + // Use a dot-product to square them and sum them together. + XMVECTOR d2 = XMVector3Dot( d, d ); + XMVECTOR SphereRadiusSq = XMVectorMultiply( SphereRadius, SphereRadius ); + + if ( XMVector4Greater( d2, SphereRadiusSq ) ) + return DISJOINT; + + // See if we are completely inside the box + XMVECTOR SMin = SphereCenter - SphereRadius; + XMVECTOR SMax = SphereCenter + SphereRadius; + + return ( XMVector3InBounds( SMin, BoxExtents ) && XMVector3InBounds( SMax, BoxExtents ) ) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Axis aligned box vs. oriented box. Constructs an oriented box and uses +// the oriented box vs. oriented box test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains( const BoundingBox& box ) const +{ + // Make the axis aligned box oriented and do an OBB vs OBB test. 
+ BoundingOrientedBox obox( box.Center, box.Extents, XMFLOAT4( 0.f, 0.f, 0.f, 1.f ) ); + return Contains( obox ); +} + + +//----------------------------------------------------------------------------- +// Oriented bounding box in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains( const BoundingOrientedBox& box ) const +{ + if ( !Intersects(box) ) + return DISJOINT; + + // Load the boxes + XMVECTOR aCenter = XMLoadFloat3( &Center ); + XMVECTOR aExtents = XMLoadFloat3( &Extents ); + XMVECTOR aOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( aOrientation ) ); + + XMVECTOR bCenter = XMLoadFloat3( &box.Center ); + XMVECTOR bExtents = XMLoadFloat3( &box.Extents ); + XMVECTOR bOrientation = XMLoadFloat4( &box.Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( bOrientation ) ); + + XMVECTOR offset = bCenter - aCenter; + + for( size_t i = 0; i < CORNER_COUNT; ++i ) + { + // Cb = rotate( bExtents * corneroffset[i], bOrientation ) + bcenter + // Ca = invrotate( Cb - aCenter, aOrientation ) + + XMVECTOR C = XMVector3Rotate( bExtents * g_BoxOffset[i], bOrientation ) + offset; + C = XMVector3InverseRotate( C , aOrientation ); + + if ( !XMVector3InBounds( C, aExtents ) ) + return INTERSECTS; + } + + return CONTAINS; +} + + +//----------------------------------------------------------------------------- +// Frustum in oriented bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingOrientedBox::Contains( const BoundingFrustum& fr ) const +{ + if ( !fr.Intersects(*this) ) + return DISJOINT; + + XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT]; + fr.GetCorners( Corners ); + + // Load the box + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + for( size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVector3InverseRotate( XMLoadFloat3( &Corners[i] ) - vCenter, vOrientation ); + + if ( !XMVector3InBounds( C, vExtents ) ) + return INTERSECTS; + } + + return CONTAINS; +} + + +//----------------------------------------------------------------------------- +// Sphere vs. oriented box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingOrientedBox::Intersects( const BoundingSphere& sh ) const +{ + XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center ); + XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius ); + + XMVECTOR BoxCenter = XMLoadFloat3( &Center ); + XMVECTOR BoxExtents = XMLoadFloat3( &Extents ); + XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) ); + + // Transform the center of the sphere to be local to the box. + // BoxMin = -BoxExtents + // BoxMax = +BoxExtents + SphereCenter = XMVector3InverseRotate( SphereCenter - BoxCenter, BoxOrientation ); + + // Find the distance to the nearest point on the box. + // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. 
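+    // Scalar sketch of the per-component clamp being vectorized below
+    // (c = sphere center component, e = box extent component):
+    //   if      (c < -e) d = c + e;    // below the box minimum
+    //   else if (c >  e) d = c - e;    // above the box maximum
+    //   else             d = 0;        // inside the slab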
+    XMVECTOR LessThanMin = XMVectorLess( SphereCenter, -BoxExtents );
+    XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxExtents );
+
+    XMVECTOR MinDelta = SphereCenter + BoxExtents;
+    XMVECTOR MaxDelta = SphereCenter - BoxExtents;
+
+    // Choose value for each dimension based on the comparison.
+    d = XMVectorSelect( d, MinDelta, LessThanMin );
+    d = XMVectorSelect( d, MaxDelta, GreaterThanMax );
+
+    // Use a dot-product to square them and sum them together.
+    XMVECTOR d2 = XMVector3Dot( d, d );
+
+    return XMVector4LessOrEqual( d2, XMVectorMultiply( SphereRadius, SphereRadius ) ) ? true : false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Axis aligned box vs. oriented box. Constructs an oriented box and uses
+// the oriented box vs. oriented box test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingOrientedBox::Intersects( const BoundingBox& box ) const
+{
+    // Make the axis aligned box oriented and do an OBB vs OBB test.
+    BoundingOrientedBox obox( box.Center, box.Extents, XMFLOAT4( 0.f, 0.f, 0.f, 1.f ) );
+    return Intersects( obox );
+}
+
+
+//-----------------------------------------------------------------------------
+// Fast oriented box / oriented box intersection test using the separating axis
+// theorem.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingOrientedBox::Intersects( const BoundingOrientedBox& box ) const
+{
+    // Build the 3x3 rotation matrix that defines the orientation of B relative to A.
+    XMVECTOR A_quat = XMLoadFloat4( &Orientation );
+    XMVECTOR B_quat = XMLoadFloat4( &box.Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( A_quat ) );
+    assert( DirectX::Internal::XMQuaternionIsUnit( B_quat ) );
+
+    XMVECTOR Q = XMQuaternionMultiply( A_quat, XMQuaternionConjugate( B_quat ) );
+    XMMATRIX R = XMMatrixRotationQuaternion( Q );
+
+    // Compute the translation of B relative to A.
+    XMVECTOR A_cent = XMLoadFloat3( &Center );
+    XMVECTOR B_cent = XMLoadFloat3( &box.Center );
+    XMVECTOR t = XMVector3InverseRotate( B_cent - A_cent, A_quat );
+
+    //
+    // h(A) = extents of A.
+    // h(B) = extents of B.
+    //
+    // a(u) = axes of A = (1,0,0), (0,1,0), (0,0,1)
+    // b(u) = axes of B relative to A = (r00,r10,r20), (r01,r11,r21), (r02,r12,r22)
+    //
+    // For each possible separating axis l:
+    //   d(A) = sum (for i = u,v,w) h(A)(i) * abs( a(i) dot l )
+    //   d(B) = sum (for i = u,v,w) h(B)(i) * abs( b(i) dot l )
+    //   if abs( t dot l ) > d(A) + d(B) then disjoint
+    //
+
+    // Load extents of A and B.
+    XMVECTOR h_A = XMLoadFloat3( &Extents );
+    XMVECTOR h_B = XMLoadFloat3( &box.Extents );
+
+    // Rows. Note R[0,1,2]X.w = 0.
+    XMVECTOR R0X = R.r[0];
+    XMVECTOR R1X = R.r[1];
+    XMVECTOR R2X = R.r[2];
+
+    R = XMMatrixTranspose( R );
+
+    // Columns. Note RX[0,1,2].w = 0.
+    XMVECTOR RX0 = R.r[0];
+    XMVECTOR RX1 = R.r[1];
+    XMVECTOR RX2 = R.r[2];
+
+    // Absolute value of rows.
+    XMVECTOR AR0X = XMVectorAbs( R0X );
+    XMVECTOR AR1X = XMVectorAbs( R1X );
+    XMVECTOR AR2X = XMVectorAbs( R2X );
+
+    // Absolute value of columns.
+    XMVECTOR ARX0 = XMVectorAbs( RX0 );
+    XMVECTOR ARX1 = XMVectorAbs( RX1 );
+    XMVECTOR ARX2 = XMVectorAbs( RX2 );
+
+    // Test each of the 15 possible separating axes.
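+    // Scalar sketch of a single separating-axis test, for reference:
+    //   float dist = fabsf( dot( t, l ) );
+    //   float dA = hA.u*fabsf(dot(au,l)) + hA.v*fabsf(dot(av,l)) + hA.w*fabsf(dot(aw,l));
+    //   float dB = hB.u*fabsf(dot(bu,l)) + hB.v*fabsf(dot(bv,l)) + hB.w*fabsf(dot(bw,l));
+    //   if ( dist > dA + dB ) the axis l separates the boxes.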
+    XMVECTOR d, d_A, d_B;
+
+    // l = a(u) = (1, 0, 0)
+    // t dot l = t.x
+    // d(A) = h(A).x
+    // d(B) = h(B) dot abs(r00, r01, r02)
+    d = XMVectorSplatX( t );
+    d_A = XMVectorSplatX( h_A );
+    d_B = XMVector3Dot( h_B, AR0X );
+    XMVECTOR NoIntersection = XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) );
+
+    // l = a(v) = (0, 1, 0)
+    // t dot l = t.y
+    // d(A) = h(A).y
+    // d(B) = h(B) dot abs(r10, r11, r12)
+    d = XMVectorSplatY( t );
+    d_A = XMVectorSplatY( h_A );
+    d_B = XMVector3Dot( h_B, AR1X );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = a(w) = (0, 0, 1)
+    // t dot l = t.z
+    // d(A) = h(A).z
+    // d(B) = h(B) dot abs(r20, r21, r22)
+    d = XMVectorSplatZ( t );
+    d_A = XMVectorSplatZ( h_A );
+    d_B = XMVector3Dot( h_B, AR2X );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = b(u) = (r00, r10, r20)
+    // d(A) = h(A) dot abs(r00, r10, r20)
+    // d(B) = h(B).x
+    d = XMVector3Dot( t, RX0 );
+    d_A = XMVector3Dot( h_A, ARX0 );
+    d_B = XMVectorSplatX( h_B );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = b(v) = (r01, r11, r21)
+    // d(A) = h(A) dot abs(r01, r11, r21)
+    // d(B) = h(B).y
+    d = XMVector3Dot( t, RX1 );
+    d_A = XMVector3Dot( h_A, ARX1 );
+    d_B = XMVectorSplatY( h_B );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = b(w) = (r02, r12, r22)
+    // d(A) = h(A) dot abs(r02, r12, r22)
+    // d(B) = h(B).z
+    d = XMVector3Dot( t, RX2 );
+    d_A = XMVector3Dot( h_A, ARX2 );
+    d_B = XMVectorSplatZ( h_B );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = a(u) x b(u) = (0, -r20, r10)
+    // d(A) = h(A) dot abs(0, r20, r10)
+    // d(B) = h(B) dot abs(0, r02, r01)
+    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( RX0, -RX0 ) );
+    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( ARX0 ) );
+    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( AR0X ) );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = a(u) x b(v) = (0, -r21, r11)
+    // d(A) = h(A) dot abs(0, r21, r11)
+    // d(B) = h(B) dot abs(r02, 0, r00)
+    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( RX1, -RX1 ) );
+    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( ARX1 ) );
+    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( AR0X ) );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = a(u) x b(w) = (0, -r22, r12)
+    // d(A) = h(A) dot abs(0, r22, r12)
+    // d(B) = h(B) dot abs(r01, r00, 0)
+    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( RX2, -RX2 ) );
+    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( ARX2 ) );
+    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( AR0X ) );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = a(v) x b(u) = (r20, 0, -r00)
+    // d(A) = h(A) dot abs(r20, 0, r00)
+    // d(B) = h(B) dot abs(0, r12, r11)
+    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( RX0, -RX0 ) );
+    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( ARX0 ) );
+    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( AR1X ) );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = a(v) x b(v) = (r21, 0, -r01)
+    // d(A) = h(A) dot abs(r21, 0, r01)
+    // d(B) = h(B) dot abs(r12, 0, r10)
+    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( RX1, -RX1 ) );
+    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( ARX1 ) );
+    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( AR1X ) );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = a(v) x b(w) = (r22, 0, -r02)
+    // d(A) = h(A) dot abs(r22, 0, r02)
+    // d(B) = h(B) dot abs(r11, r10, 0)
+    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( RX2, -RX2 ) );
+    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( ARX2 ) );
+    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( AR1X ) );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = a(w) x b(u) = (-r10, r00, 0)
+    // d(A) = h(A) dot abs(r10, r00, 0)
+    // d(B) = h(B) dot abs(0, r22, r21)
+    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( RX0, -RX0 ) );
+    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( ARX0 ) );
+    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( AR2X ) );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = a(w) x b(v) = (-r11, r01, 0)
+    // d(A) = h(A) dot abs(r11, r01, 0)
+    // d(B) = h(B) dot abs(r22, 0, r20)
+    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( RX1, -RX1 ) );
+    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( ARX1 ) );
+    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( AR2X ) );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // l = a(w) x b(w) = (-r12, r02, 0)
+    // d(A) = h(A) dot abs(r12, r02, 0)
+    // d(B) = h(B) dot abs(r21, r20, 0)
+    d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( RX2, -RX2 ) );
+    d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( ARX2 ) );
+    d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( AR2X ) );
+    NoIntersection = XMVectorOrInt( NoIntersection,
+                                    XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) );
+
+    // No separating axis found, so the boxes must intersect.
+    return XMVector4NotEqualInt( NoIntersection, XMVectorTrueInt() ) ? true : false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Frustum vs. oriented box test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingOrientedBox::Intersects( const BoundingFrustum& fr ) const
+{
+    return fr.Intersects( *this );
+}
+
+
+//-----------------------------------------------------------------------------
+// Triangle vs. oriented box test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV BoundingOrientedBox::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const
+{
+    // Load the box center & orientation.
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    // Transform the triangle vertices into the space of the box.
+    XMVECTOR TV0 = XMVector3InverseRotate( V0 - vCenter, vOrientation );
+    XMVECTOR TV1 = XMVector3InverseRotate( V1 - vCenter, vOrientation );
+    XMVECTOR TV2 = XMVector3InverseRotate( V2 - vCenter, vOrientation );
+
+    BoundingBox box;
+    box.Center = XMFLOAT3( 0.0f, 0.0f, 0.0f );
+    box.Extents = Extents;
+
+    // Use the triangle vs axis aligned box intersection routine.
+    return box.Intersects( TV0, TV1, TV2 );
+}
+
+
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PlaneIntersectionType XM_CALLCONV BoundingOrientedBox::Intersects( FXMVECTOR Plane ) const
+{
+    assert( DirectX::Internal::XMPlaneIsUnit( Plane ) );
+
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vExtents = XMLoadFloat3( &Extents );
+    XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) );
+
+    // Set w of the center to one so we can dot4 with a plane.
+    vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
+
+    // Build the 3x3 rotation matrix that defines the box axes.
+    XMMATRIX R = XMMatrixRotationQuaternion( BoxOrientation );
+
+    XMVECTOR Outside, Inside;
+    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane, Outside, Inside );
+
+    // If the box is outside any plane it is outside.
+    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+        return FRONT;
+
+    // If the box is inside all planes it is inside.
+    if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
+        return BACK;
+
+    // The box is not inside all planes or outside a plane; it intersects.
+    return INTERSECTING;
+}
+
+
+//-----------------------------------------------------------------------------
+// Compute the intersection of a ray (Origin, Direction) with an oriented box
+// using the slabs method.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV BoundingOrientedBox::Intersects( FXMVECTOR Origin, FXMVECTOR Direction, float& Dist ) const
+{
+    assert( DirectX::Internal::XMVector3IsUnit( Direction ) );
+
+    static const XMVECTORU32 SelectY =
+    {
+        XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0
+    };
+    static const XMVECTORU32 SelectZ =
+    {
+        XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0
+    };
+
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vExtents = XMLoadFloat3( &Extents );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
+
+    // Get the box's normalized side directions.
+    XMMATRIX R = XMMatrixRotationQuaternion( vOrientation );
+
+    // Adjust ray origin to be relative to center of the box.
+    XMVECTOR TOrigin = vCenter - Origin;
+
+    // Compute the dot product against each axis of the box.
+    XMVECTOR AxisDotOrigin = XMVector3Dot( R.r[0], TOrigin );
+    AxisDotOrigin = XMVectorSelect( AxisDotOrigin, XMVector3Dot( R.r[1], TOrigin ), SelectY );
+    AxisDotOrigin = XMVectorSelect( AxisDotOrigin, XMVector3Dot( R.r[2], TOrigin ), SelectZ );
+
+    XMVECTOR AxisDotDirection = XMVector3Dot( R.r[0], Direction );
+    AxisDotDirection = XMVectorSelect( AxisDotDirection, XMVector3Dot( R.r[1], Direction ), SelectY );
+    AxisDotDirection = XMVectorSelect( AxisDotDirection, XMVector3Dot( R.r[2], Direction ), SelectZ );
+
+    // if (fabs(AxisDotDirection) <= Epsilon) the ray is nearly parallel to the slab.
+    XMVECTOR IsParallel = XMVectorLessOrEqual( XMVectorAbs( AxisDotDirection ), g_RayEpsilon );
+
+    // Test against all three axes simultaneously.
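+    // Scalar sketch of the slab test performed below (one slab per box axis):
+    //   t1 = (axisDotOrigin - extent) / axisDotDirection;
+    //   t2 = (axisDotOrigin + extent) / axisDotDirection;
+    //   t_min = max over axes of min(t1, t2);
+    //   t_max = min over axes of max(t1, t2);
+    //   hit if t_min <= t_max and t_max >= 0.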
+    XMVECTOR InverseAxisDotDirection = XMVectorReciprocal( AxisDotDirection );
+    XMVECTOR t1 = ( AxisDotOrigin - vExtents ) * InverseAxisDotDirection;
+    XMVECTOR t2 = ( AxisDotOrigin + vExtents ) * InverseAxisDotDirection;
+
+    // Compute the max of min(t1,t2) and the min of max(t1,t2) ensuring we don't
+    // use the results from any directions parallel to the slab.
+    XMVECTOR t_min = XMVectorSelect( XMVectorMin( t1, t2 ), g_FltMin, IsParallel );
+    XMVECTOR t_max = XMVectorSelect( XMVectorMax( t1, t2 ), g_FltMax, IsParallel );
+
+    // t_min.x = maximum( t_min.x, t_min.y, t_min.z );
+    // t_max.x = minimum( t_max.x, t_max.y, t_max.z );
+    t_min = XMVectorMax( t_min, XMVectorSplatY( t_min ) );  // x = max(x,y)
+    t_min = XMVectorMax( t_min, XMVectorSplatZ( t_min ) );  // x = max(max(x,y),z)
+    t_max = XMVectorMin( t_max, XMVectorSplatY( t_max ) );  // x = min(x,y)
+    t_max = XMVectorMin( t_max, XMVectorSplatZ( t_max ) );  // x = min(min(x,y),z)
+
+    // if ( t_min > t_max ) return false;
+    XMVECTOR NoIntersection = XMVectorGreater( XMVectorSplatX( t_min ), XMVectorSplatX( t_max ) );
+
+    // if ( t_max < 0.0f ) return false;
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( XMVectorSplatX( t_max ), XMVectorZero() ) );
+
+    // if (IsParallel && (-Extents > AxisDotOrigin || Extents < AxisDotOrigin)) return false;
+    XMVECTOR ParallelOverlap = XMVectorInBounds( AxisDotOrigin, vExtents );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorAndCInt( IsParallel, ParallelOverlap ) );
+
+    if( !DirectX::Internal::XMVector3AnyTrue( NoIntersection ) )
+    {
+        // Store the x-component to Dist.
+        XMStoreFloat( &Dist, t_min );
+        return true;
+    }
+
+    Dist = 0.f;
+    return false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Test an oriented box vs 6 planes (typically forming a frustum).
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType XM_CALLCONV BoundingOrientedBox::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2,
+                                                                     GXMVECTOR Plane3, HXMVECTOR Plane4, HXMVECTOR Plane5 ) const
+{
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vExtents = XMLoadFloat3( &Extents );
+    XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) );
+
+    // Set w of the center to one so we can dot4 with a plane.
+    vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
+
+    // Build the 3x3 rotation matrix that defines the box axes.
+    XMMATRIX R = XMMatrixRotationQuaternion( BoxOrientation );
+
+    XMVECTOR Outside, Inside;
+
+    // Test against each plane.
+    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane0, Outside, Inside );
+
+    XMVECTOR AnyOutside = Outside;
+    XMVECTOR AllInside = Inside;
+
+    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane1, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane2, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane3, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane4, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane5, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    // If the box is outside any plane it is outside.
+    if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) )
+        return DISJOINT;
+
+    // If the box is inside all planes it is inside.
+    if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) )
+        return CONTAINS;
+
+    // The box is not inside all planes or outside a plane, so it may intersect.
+    return INTERSECTS;
+}
+
+
+//-----------------------------------------------------------------------------
+// Create oriented bounding box from axis-aligned bounding box
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingOrientedBox::CreateFromBoundingBox( BoundingOrientedBox& Out, const BoundingBox& box )
+{
+    Out.Center = box.Center;
+    Out.Extents = box.Extents;
+    Out.Orientation = XMFLOAT4( 0.f, 0.f, 0.f, 1.f );
+}
+
+
+//-----------------------------------------------------------------------------
+// Find the approximate minimum oriented bounding box containing a set of
+// points.  Exact computation of the minimum oriented bounding box is possible
+// but is slower and requires a more complex algorithm.
+// The algorithm works by computing the inertia tensor of the points and then
+// using the eigenvectors of the inertia tensor as the axes of the box.
+// Computing the inertia tensor of the convex hull of the points will usually
+// result in a better bounding box, but the computation is more complex.
+// Exact computation of the minimum oriented bounding box is possible, but the
+// best known algorithm is O(N^3) and is significantly more complex to implement.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingOrientedBox::CreateFromPoints( BoundingOrientedBox& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride )
+{
+    assert( Count > 0 );
+    assert( pPoints != 0 );
+
+    XMVECTOR CenterOfMass = XMVectorZero();
+
+    // Compute the center of mass and inertia tensor of the points.
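+    // The two loops below first average the points, then accumulate the
+    // (unnormalized) covariance terms about that center:
+    //   XX_YY_ZZ = ( sum(x*x), sum(y*y), sum(z*z) )
+    //   XY_XZ_YZ = ( sum(x*y), sum(x*z), sum(y*z) )
+    // The eigenvectors of this symmetric 3x3 matrix become the box axes.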
+    for( size_t i = 0; i < Count; ++i )
+    {
+        XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) );
+
+        CenterOfMass += Point;
+    }
+
+    CenterOfMass *= XMVectorReciprocal( XMVectorReplicate( float( Count ) ) );
+
+    // Compute the inertia tensor of the points around the center of mass.
+    // Using the center of mass is not strictly necessary, but will hopefully
+    // improve the stability of finding the eigenvectors.
+    XMVECTOR XX_YY_ZZ = XMVectorZero();
+    XMVECTOR XY_XZ_YZ = XMVectorZero();
+
+    for( size_t i = 0; i < Count; ++i )
+    {
+        XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) ) - CenterOfMass;
+
+        XX_YY_ZZ += Point * Point;
+
+        XMVECTOR XXY = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_W>( Point );
+        XMVECTOR YZZ = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_W>( Point );
+
+        XY_XZ_YZ += XXY * YZZ;
+    }
+
+    XMVECTOR v1, v2, v3;
+
+    // Compute the eigenvectors of the inertia tensor.
+    DirectX::Internal::CalculateEigenVectorsFromCovarianceMatrix( XMVectorGetX( XX_YY_ZZ ), XMVectorGetY( XX_YY_ZZ ),
+                                                                  XMVectorGetZ( XX_YY_ZZ ),
+                                                                  XMVectorGetX( XY_XZ_YZ ), XMVectorGetY( XY_XZ_YZ ),
+                                                                  XMVectorGetZ( XY_XZ_YZ ),
+                                                                  &v1, &v2, &v3 );
+
+    // Put them in a matrix.
+    XMMATRIX R;
+
+    R.r[0] = XMVectorSetW( v1, 0.f );
+    R.r[1] = XMVectorSetW( v2, 0.f );
+    R.r[2] = XMVectorSetW( v3, 0.f );
+    R.r[3] = g_XMIdentityR3.v;
+
+    // Multiply by -1 to convert the matrix into a right handed coordinate
+    // system (Det ~= 1) in case the eigenvectors form a left handed
+    // coordinate system (Det ~= -1) because XMQuaternionRotationMatrix only
+    // works on right handed matrices.
+    XMVECTOR Det = XMMatrixDeterminant( R );
+
+    if( XMVector4Less( Det, XMVectorZero() ) )
+    {
+        R.r[0] *= g_XMNegativeOne.v;
+        R.r[1] *= g_XMNegativeOne.v;
+        R.r[2] *= g_XMNegativeOne.v;
+    }
+
+    // Get the rotation quaternion from the matrix.
+    XMVECTOR vOrientation = XMQuaternionRotationMatrix( R );
+
+    // Make sure it is normal (in case the vectors are slightly non-orthogonal).
+    vOrientation = XMQuaternionNormalize( vOrientation );
+
+    // Rebuild the rotation matrix from the quaternion.
+    R = XMMatrixRotationQuaternion( vOrientation );
+
+    // Build the rotation into the rotated space.
+    XMMATRIX InverseR = XMMatrixTranspose( R );
+
+    // Find the minimum OBB using the eigenvectors as the axes.
+    XMVECTOR vMin, vMax;
+
+    vMin = vMax = XMVector3TransformNormal( XMLoadFloat3( pPoints ), InverseR );
+
+    for( size_t i = 1; i < Count; ++i )
+    {
+        XMVECTOR Point = XMVector3TransformNormal( XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) ),
+                                                   InverseR );
+
+        vMin = XMVectorMin( vMin, Point );
+        vMax = XMVectorMax( vMax, Point );
+    }
+
+    // Rotate the center into world space.
+    XMVECTOR vCenter = ( vMin + vMax ) * 0.5f;
+    vCenter = XMVector3TransformNormal( vCenter, R );
+
+    // Store center, extents, and orientation.
+    XMStoreFloat3( &Out.Center, vCenter );
+    XMStoreFloat3( &Out.Extents, ( vMax - vMin ) * 0.5f );
+    XMStoreFloat4( &Out.Orientation, vOrientation );
+}
+
+
+/****************************************************************************
+ *
+ * BoundingFrustum
+ *
+ ****************************************************************************/
+
+//-----------------------------------------------------------------------------
+// Transform a frustum by an angle preserving transform.
+//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingFrustum::Transform( BoundingFrustum& Out, FXMMATRIX M ) const +{ + // Load the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Composite the frustum rotation and the transform rotation + XMMATRIX nM; + nM.r[0] = XMVector3Normalize( M.r[0] ); + nM.r[1] = XMVector3Normalize( M.r[1] ); + nM.r[2] = XMVector3Normalize( M.r[2] ); + nM.r[3] = g_XMIdentityR3; + XMVECTOR Rotation = XMQuaternionRotationMatrix( nM ); + vOrientation = XMQuaternionMultiply( vOrientation, Rotation ); + + // Transform the center. + vOrigin = XMVector3Transform( vOrigin, M ); + + // Store the frustum. + XMStoreFloat3( &Out.Origin, vOrigin ); + XMStoreFloat4( &Out.Orientation, vOrientation ); + + // Scale the near and far distances (the slopes remain the same). + XMVECTOR dX = XMVector3Dot( M.r[0], M.r[0] ); + XMVECTOR dY = XMVector3Dot( M.r[1], M.r[1] ); + XMVECTOR dZ = XMVector3Dot( M.r[2], M.r[2] ); + + XMVECTOR d = XMVectorMax( dX, XMVectorMax( dY, dZ ) ); + float Scale = sqrtf( XMVectorGetX(d) ); + + Out.Near = Near * Scale; + Out.Far = Far * Scale; + + // Copy the slopes. + Out.RightSlope = RightSlope; + Out.LeftSlope = LeftSlope; + Out.TopSlope = TopSlope; + Out.BottomSlope = BottomSlope; +} + +_Use_decl_annotations_ +inline void XM_CALLCONV BoundingFrustum::Transform( BoundingFrustum& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const +{ + assert( DirectX::Internal::XMQuaternionIsUnit( Rotation ) ); + + // Load the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Composite the frustum rotation and the transform rotation. + vOrientation = XMQuaternionMultiply( vOrientation, Rotation ); + + // Transform the origin. + vOrigin = XMVector3Rotate( vOrigin * XMVectorReplicate( Scale ), Rotation ) + Translation; + + // Store the frustum. + XMStoreFloat3( &Out.Origin, vOrigin ); + XMStoreFloat4( &Out.Orientation, vOrientation ); + + // Scale the near and far distances (the slopes remain the same). + Out.Near = Near * Scale; + Out.Far = Far * Scale; + + // Copy the slopes. + Out.RightSlope = RightSlope; + Out.LeftSlope = LeftSlope; + Out.TopSlope = TopSlope; + Out.BottomSlope = BottomSlope; +} + + +//----------------------------------------------------------------------------- +// Get the corner points of the frustum +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingFrustum::GetCorners( XMFLOAT3* Corners ) const +{ + assert( Corners != 0 ); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Build the corners of the frustum. 
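+    // Each slope vector below is a corner direction at unit depth, i.e.
+    // ( HorizontalSlope, VerticalSlope, 1 ); scaling it by the Near and Far
+    // distances yields the 8 corner positions in frustum-local space.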
+    XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR vNear = XMVectorReplicatePtr( &Near );
+    XMVECTOR vFar = XMVectorReplicatePtr( &Far );
+
+    // Returns the positions of the 8 corners of the bounding frustum.
+    //     Near    Far
+    //    0----1  4----5
+    //    |    |  |    |
+    //    |    |  |    |
+    //    3----2  7----6
+
+    XMVECTOR vCorners[CORNER_COUNT];
+    vCorners[0] = vLeftTop * vNear;
+    vCorners[1] = vRightTop * vNear;
+    vCorners[2] = vRightBottom * vNear;
+    vCorners[3] = vLeftBottom * vNear;
+    vCorners[4] = vLeftTop * vFar;
+    vCorners[5] = vRightTop * vFar;
+    vCorners[6] = vRightBottom * vFar;
+    vCorners[7] = vLeftBottom * vFar;
+
+    for( size_t i=0; i < CORNER_COUNT; ++i )
+    {
+        XMVECTOR C = XMVector3Rotate( vCorners[i], vOrientation ) + vOrigin;
+        XMStoreFloat3( &Corners[i], C );
+    }
+}
+
+
+//-----------------------------------------------------------------------------
+// Point in frustum test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType XM_CALLCONV BoundingFrustum::Contains( FXMVECTOR Point ) const
+{
+    // Build frustum planes.
+    XMVECTOR Planes[6];
+    Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
+    Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
+    Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
+    Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
+    Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
+    Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
+
+    // Load origin and orientation.
+    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
+
+    // Transform point into local space of frustum.
+    XMVECTOR TPoint = XMVector3InverseRotate( Point - vOrigin, vOrientation );
+
+    // Set w to one.
+    TPoint = XMVectorInsert<0, 0, 0, 0, 1>( TPoint, XMVectorSplatOne() );
+
+    XMVECTOR Zero = XMVectorZero();
+    XMVECTOR Outside = Zero;
+
+    // Test point against each plane of the frustum.
+    for( size_t i = 0; i < 6; ++i )
+    {
+        XMVECTOR Dot = XMVector4Dot( TPoint, Planes[i] );
+        Outside = XMVectorOrInt( Outside, XMVectorGreater( Dot, Zero ) );
+    }
+
+    return XMVector4NotEqualInt( Outside, XMVectorTrueInt() ) ? CONTAINS : DISJOINT;
+}
+
+
+//-----------------------------------------------------------------------------
+// Triangle vs frustum test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType XM_CALLCONV BoundingFrustum::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const
+{
+    // Load origin and orientation of the frustum.
+ XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); + NearPlane = XMPlaneNormalize( NearPlane ); + + XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); + FarPlane = XMPlaneNormalize( FarPlane ); + + XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); + RightPlane = XMPlaneNormalize( RightPlane ); + + XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); + LeftPlane = XMPlaneNormalize( LeftPlane ); + + XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); + TopPlane = XMPlaneNormalize( TopPlane ); + + XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); + BottomPlane = XMPlaneNormalize( BottomPlane ); + + return TriangleTests::ContainedBy( V0, V1, V2, NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains( const BoundingSphere& sh ) const +{ + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); + NearPlane = XMPlaneNormalize( NearPlane ); + + XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); + FarPlane = XMPlaneNormalize( FarPlane ); + + XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); + RightPlane = XMPlaneNormalize( RightPlane ); + + XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); + LeftPlane = XMPlaneNormalize( LeftPlane ); + + XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); + TopPlane = XMPlaneNormalize( TopPlane ); + + XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); + BottomPlane = XMPlaneNormalize( BottomPlane ); + + return sh.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains( const BoundingBox& box ) const +{ + // Load origin and orientation of the frustum. 
+ XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); + NearPlane = XMPlaneNormalize( NearPlane ); + + XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); + FarPlane = XMPlaneNormalize( FarPlane ); + + XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); + RightPlane = XMPlaneNormalize( RightPlane ); + + XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); + LeftPlane = XMPlaneNormalize( LeftPlane ); + + XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); + TopPlane = XMPlaneNormalize( TopPlane ); + + XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); + BottomPlane = XMPlaneNormalize( BottomPlane ); + + return box.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains( const BoundingOrientedBox& box ) const +{ + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); + NearPlane = XMPlaneNormalize( NearPlane ); + + XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); + FarPlane = XMPlaneNormalize( FarPlane ); + + XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); + RightPlane = XMPlaneNormalize( RightPlane ); + + XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); + LeftPlane = XMPlaneNormalize( LeftPlane ); + + XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); + TopPlane = XMPlaneNormalize( TopPlane ); + + XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); + BottomPlane = XMPlaneNormalize( BottomPlane ); + + return box.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::Contains( const BoundingFrustum& fr ) const +{ + // Load origin and orientation of the frustum. 
+    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    // Create 6 planes (do it inline to encourage use of registers)
+    XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
+    NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin );
+    NearPlane = XMPlaneNormalize( NearPlane );
+
+    XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
+    FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin );
+    FarPlane = XMPlaneNormalize( FarPlane );
+
+    XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
+    RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin );
+    RightPlane = XMPlaneNormalize( RightPlane );
+
+    XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
+    LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin );
+    LeftPlane = XMPlaneNormalize( LeftPlane );
+
+    XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
+    TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin );
+    TopPlane = XMPlaneNormalize( TopPlane );
+
+    XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
+    BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin );
+    BottomPlane = XMPlaneNormalize( BottomPlane );
+
+    return fr.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane );
+}
+
+
+//-----------------------------------------------------------------------------
+// Exact sphere vs frustum test.  The algorithm first checks the sphere against
+// the planes of the frustum; if the plane checks are indeterminate, it then
+// finds the nearest feature (face, edge, or corner) on the frustum to the
+// center of the sphere and compares the distance to that feature with the
+// radius of the sphere.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingFrustum::Intersects( const BoundingSphere& sh ) const
+{
+    XMVECTOR Zero = XMVectorZero();
+
+    // Build the frustum planes.
+    XMVECTOR Planes[6];
+    Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
+    Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
+    Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
+    Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
+    Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
+    Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
+
+    // Normalize the planes so we can compare to the sphere radius.
+    Planes[2] = XMVector3Normalize( Planes[2] );
+    Planes[3] = XMVector3Normalize( Planes[3] );
+    Planes[4] = XMVector3Normalize( Planes[4] );
+    Planes[5] = XMVector3Normalize( Planes[5] );
+
+    // Load origin and orientation of the frustum.
+    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
+
+    // Load the sphere.
+    XMVECTOR vCenter = XMLoadFloat3( &sh.Center );
+    XMVECTOR vRadius = XMVectorReplicatePtr( &sh.Radius );
+
+    // Transform the center of the sphere into the local space of frustum.
+    vCenter = XMVector3InverseRotate( vCenter - vOrigin, vOrientation );
+
+    // Set w of the center to one so we can dot4 with the plane.
+    vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
+
+    // Check against each plane of the frustum.
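+    // For each plane, with the sphere center at signed distance Dist from it:
+    //   Dist >  Radius  -> the sphere is fully outside that plane;
+    //   Dist < -Radius  -> the sphere is fully inside that plane;
+    //   otherwise the sphere straddles the plane and the test is indeterminate.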
+ XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + XMVECTOR CenterInsideAll = XMVectorTrueInt(); + + XMVECTOR Dist[6]; + + for( size_t i = 0; i < 6; ++i ) + { + Dist[i] = XMVector4Dot( vCenter, Planes[i] ); + + // Outside the plane? + Outside = XMVectorOrInt( Outside, XMVectorGreater( Dist[i], vRadius ) ); + + // Fully inside the plane? + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( Dist[i], -vRadius ) ); + + // Check if the center is inside the plane. + CenterInsideAll = XMVectorAndInt( CenterInsideAll, XMVectorLessOrEqual( Dist[i], Zero ) ); + } + + // If the sphere is outside any of the planes it is outside. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // If the sphere is inside all planes it is fully inside. + if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) ) + return true; + + // If the center of the sphere is inside all planes and the sphere intersects + // one or more planes then it must intersect. + if ( XMVector4EqualInt( CenterInsideAll, XMVectorTrueInt() ) ) + return true; + + // The sphere may be outside the frustum or intersecting the frustum. + // Find the nearest feature (face, edge, or corner) on the frustum + // to the sphere. + + // The faces adjacent to each face are: + static const size_t adjacent_faces[6][4] = + { + { 2, 3, 4, 5 }, // 0 + { 2, 3, 4, 5 }, // 1 + { 0, 1, 4, 5 }, // 2 + { 0, 1, 4, 5 }, // 3 + { 0, 1, 2, 3 }, // 4 + { 0, 1, 2, 3 } + }; // 5 + + XMVECTOR Intersects = XMVectorFalseInt(); + + // Check to see if the nearest feature is one of the planes. + for( size_t i = 0; i < 6; ++i ) + { + // Find the nearest point on the plane to the center of the sphere. + XMVECTOR Point = vCenter - (Planes[i] * Dist[i]); + + // Set w of the point to one. + Point = XMVectorInsert<0, 0, 0, 0, 1>( Point, XMVectorSplatOne() ); + + // If the point is inside the face (inside the adjacent planes) then + // this plane is the nearest feature. + XMVECTOR InsideFace = XMVectorTrueInt(); + + for ( size_t j = 0; j < 4; j++ ) + { + size_t plane_index = adjacent_faces[i][j]; + + InsideFace = XMVectorAndInt( InsideFace, + XMVectorLessOrEqual( XMVector4Dot( Point, Planes[plane_index] ), Zero ) ); + } + + // Since we have already checked distance from the plane we know that the + // sphere must intersect if this plane is the nearest feature. + Intersects = XMVectorOrInt( Intersects, + XMVectorAndInt( XMVectorGreater( Dist[i], Zero ), InsideFace ) ); + } + + if ( XMVector4EqualInt( Intersects, XMVectorTrueInt() ) ) + return true; + + // Build the corners of the frustum. 
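+    // The nearest feature was not a face, so it must be an edge or a corner;
+    // build the corners and test the sphere against the 12 frustum edges
+    // (the corners are covered as the segment endpoints).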
+    XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR vNear = XMVectorReplicatePtr( &Near );
+    XMVECTOR vFar = XMVectorReplicatePtr( &Far );
+
+    XMVECTOR Corners[CORNER_COUNT];
+    Corners[0] = vRightTop * vNear;
+    Corners[1] = vRightBottom * vNear;
+    Corners[2] = vLeftTop * vNear;
+    Corners[3] = vLeftBottom * vNear;
+    Corners[4] = vRightTop * vFar;
+    Corners[5] = vRightBottom * vFar;
+    Corners[6] = vLeftTop * vFar;
+    Corners[7] = vLeftBottom * vFar;
+
+    // The Edges are:
+    static const size_t edges[12][2] =
+    {
+        { 0, 1 }, { 2, 3 }, { 0, 2 }, { 1, 3 },    // Near plane
+        { 4, 5 }, { 6, 7 }, { 4, 6 }, { 5, 7 },    // Far plane
+        { 0, 4 }, { 1, 5 }, { 2, 6 }, { 3, 7 },
+    }; // Near to far
+
+    XMVECTOR RadiusSq = vRadius * vRadius;
+
+    // Check to see if the nearest feature is one of the edges (or corners).
+    for( size_t i = 0; i < 12; ++i )
+    {
+        size_t ei0 = edges[i][0];
+        size_t ei1 = edges[i][1];
+
+        // Find the nearest point on the edge to the center of the sphere.
+        // The corners of the frustum are included as the endpoints of the edges.
+        XMVECTOR Point = DirectX::Internal::PointOnLineSegmentNearestPoint( Corners[ei0], Corners[ei1], vCenter );
+
+        XMVECTOR Delta = vCenter - Point;
+
+        XMVECTOR DistSq = XMVector3Dot( Delta, Delta );
+
+        // If the distance from the center of the sphere to the point is less than
+        // the radius of the sphere then it must intersect.
+        Intersects = XMVectorOrInt( Intersects, XMVectorLessOrEqual( DistSq, RadiusSq ) );
+    }
+
+    if ( XMVector4EqualInt( Intersects, XMVectorTrueInt() ) )
+        return true;
+
+    // The sphere must be outside the frustum.
+    return false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Exact axis aligned box vs frustum test. Constructs an oriented box and uses
+// the oriented box vs frustum test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingFrustum::Intersects( const BoundingBox& box ) const
+{
+    // Make the axis aligned box oriented and do an OBB vs frustum test.
+    BoundingOrientedBox obox( box.Center, box.Extents, XMFLOAT4( 0.f, 0.f, 0.f, 1.f ) );
+    return Intersects( obox );
+}
+
+
+//-----------------------------------------------------------------------------
+// Exact oriented box vs frustum test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingFrustum::Intersects( const BoundingOrientedBox& box ) const
+{
+    static const XMVECTORU32 SelectY =
+    {
+        XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0
+    };
+    static const XMVECTORU32 SelectZ =
+    {
+        XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0
+    };
+
+    XMVECTOR Zero = XMVectorZero();
+
+    // Build the frustum planes.
+    XMVECTOR Planes[6];
+    Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
+    Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
+    Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
+    Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
+    Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
+    Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
+
+    // Load origin and orientation of the frustum.
+    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
+    XMVECTOR FrustumOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( FrustumOrientation ) );
+
+    // Load the box.
+    XMVECTOR Center = XMLoadFloat3( &box.Center );
+    XMVECTOR Extents = XMLoadFloat3( &box.Extents );
+    XMVECTOR BoxOrientation = XMLoadFloat4( &box.Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) );
+
+    // Transform the oriented box into the space of the frustum in order to
+    // minimize the number of transforms we have to do.
+    Center = XMVector3InverseRotate( Center - vOrigin, FrustumOrientation );
+    BoxOrientation = XMQuaternionMultiply( BoxOrientation, XMQuaternionConjugate( FrustumOrientation ) );
+
+    // Set w of the center to one so we can dot4 with the plane.
+    Center = XMVectorInsert<0, 0, 0, 0, 1>( Center, XMVectorSplatOne() );
+
+    // Build the 3x3 rotation matrix that defines the box axes.
+    XMMATRIX R = XMMatrixRotationQuaternion( BoxOrientation );
+
+    // Check against each plane of the frustum.
+    XMVECTOR Outside = XMVectorFalseInt();
+    XMVECTOR InsideAll = XMVectorTrueInt();
+    XMVECTOR CenterInsideAll = XMVectorTrueInt();
+
+    for( size_t i = 0; i < 6; ++i )
+    {
+        // Compute the distance to the center of the box.
+        XMVECTOR Dist = XMVector4Dot( Center, Planes[i] );
+
+        // Project the axes of the box onto the normal of the plane. Half the
+        // length of the projection (sometimes called the "radius") is equal to
+        // h(u) * abs(n dot b(u)) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w))
+        // where h(i) are the extents of the box, n is the plane normal, and b(i) are
+        // the axes of the box.
+        XMVECTOR Radius = XMVector3Dot( Planes[i], R.r[0] );
+        Radius = XMVectorSelect( Radius, XMVector3Dot( Planes[i], R.r[1] ), SelectY );
+        Radius = XMVectorSelect( Radius, XMVector3Dot( Planes[i], R.r[2] ), SelectZ );
+        Radius = XMVector3Dot( Extents, XMVectorAbs( Radius ) );
+
+        // Outside the plane?
+        Outside = XMVectorOrInt( Outside, XMVectorGreater( Dist, Radius ) );
+
+        // Fully inside the plane?
+        InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( Dist, -Radius ) );
+
+        // Check if the center is inside the plane.
+        CenterInsideAll = XMVectorAndInt( CenterInsideAll, XMVectorLessOrEqual( Dist, Zero ) );
+    }
+
+    // If the box is outside any of the planes it is outside.
+    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+        return false;
+
+    // If the box is inside all planes it is fully inside.
+    if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) )
+        return true;
+
+    // If the center of the box is inside all planes and the box intersects
+    // one or more planes then it must intersect.
+    if ( XMVector4EqualInt( CenterInsideAll, XMVectorTrueInt() ) )
+        return true;
+
+    // Build the corners of the frustum.
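+    // No trivial accept or reject so far: the remaining cases are resolved with
+    // a separating axis test, projecting both volumes onto the box axes (3) and
+    // the edge/edge cross products (3*6) and looking for an axis with a gap.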
+    XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR vNear = XMVectorReplicatePtr( &Near );
+    XMVECTOR vFar = XMVectorReplicatePtr( &Far );
+
+    XMVECTOR Corners[CORNER_COUNT];
+    Corners[0] = vRightTop * vNear;
+    Corners[1] = vRightBottom * vNear;
+    Corners[2] = vLeftTop * vNear;
+    Corners[3] = vLeftBottom * vNear;
+    Corners[4] = vRightTop * vFar;
+    Corners[5] = vRightBottom * vFar;
+    Corners[6] = vLeftTop * vFar;
+    Corners[7] = vLeftBottom * vFar;
+
+    // Test against box axes (3)
+    {
+        // Find the min/max values of the projection of the frustum onto each axis.
+        XMVECTOR FrustumMin, FrustumMax;
+
+        FrustumMin = XMVector3Dot( Corners[0], R.r[0] );
+        FrustumMin = XMVectorSelect( FrustumMin, XMVector3Dot( Corners[0], R.r[1] ), SelectY );
+        FrustumMin = XMVectorSelect( FrustumMin, XMVector3Dot( Corners[0], R.r[2] ), SelectZ );
+        FrustumMax = FrustumMin;
+
+        for( size_t i = 1; i < CORNER_COUNT; ++i )
+        {
+            XMVECTOR Temp = XMVector3Dot( Corners[i], R.r[0] );
+            Temp = XMVectorSelect( Temp, XMVector3Dot( Corners[i], R.r[1] ), SelectY );
+            Temp = XMVectorSelect( Temp, XMVector3Dot( Corners[i], R.r[2] ), SelectZ );
+
+            FrustumMin = XMVectorMin( FrustumMin, Temp );
+            FrustumMax = XMVectorMax( FrustumMax, Temp );
+        }
+
+        // Project the center of the box onto the axes.
+        XMVECTOR BoxDist = XMVector3Dot( Center, R.r[0] );
+        BoxDist = XMVectorSelect( BoxDist, XMVector3Dot( Center, R.r[1] ), SelectY );
+        BoxDist = XMVectorSelect( BoxDist, XMVector3Dot( Center, R.r[2] ), SelectZ );
+
+        // The projection of the box onto the axis is just its Center and Extents.
+        // if (min > box_max || max < box_min) reject;
+        XMVECTOR Result = XMVectorOrInt( XMVectorGreater( FrustumMin, BoxDist + Extents ),
+                                         XMVectorLess( FrustumMax, BoxDist - Extents ) );
+
+        if( DirectX::Internal::XMVector3AnyTrue( Result ) )
+            return false;
+    }
+
+    // Test against edge/edge axes (3*6).
+    XMVECTOR FrustumEdgeAxis[6];
+
+    FrustumEdgeAxis[0] = vRightTop;
+    FrustumEdgeAxis[1] = vRightBottom;
+    FrustumEdgeAxis[2] = vLeftTop;
+    FrustumEdgeAxis[3] = vLeftBottom;
+    FrustumEdgeAxis[4] = vRightTop - vLeftTop;
+    FrustumEdgeAxis[5] = vLeftBottom - vLeftTop;
+
+    for( size_t i = 0; i < 3; ++i )
+    {
+        for( size_t j = 0; j < 6; j++ )
+        {
+            // Compute the axis we are going to test.
+            XMVECTOR Axis = XMVector3Cross( R.r[i], FrustumEdgeAxis[j] );
+
+            // Find the min/max values of the projection of the frustum onto the axis.
+            XMVECTOR FrustumMin, FrustumMax;
+
+            FrustumMin = FrustumMax = XMVector3Dot( Axis, Corners[0] );
+
+            for( size_t k = 1; k < CORNER_COUNT; k++ )
+            {
+                XMVECTOR Temp = XMVector3Dot( Axis, Corners[k] );
+                FrustumMin = XMVectorMin( FrustumMin, Temp );
+                FrustumMax = XMVectorMax( FrustumMax, Temp );
+            }
+
+            // Project the center of the box onto the axis.
+            XMVECTOR Dist = XMVector3Dot( Center, Axis );
+
+            // Project the axes of the box onto the axis to find the "radius" of the box.
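+            // As above, radius = h(u)*|Axis . b(u)| + h(v)*|Axis . b(v)| + h(w)*|Axis . b(w)|.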
+            XMVECTOR Radius = XMVector3Dot( Axis, R.r[0] );
+            Radius = XMVectorSelect( Radius, XMVector3Dot( Axis, R.r[1] ), SelectY );
+            Radius = XMVectorSelect( Radius, XMVector3Dot( Axis, R.r[2] ), SelectZ );
+            Radius = XMVector3Dot( Extents, XMVectorAbs( Radius ) );
+
+            // if (center > max + radius || center < min - radius) reject;
+            Outside = XMVectorOrInt( Outside, XMVectorGreater( Dist, FrustumMax + Radius ) );
+            Outside = XMVectorOrInt( Outside, XMVectorLess( Dist, FrustumMin - Radius ) );
+        }
+    }
+
+    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+        return false;
+
+    // If we did not find a separating plane then the box must intersect the frustum.
+    return true;
+}
+
+
+//-----------------------------------------------------------------------------
+// Exact frustum vs frustum test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingFrustum::Intersects( const BoundingFrustum& fr ) const
+{
+    // Load origin and orientation of frustum B.
+    XMVECTOR OriginB = XMLoadFloat3( &Origin );
+    XMVECTOR OrientationB = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( OrientationB ) );
+
+    // Build the planes of frustum B.
+    XMVECTOR AxisB[6];
+    AxisB[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, 0.0f );
+    AxisB[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, 0.0f );
+    AxisB[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
+    AxisB[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
+    AxisB[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
+    AxisB[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
+
+    XMVECTOR PlaneDistB[6];
+    PlaneDistB[0] = -XMVectorReplicatePtr( &Near );
+    PlaneDistB[1] = XMVectorReplicatePtr( &Far );
+    PlaneDistB[2] = XMVectorZero();
+    PlaneDistB[3] = XMVectorZero();
+    PlaneDistB[4] = XMVectorZero();
+    PlaneDistB[5] = XMVectorZero();
+
+    // Load origin and orientation of frustum A.
+    XMVECTOR OriginA = XMLoadFloat3( &fr.Origin );
+    XMVECTOR OrientationA = XMLoadFloat4( &fr.Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( OrientationA ) );
+
+    // Transform frustum A into the space of frustum B in order to
+    // minimize the number of transforms we have to do.
+    OriginA = XMVector3InverseRotate( OriginA - OriginB, OrientationB );
+    OrientationA = XMQuaternionMultiply( OrientationA, XMQuaternionConjugate( OrientationB ) );
+
+    // Build the corners of frustum A (in the local space of B).
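+    // Corner directions ( slope_x, slope_y, 1 ) are rotated into B's frame
+    // first; each corner is then OriginA + direction * distance for the near
+    // and far distances.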
+    XMVECTOR RightTopA = XMVectorSet( fr.RightSlope, fr.TopSlope, 1.0f, 0.0f );
+    XMVECTOR RightBottomA = XMVectorSet( fr.RightSlope, fr.BottomSlope, 1.0f, 0.0f );
+    XMVECTOR LeftTopA = XMVectorSet( fr.LeftSlope, fr.TopSlope, 1.0f, 0.0f );
+    XMVECTOR LeftBottomA = XMVectorSet( fr.LeftSlope, fr.BottomSlope, 1.0f, 0.0f );
+    XMVECTOR NearA = XMVectorReplicatePtr( &fr.Near );
+    XMVECTOR FarA = XMVectorReplicatePtr( &fr.Far );
+
+    RightTopA = XMVector3Rotate( RightTopA, OrientationA );
+    RightBottomA = XMVector3Rotate( RightBottomA, OrientationA );
+    LeftTopA = XMVector3Rotate( LeftTopA, OrientationA );
+    LeftBottomA = XMVector3Rotate( LeftBottomA, OrientationA );
+
+    XMVECTOR CornersA[CORNER_COUNT];
+    CornersA[0] = OriginA + RightTopA * NearA;
+    CornersA[1] = OriginA + RightBottomA * NearA;
+    CornersA[2] = OriginA + LeftTopA * NearA;
+    CornersA[3] = OriginA + LeftBottomA * NearA;
+    CornersA[4] = OriginA + RightTopA * FarA;
+    CornersA[5] = OriginA + RightBottomA * FarA;
+    CornersA[6] = OriginA + LeftTopA * FarA;
+    CornersA[7] = OriginA + LeftBottomA * FarA;
+
+    // Check frustum A against each plane of frustum B.
+    XMVECTOR Outside = XMVectorFalseInt();
+    XMVECTOR InsideAll = XMVectorTrueInt();
+
+    for( size_t i = 0; i < 6; ++i )
+    {
+        // Find the min/max projection of the frustum onto the plane normal.
+        XMVECTOR Min, Max;
+
+        Min = Max = XMVector3Dot( AxisB[i], CornersA[0] );
+
+        for( size_t j = 1; j < CORNER_COUNT; j++ )
+        {
+            XMVECTOR Temp = XMVector3Dot( AxisB[i], CornersA[j] );
+            Min = XMVectorMin( Min, Temp );
+            Max = XMVectorMax( Max, Temp );
+        }
+
+        // Outside the plane?
+        Outside = XMVectorOrInt( Outside, XMVectorGreater( Min, PlaneDistB[i] ) );
+
+        // Fully inside the plane?
+        InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( Max, PlaneDistB[i] ) );
+    }
+
+    // If frustum A is outside any of the planes of frustum B it is outside.
+    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+        return false;
+
+    // If frustum A is inside all planes of frustum B it is fully inside.
+    if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) )
+        return true;
+
+    // Build the corners of frustum B.
+    XMVECTOR RightTopB = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR RightBottomB = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR LeftTopB = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR LeftBottomB = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR NearB = XMVectorReplicatePtr( &Near );
+    XMVECTOR FarB = XMVectorReplicatePtr( &Far );
+
+    XMVECTOR CornersB[BoundingFrustum::CORNER_COUNT];
+    CornersB[0] = RightTopB * NearB;
+    CornersB[1] = RightBottomB * NearB;
+    CornersB[2] = LeftTopB * NearB;
+    CornersB[3] = LeftBottomB * NearB;
+    CornersB[4] = RightTopB * FarB;
+    CornersB[5] = RightBottomB * FarB;
+    CornersB[6] = LeftTopB * FarB;
+    CornersB[7] = LeftBottomB * FarB;
+
+    // Build the planes of frustum A (in the local space of B).
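+    // The local plane normals are rotated into B's frame; each plane distance
+    // is then recovered by dotting the normal with a known point on that plane.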
+    XMVECTOR AxisA[6];
+    XMVECTOR PlaneDistA[6];
+
+    AxisA[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, 0.0f );
+    AxisA[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, 0.0f );
+    AxisA[2] = XMVectorSet( 1.0f, 0.0f, -fr.RightSlope, 0.0f );
+    AxisA[3] = XMVectorSet( -1.0f, 0.0f, fr.LeftSlope, 0.0f );
+    AxisA[4] = XMVectorSet( 0.0f, 1.0f, -fr.TopSlope, 0.0f );
+    AxisA[5] = XMVectorSet( 0.0f, -1.0f, fr.BottomSlope, 0.0f );
+
+    AxisA[0] = XMVector3Rotate( AxisA[0], OrientationA );
+    AxisA[1] = -AxisA[0];
+    AxisA[2] = XMVector3Rotate( AxisA[2], OrientationA );
+    AxisA[3] = XMVector3Rotate( AxisA[3], OrientationA );
+    AxisA[4] = XMVector3Rotate( AxisA[4], OrientationA );
+    AxisA[5] = XMVector3Rotate( AxisA[5], OrientationA );
+
+    PlaneDistA[0] = XMVector3Dot( AxisA[0], CornersA[0] );    // Re-use corner on near plane.
+    PlaneDistA[1] = XMVector3Dot( AxisA[1], CornersA[4] );    // Re-use corner on far plane.
+    PlaneDistA[2] = XMVector3Dot( AxisA[2], OriginA );
+    PlaneDistA[3] = XMVector3Dot( AxisA[3], OriginA );
+    PlaneDistA[4] = XMVector3Dot( AxisA[4], OriginA );
+    PlaneDistA[5] = XMVector3Dot( AxisA[5], OriginA );
+
+    // Check each axis of frustum A for a separating plane (5).
+    for( size_t i = 0; i < 6; ++i )
+    {
+        // Find the minimum projection of the frustum onto the plane normal.
+        XMVECTOR Min;
+
+        Min = XMVector3Dot( AxisA[i], CornersB[0] );
+
+        for( size_t j = 1; j < CORNER_COUNT; j++ )
+        {
+            XMVECTOR Temp = XMVector3Dot( AxisA[i], CornersB[j] );
+            Min = XMVectorMin( Min, Temp );
+        }
+
+        // Outside the plane?
+        Outside = XMVectorOrInt( Outside, XMVectorGreater( Min, PlaneDistA[i] ) );
+    }
+
+    // If frustum B is outside any of the planes of frustum A it is outside.
+    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+        return false;
+
+    // Check edge/edge axes (6 * 6).
+    XMVECTOR FrustumEdgeAxisA[6];
+    FrustumEdgeAxisA[0] = RightTopA;
+    FrustumEdgeAxisA[1] = RightBottomA;
+    FrustumEdgeAxisA[2] = LeftTopA;
+    FrustumEdgeAxisA[3] = LeftBottomA;
+    FrustumEdgeAxisA[4] = RightTopA - LeftTopA;
+    FrustumEdgeAxisA[5] = LeftBottomA - LeftTopA;
+
+    XMVECTOR FrustumEdgeAxisB[6];
+    FrustumEdgeAxisB[0] = RightTopB;
+    FrustumEdgeAxisB[1] = RightBottomB;
+    FrustumEdgeAxisB[2] = LeftTopB;
+    FrustumEdgeAxisB[3] = LeftBottomB;
+    FrustumEdgeAxisB[4] = RightTopB - LeftTopB;
+    FrustumEdgeAxisB[5] = LeftBottomB - LeftTopB;
+
+    for( size_t i = 0; i < 6; ++i )
+    {
+        for( size_t j = 0; j < 6; j++ )
+        {
+            // Compute the axis we are going to test.
+            XMVECTOR Axis = XMVector3Cross( FrustumEdgeAxisA[i], FrustumEdgeAxisB[j] );
+
+            // Find the min/max values of the projection of both frustums onto the axis.
+            XMVECTOR MinA, MaxA;
+            XMVECTOR MinB, MaxB;
+
+            MinA = MaxA = XMVector3Dot( Axis, CornersA[0] );
+            MinB = MaxB = XMVector3Dot( Axis, CornersB[0] );
+
+            for( size_t k = 1; k < CORNER_COUNT; k++ )
+            {
+                XMVECTOR TempA = XMVector3Dot( Axis, CornersA[k] );
+                MinA = XMVectorMin( MinA, TempA );
+                MaxA = XMVectorMax( MaxA, TempA );
+
+                XMVECTOR TempB = XMVector3Dot( Axis, CornersB[k] );
+                MinB = XMVectorMin( MinB, TempB );
+                MaxB = XMVectorMax( MaxB, TempB );
+            }
+
+            // if (MinA > MaxB || MinB > MaxA) reject
+            Outside = XMVectorOrInt( Outside, XMVectorGreater( MinA, MaxB ) );
+            Outside = XMVectorOrInt( Outside, XMVectorGreater( MinB, MaxA ) );
+        }
+    }
+
+    // If there is a separating plane, then the frustums do not intersect.
+    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+        return false;
+
+    // If we did not find a separating plane then the frustums intersect.
+ return true; +} + + +//----------------------------------------------------------------------------- +// Triangle vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool XM_CALLCONV BoundingFrustum::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + // Build the frustum planes (NOTE: D is negated from the usual). + XMVECTOR Planes[6]; + Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, -Near ); + Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, Far ); + Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Transform triangle into the local space of frustum. + XMVECTOR TV0 = XMVector3InverseRotate( V0 - vOrigin, vOrientation ); + XMVECTOR TV1 = XMVector3InverseRotate( V1 - vOrigin, vOrientation ); + XMVECTOR TV2 = XMVector3InverseRotate( V2 - vOrigin, vOrientation ); + + // Test each vertex of the triangle against the frustum planes. + XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + + for( size_t i = 0; i < 6; ++i ) + { + XMVECTOR Dist0 = XMVector3Dot( TV0, Planes[i] ); + XMVECTOR Dist1 = XMVector3Dot( TV1, Planes[i] ); + XMVECTOR Dist2 = XMVector3Dot( TV2, Planes[i] ); + + XMVECTOR MinDist = XMVectorMin( Dist0, Dist1 ); + MinDist = XMVectorMin( MinDist, Dist2 ); + XMVECTOR MaxDist = XMVectorMax( Dist0, Dist1 ); + MaxDist = XMVectorMax( MaxDist, Dist2 ); + + XMVECTOR PlaneDist = XMVectorSplatW( Planes[i] ); + + // Outside the plane? + Outside = XMVectorOrInt( Outside, XMVectorGreater( MinDist, PlaneDist ) ); + + // Fully inside the plane? + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( MaxDist, PlaneDist ) ); + } + + // If the triangle is outside any of the planes it is outside. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // If the triangle is inside all planes it is fully inside. + if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) ) + return true; + + // Build the corners of the frustum. + XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vNear = XMVectorReplicatePtr( &Near ); + XMVECTOR vFar = XMVectorReplicatePtr( &Far ); + + XMVECTOR Corners[CORNER_COUNT]; + Corners[0] = vRightTop * vNear; + Corners[1] = vRightBottom * vNear; + Corners[2] = vLeftTop * vNear; + Corners[3] = vLeftBottom * vNear; + Corners[4] = vRightTop * vFar; + Corners[5] = vRightBottom * vFar; + Corners[6] = vLeftTop * vFar; + Corners[7] = vLeftBottom * vFar; + + // Test the plane of the triangle. 
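+    // The triangle lies in the plane Normal . X = Dist with
+    // Normal = (V1 - V0) x (V2 - V0). If every frustum corner projects to one
+    // side of that plane, the plane separates the triangle from the frustum.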
+ XMVECTOR Normal = XMVector3Cross( V1 - V0, V2 - V0 ); + XMVECTOR Dist = XMVector3Dot( Normal, V0 ); + + XMVECTOR MinDist, MaxDist; + MinDist = MaxDist = XMVector3Dot( Corners[0], Normal ); + for( size_t i = 1; i < CORNER_COUNT; ++i ) + { + XMVECTOR Temp = XMVector3Dot( Corners[i], Normal ); + MinDist = XMVectorMin( MinDist, Temp ); + MaxDist = XMVectorMax( MaxDist, Temp ); + } + + Outside = XMVectorOrInt( XMVectorGreater( MinDist, Dist ), XMVectorLess( MaxDist, Dist ) ); + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // Check the edge/edge axes (3*6). + XMVECTOR TriangleEdgeAxis[3]; + TriangleEdgeAxis[0] = V1 - V0; + TriangleEdgeAxis[1] = V2 - V1; + TriangleEdgeAxis[2] = V0 - V2; + + XMVECTOR FrustumEdgeAxis[6]; + FrustumEdgeAxis[0] = vRightTop; + FrustumEdgeAxis[1] = vRightBottom; + FrustumEdgeAxis[2] = vLeftTop; + FrustumEdgeAxis[3] = vLeftBottom; + FrustumEdgeAxis[4] = vRightTop - vLeftTop; + FrustumEdgeAxis[5] = vLeftBottom - vLeftTop; + + for( size_t i = 0; i < 3; ++i ) + { + for( size_t j = 0; j < 6; j++ ) + { + // Compute the axis we are going to test. + XMVECTOR Axis = XMVector3Cross( TriangleEdgeAxis[i], FrustumEdgeAxis[j] ); + + // Find the min/max of the projection of the triangle onto the axis. + XMVECTOR MinA, MaxA; + + XMVECTOR Dist0 = XMVector3Dot( V0, Axis ); + XMVECTOR Dist1 = XMVector3Dot( V1, Axis ); + XMVECTOR Dist2 = XMVector3Dot( V2, Axis ); + + MinA = XMVectorMin( Dist0, Dist1 ); + MinA = XMVectorMin( MinA, Dist2 ); + MaxA = XMVectorMax( Dist0, Dist1 ); + MaxA = XMVectorMax( MaxA, Dist2 ); + + // Find the min/max of the projection of the frustum onto the axis. + XMVECTOR MinB, MaxB; + + MinB = MaxB = XMVector3Dot( Axis, Corners[0] ); + + for( size_t k = 1; k < CORNER_COUNT; k++ ) + { + XMVECTOR Temp = XMVector3Dot( Axis, Corners[k] ); + MinB = XMVectorMin( MinB, Temp ); + MaxB = XMVectorMax( MaxB, Temp ); + } + + // if (MinA > MaxB || MinB > MaxA) reject; + Outside = XMVectorOrInt( Outside, XMVectorGreater( MinA, MaxB ) ); + Outside = XMVectorOrInt( Outside, XMVectorGreater( MinB, MaxA ) ); + } + } + + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // If we did not find a separating plane then the triangle must intersect the frustum. + return true; +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline PlaneIntersectionType XM_CALLCONV BoundingFrustum::Intersects( FXMVECTOR Plane ) const +{ + assert( DirectX::Internal::XMPlaneIsUnit( Plane ) ); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Set w of the origin to one so we can dot4 with a plane. + vOrigin = XMVectorInsert<0, 0, 0, 0, 1>( vOrigin, XMVectorSplatOne() ); + + // Build the corners of the frustum (in world space). 
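+    // Corner directions are rotated by the frustum orientation and offset by
+    // its origin, so the plane test below runs in world space.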
+    XMVECTOR RightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR RightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR LeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR LeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR vNear = XMVectorReplicatePtr( &Near );
+    XMVECTOR vFar = XMVectorReplicatePtr( &Far );
+
+    RightTop = XMVector3Rotate( RightTop, vOrientation );
+    RightBottom = XMVector3Rotate( RightBottom, vOrientation );
+    LeftTop = XMVector3Rotate( LeftTop, vOrientation );
+    LeftBottom = XMVector3Rotate( LeftBottom, vOrientation );
+
+    XMVECTOR Corners0 = vOrigin + RightTop * vNear;
+    XMVECTOR Corners1 = vOrigin + RightBottom * vNear;
+    XMVECTOR Corners2 = vOrigin + LeftTop * vNear;
+    XMVECTOR Corners3 = vOrigin + LeftBottom * vNear;
+    XMVECTOR Corners4 = vOrigin + RightTop * vFar;
+    XMVECTOR Corners5 = vOrigin + RightBottom * vFar;
+    XMVECTOR Corners6 = vOrigin + LeftTop * vFar;
+    XMVECTOR Corners7 = vOrigin + LeftBottom * vFar;
+
+    XMVECTOR Outside, Inside;
+    DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3,
+                                                  Corners4, Corners5, Corners6, Corners7,
+                                                  Plane, Outside, Inside );
+
+    // If the frustum is fully outside the plane it is in front of the plane.
+    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+        return FRONT;
+
+    // If the frustum is fully inside the plane it is behind the plane.
+    if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
+        return BACK;
+
+    // The frustum is neither fully inside nor fully outside the plane, so it intersects.
+    return INTERSECTING;
+}
+
+
+//-----------------------------------------------------------------------------
+// Ray vs. frustum test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV BoundingFrustum::Intersects( FXMVECTOR rayOrigin, FXMVECTOR Direction, float& Dist ) const
+{
+    // If the ray starts inside the frustum, return a distance of 0 for the hit.
+    if ( Contains(rayOrigin) == CONTAINS )
+    {
+        Dist = 0.0f;
+        return true;
+    }
+
+    // Build the frustum planes.
+    XMVECTOR Planes[6];
+    Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
+    Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
+    Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
+    Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
+    Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
+    Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
+
+    // Load origin and orientation of the frustum.
+    XMVECTOR frOrigin = XMLoadFloat3( &Origin );
+    XMVECTOR frOrientation = XMLoadFloat4( &Orientation );
+
+    // This algorithm is based on "Fast Ray-Convex Polyhedron Intersection," in James Arvo, ed., Graphics Gems II, pp. 247-250.
+    float tnear = -FLT_MAX;
+    float tfar = FLT_MAX;
+
+    for( size_t i = 0; i < 6; ++i )
+    {
+        XMVECTOR Plane = DirectX::Internal::XMPlaneTransform( Planes[i], frOrientation, frOrigin );
+        Plane = XMPlaneNormalize( Plane );
+
+        XMVECTOR AxisDotOrigin = XMPlaneDotCoord( Plane, rayOrigin );
+        XMVECTOR AxisDotDirection = XMVector3Dot( Plane, Direction );
+
+        if ( XMVector3LessOrEqual( XMVectorAbs( AxisDotDirection ), g_RayEpsilon ) )
+        {
+            // Ray is parallel to the plane - check if the ray origin is inside the plane's half-space.
+            if ( XMVector3Greater( AxisDotOrigin, g_XMZero ) )
+            {
+                // Ray origin is outside the half-space.
+                Dist = 0.f;
+                return false;
+            }
+        }
+        else
+        {
+            // Ray not parallel - get distance to plane.
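+            // t = -(Plane . rayOrigin) / (PlaneNormal . Direction); the slab test
+            // below keeps the running [tnear, tfar] interval.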
+            float vd = XMVectorGetX( AxisDotDirection );
+            float vn = XMVectorGetX( AxisDotOrigin );
+            float t = -vn / vd;
+            if (vd < 0.0f)
+            {
+                // Front face - t is a near point.
+                if (t > tfar)
+                {
+                    Dist = 0.f;
+                    return false;
+                }
+                if (t > tnear)
+                {
+                    // Hit near face.
+                    tnear = t;
+                }
+            }
+            else
+            {
+                // Back face - t is a far point.
+                if (t < tnear)
+                {
+                    Dist = 0.f;
+                    return false;
+                }
+                if (t < tfar)
+                {
+                    // Hit far face.
+                    tfar = t;
+                }
+            }
+        }
+    }
+
+    // Survived all tests.
+    // Note: if the ray originates on the polyhedron, you may want to change 0.0f to some
+    // epsilon to avoid intersecting the originating face.
+    float distance = ( tnear >= 0.0f ) ? tnear : tfar;
+    if (distance >= 0.0f)
+    {
+        Dist = distance;
+        return true;
+    }
+
+    Dist = 0.f;
+    return false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Test a frustum vs 6 planes (typically forming another frustum).
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType XM_CALLCONV BoundingFrustum::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2,
+                                                                 GXMVECTOR Plane3, HXMVECTOR Plane4, HXMVECTOR Plane5 ) const
+{
+    // Load origin and orientation of the frustum.
+    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
+
+    // Set w of the origin to one so we can dot4 with a plane.
+    vOrigin = XMVectorInsert<0, 0, 0, 0, 1>( vOrigin, XMVectorSplatOne() );
+
+    // Build the corners of the frustum (in world space).
+    XMVECTOR RightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR RightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR LeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
+    XMVECTOR LeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
+    XMVECTOR vNear = XMVectorReplicatePtr( &Near );
+    XMVECTOR vFar = XMVectorReplicatePtr( &Far );
+
+    RightTop = XMVector3Rotate( RightTop, vOrientation );
+    RightBottom = XMVector3Rotate( RightBottom, vOrientation );
+    LeftTop = XMVector3Rotate( LeftTop, vOrientation );
+    LeftBottom = XMVector3Rotate( LeftBottom, vOrientation );
+
+    XMVECTOR Corners0 = vOrigin + RightTop * vNear;
+    XMVECTOR Corners1 = vOrigin + RightBottom * vNear;
+    XMVECTOR Corners2 = vOrigin + LeftTop * vNear;
+    XMVECTOR Corners3 = vOrigin + LeftBottom * vNear;
+    XMVECTOR Corners4 = vOrigin + RightTop * vFar;
+    XMVECTOR Corners5 = vOrigin + RightBottom * vFar;
+    XMVECTOR Corners6 = vOrigin + LeftTop * vFar;
+    XMVECTOR Corners7 = vOrigin + LeftBottom * vFar;
+
+    XMVECTOR Outside, Inside;
+
+    // Test against each plane.
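+    // Accumulate an "outside any plane" mask and an "inside all planes" mask
+    // across the six planes; the final classification reads those masks.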
+    DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3,
+                                                  Corners4, Corners5, Corners6, Corners7,
+                                                  Plane0, Outside, Inside );
+
+    XMVECTOR AnyOutside = Outside;
+    XMVECTOR AllInside = Inside;
+
+    DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3,
+                                                  Corners4, Corners5, Corners6, Corners7,
+                                                  Plane1, Outside, Inside );
+
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3,
+                                                  Corners4, Corners5, Corners6, Corners7,
+                                                  Plane2, Outside, Inside );
+
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3,
+                                                  Corners4, Corners5, Corners6, Corners7,
+                                                  Plane3, Outside, Inside );
+
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3,
+                                                  Corners4, Corners5, Corners6, Corners7,
+                                                  Plane4, Outside, Inside );
+
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3,
+                                                  Corners4, Corners5, Corners6, Corners7,
+                                                  Plane5, Outside, Inside );
+
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    // If the frustum is outside any plane it is outside.
+    if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) )
+        return DISJOINT;
+
+    // If the frustum is inside all planes it is inside.
+    if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) )
+        return CONTAINS;
+
+    // The frustum is not fully inside all planes nor fully outside any plane, so it may intersect.
+    return INTERSECTS;
+}
+
+
+//-----------------------------------------------------------------------------
+// Build the 6 frustum planes from a frustum.
+//
+// The intended use for these routines is for fast culling to a view frustum.
+// When the volume being tested against a view frustum is small relative to the
+// view frustum it is usually either inside all six planes of the frustum
+// (CONTAINS) or outside one of the planes of the frustum (DISJOINT). If neither
+// of these cases is true then it may or may not be intersecting the frustum
+// (INTERSECTS).
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingFrustum::GetPlanes( XMVECTOR* NearPlane, XMVECTOR* FarPlane, XMVECTOR* RightPlane,
+                                        XMVECTOR* LeftPlane, XMVECTOR* TopPlane, XMVECTOR* BottomPlane ) const
+{
+    // Load origin and orientation of the frustum.
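+    // Each requested plane below is the frustum-local plane transformed by the
+    // origin and orientation, then renormalized.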
+    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    if (NearPlane)
+    {
+        XMVECTOR vNearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
+        vNearPlane = DirectX::Internal::XMPlaneTransform( vNearPlane, vOrientation, vOrigin );
+        *NearPlane = XMPlaneNormalize( vNearPlane );
+    }
+
+    if (FarPlane)
+    {
+        XMVECTOR vFarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
+        vFarPlane = DirectX::Internal::XMPlaneTransform( vFarPlane, vOrientation, vOrigin );
+        *FarPlane = XMPlaneNormalize( vFarPlane );
+    }
+
+    if (RightPlane)
+    {
+        XMVECTOR vRightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
+        vRightPlane = DirectX::Internal::XMPlaneTransform( vRightPlane, vOrientation, vOrigin );
+        *RightPlane = XMPlaneNormalize( vRightPlane );
+    }
+
+    if (LeftPlane)
+    {
+        XMVECTOR vLeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
+        vLeftPlane = DirectX::Internal::XMPlaneTransform( vLeftPlane, vOrientation, vOrigin );
+        *LeftPlane = XMPlaneNormalize( vLeftPlane );
+    }
+
+    if (TopPlane)
+    {
+        XMVECTOR vTopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
+        vTopPlane = DirectX::Internal::XMPlaneTransform( vTopPlane, vOrientation, vOrigin );
+        *TopPlane = XMPlaneNormalize( vTopPlane );
+    }
+
+    if (BottomPlane)
+    {
+        XMVECTOR vBottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
+        vBottomPlane = DirectX::Internal::XMPlaneTransform( vBottomPlane, vOrientation, vOrigin );
+        *BottomPlane = XMPlaneNormalize( vBottomPlane );
+    }
+}
+
+
+//-----------------------------------------------------------------------------
+// Build a frustum from a perspective projection matrix. The matrix may only
+// contain a projection; any rotation, translation or scale will cause the
+// constructed frustum to be incorrect.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV BoundingFrustum::CreateFromMatrix( BoundingFrustum& Out, FXMMATRIX Projection )
+{
+    // Corners of the projection frustum in homogeneous space.
+    static XMVECTORF32 HomogenousPoints[6] =
+    {
+        {  1.0f,  0.0f, 1.0f, 1.0f },   // right (at far plane)
+        { -1.0f,  0.0f, 1.0f, 1.0f },   // left
+        {  0.0f,  1.0f, 1.0f, 1.0f },   // top
+        {  0.0f, -1.0f, 1.0f, 1.0f },   // bottom
+
+        {  0.0f,  0.0f, 0.0f, 1.0f },   // near
+        {  0.0f,  0.0f, 1.0f, 1.0f }    // far
+    };
+
+    XMVECTOR Determinant;
+    XMMATRIX matInverse = XMMatrixInverse( &Determinant, Projection );
+
+    // Compute the frustum corners in world space.
+    XMVECTOR Points[6];
+
+    for( size_t i = 0; i < 6; ++i )
+    {
+        // Transform point.
+        Points[i] = XMVector4Transform( HomogenousPoints[i], matInverse );
+    }
+
+    Out.Origin = XMFLOAT3( 0.0f, 0.0f, 0.0f );
+    Out.Orientation = XMFLOAT4( 0.0f, 0.0f, 0.0f, 1.0f );
+
+    // Compute the slopes.
+    Points[0] = Points[0] * XMVectorReciprocal( XMVectorSplatZ( Points[0] ) );
+    Points[1] = Points[1] * XMVectorReciprocal( XMVectorSplatZ( Points[1] ) );
+    Points[2] = Points[2] * XMVectorReciprocal( XMVectorSplatZ( Points[2] ) );
+    Points[3] = Points[3] * XMVectorReciprocal( XMVectorSplatZ( Points[3] ) );
+
+    Out.RightSlope = XMVectorGetX( Points[0] );
+    Out.LeftSlope = XMVectorGetX( Points[1] );
+    Out.TopSlope = XMVectorGetY( Points[2] );
+    Out.BottomSlope = XMVectorGetY( Points[3] );
+
+    // Compute near and far.
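+    // Perspective-divide the near and far points by w; their z components are
+    // the near and far plane distances.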
+    Points[4] = Points[4] * XMVectorReciprocal( XMVectorSplatW( Points[4] ) );
+    Points[5] = Points[5] * XMVectorReciprocal( XMVectorSplatW( Points[5] ) );
+
+    Out.Near = XMVectorGetZ( Points[4] );
+    Out.Far = XMVectorGetZ( Points[5] );
+}
+
+
+/****************************************************************************
+ *
+ * TriangleTests
+ *
+ ****************************************************************************/
+
+namespace TriangleTests
+{
+
+//-----------------------------------------------------------------------------
+// Compute the intersection of a ray (Origin, Direction) with a triangle
+// (V0, V1, V2). Return true if there is an intersection and also set Dist
+// to the distance along the ray to the intersection.
+//
+// The algorithm is based on Moller, Tomas and Trumbore, "Fast, Minimum Storage
+// Ray-Triangle Intersection", Journal of Graphics Tools, vol. 2, no. 1,
+// pp 21-28, 1997.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV Intersects( FXMVECTOR Origin, FXMVECTOR Direction, FXMVECTOR V0, GXMVECTOR V1, HXMVECTOR V2, float& Dist )
+{
+    assert( DirectX::Internal::XMVector3IsUnit( Direction ) );
+
+    XMVECTOR Zero = XMVectorZero();
+
+    XMVECTOR e1 = V1 - V0;
+    XMVECTOR e2 = V2 - V0;
+
+    // p = Direction ^ e2;
+    XMVECTOR p = XMVector3Cross( Direction, e2 );
+
+    // det = e1 * p;
+    XMVECTOR det = XMVector3Dot( e1, p );
+
+    XMVECTOR u, v, t;
+
+    if( XMVector3GreaterOrEqual( det, g_RayEpsilon ) )
+    {
+        // Determinant is positive (front side of the triangle).
+        XMVECTOR s = Origin - V0;
+
+        // u = s * p;
+        u = XMVector3Dot( s, p );
+
+        XMVECTOR NoIntersection = XMVectorLess( u, Zero );
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( u, det ) );
+
+        // q = s ^ e1;
+        XMVECTOR q = XMVector3Cross( s, e1 );
+
+        // v = Direction * q;
+        v = XMVector3Dot( Direction, q );
+
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( v, Zero ) );
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( u + v, det ) );
+
+        // t = e2 * q;
+        t = XMVector3Dot( e2, q );
+
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( t, Zero ) );
+
+        if( XMVector4EqualInt( NoIntersection, XMVectorTrueInt() ) )
+        {
+            Dist = 0.f;
+            return false;
+        }
+    }
+    else if( XMVector3LessOrEqual( det, g_RayNegEpsilon ) )
+    {
+        // Determinant is negative (back side of the triangle).
+        XMVECTOR s = Origin - V0;
+
+        // u = s * p;
+        u = XMVector3Dot( s, p );
+
+        XMVECTOR NoIntersection = XMVectorGreater( u, Zero );
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( u, det ) );
+
+        // q = s ^ e1;
+        XMVECTOR q = XMVector3Cross( s, e1 );
+
+        // v = Direction * q;
+        v = XMVector3Dot( Direction, q );
+
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( v, Zero ) );
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( u + v, det ) );
+
+        // t = e2 * q;
+        t = XMVector3Dot( e2, q );
+
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( t, Zero ) );
+
+        if ( XMVector4EqualInt( NoIntersection, XMVectorTrueInt() ) )
+        {
+            Dist = 0.f;
+            return false;
+        }
+    }
+    else
+    {
+        // Parallel ray.
+        Dist = 0.f;
+        return false;
+    }
+
+    t = XMVectorDivide( t, det );
+
+    // (u / det) and (v / det) are the barycentric coordinates of the intersection.
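+    // If the intersection point itself is needed, it can be recovered as
+    // (1 - u/det - v/det) * V0 + (u/det) * V1 + (v/det) * V2.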
+
+    // Store the x-component to Dist.
+    XMStoreFloat( &Dist, t );
+
+    return true;
+}
+
+
+//-----------------------------------------------------------------------------
+// Test if two triangles intersect.
+//
+// The final test of the algorithm is based on Shen, Heng, and Tang, "A Fast
+// Triangle-Triangle Overlap Test Using Signed Distances", Journal of Graphics
+// Tools, vol. 8, no. 1, pp 17-23, 2003 and Guigue and Devillers, "Fast and
+// Robust Triangle-Triangle Overlap Test Using Orientation Predicates", Journal
+// of Graphics Tools, vol. 8, no. 1, pp 25-32, 2003.
+//
+// The final test could be considered an edge-edge separating plane test with
+// the 9 possible cases narrowed down to the only two pairs of edges that can
+// actually result in a separation.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool XM_CALLCONV Intersects( FXMVECTOR A0, FXMVECTOR A1, FXMVECTOR A2, GXMVECTOR B0, HXMVECTOR B1, HXMVECTOR B2 )
+{
+    static const XMVECTORU32 SelectY =
+    {
+        XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0
+    };
+    static const XMVECTORU32 SelectZ =
+    {
+        XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0
+    };
+    static const XMVECTORU32 Select0111 =
+    {
+        XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_1
+    };
+    static const XMVECTORU32 Select1011 =
+    {
+        XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1
+    };
+    static const XMVECTORU32 Select1101 =
+    {
+        XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1
+    };
+
+    XMVECTOR Zero = XMVectorZero();
+
+    // Compute the normal of triangle A.
+    XMVECTOR N1 = XMVector3Cross( A1 - A0, A2 - A0 );
+
+    // Assert that the triangle is not degenerate.
+    assert( !XMVector3Equal( N1, Zero ) );
+
+    // Test points of B against the plane of A.
+    XMVECTOR BDist = XMVector3Dot( N1, B0 - A0 );
+    BDist = XMVectorSelect( BDist, XMVector3Dot( N1, B1 - A0 ), SelectY );
+    BDist = XMVectorSelect( BDist, XMVector3Dot( N1, B2 - A0 ), SelectZ );
+
+    // Ensure robustness with co-planar triangles by zeroing small distances.
+    uint32_t BDistIsZeroCR;
+    XMVECTOR BDistIsZero = XMVectorGreaterR( &BDistIsZeroCR, g_RayEpsilon, XMVectorAbs( BDist ) );
+    BDist = XMVectorSelect( BDist, Zero, BDistIsZero );
+
+    uint32_t BDistIsLessCR;
+    XMVECTOR BDistIsLess = XMVectorGreaterR( &BDistIsLessCR, Zero, BDist );
+
+    uint32_t BDistIsGreaterCR;
+    XMVECTOR BDistIsGreater = XMVectorGreaterR( &BDistIsGreaterCR, BDist, Zero );
+
+    // If all the points are on the same side we don't intersect.
+    if( XMComparisonAllTrue( BDistIsLessCR ) || XMComparisonAllTrue( BDistIsGreaterCR ) )
+        return false;
+
+    // Compute the normal of triangle B.
+    XMVECTOR N2 = XMVector3Cross( B1 - B0, B2 - B0 );
+
+    // Assert that the triangle is not degenerate.
+    assert( !XMVector3Equal( N2, Zero ) );
+
+    // Test points of A against the plane of B.
+    XMVECTOR ADist = XMVector3Dot( N2, A0 - B0 );
+    ADist = XMVectorSelect( ADist, XMVector3Dot( N2, A1 - B0 ), SelectY );
+    ADist = XMVectorSelect( ADist, XMVector3Dot( N2, A2 - B0 ), SelectZ );
+
+    // Ensure robustness with co-planar triangles by zeroing small distances.
+    uint32_t ADistIsZeroCR;
+    XMVECTOR ADistIsZero = XMVectorGreaterR( &ADistIsZeroCR, g_RayEpsilon, XMVectorAbs( ADist ) );
+    ADist = XMVectorSelect( ADist, Zero, ADistIsZero );
+
+    uint32_t ADistIsLessCR;
+    XMVECTOR ADistIsLess = XMVectorGreaterR( &ADistIsLessCR, Zero, ADist );
+
+    uint32_t ADistIsGreaterCR;
+    XMVECTOR ADistIsGreater = XMVectorGreaterR( &ADistIsGreaterCR, ADist, Zero );
+
+    // If all the points are on the same side we don't intersect.
+    if( XMComparisonAllTrue( ADistIsLessCR ) || XMComparisonAllTrue( ADistIsGreaterCR ) )
+        return false;
+
+    // Special case for co-planar triangles.
+    if( XMComparisonAllTrue( ADistIsZeroCR ) || XMComparisonAllTrue( BDistIsZeroCR ) )
+    {
+        XMVECTOR Axis, Dist, MinDist;
+
+        // Edge (A0, A1)
+        // Compute an axis perpendicular to the edge (points out).
+        Axis = XMVector3Cross( N1, A1 - A0 );
+        Dist = XMVector3Dot( Axis, A0 );
+
+        // Test points of B against the axis.
+        MinDist = XMVector3Dot( B0, Axis );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( B1, Axis ) );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( B2, Axis ) );
+        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
+            return false;
+
+        // Edge (A1, A2)
+        Axis = XMVector3Cross( N1, A2 - A1 );
+        Dist = XMVector3Dot( Axis, A1 );
+
+        MinDist = XMVector3Dot( B0, Axis );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( B1, Axis ) );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( B2, Axis ) );
+        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
+            return false;
+
+        // Edge (A2, A0)
+        Axis = XMVector3Cross( N1, A0 - A2 );
+        Dist = XMVector3Dot( Axis, A2 );
+
+        MinDist = XMVector3Dot( B0, Axis );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( B1, Axis ) );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( B2, Axis ) );
+        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
+            return false;
+
+        // Edge (B0, B1)
+        Axis = XMVector3Cross( N2, B1 - B0 );
+        Dist = XMVector3Dot( Axis, B0 );
+
+        MinDist = XMVector3Dot( A0, Axis );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( A1, Axis ) );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( A2, Axis ) );
+        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
+            return false;
+
+        // Edge (B1, B2)
+        Axis = XMVector3Cross( N2, B2 - B1 );
+        Dist = XMVector3Dot( Axis, B1 );
+
+        MinDist = XMVector3Dot( A0, Axis );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( A1, Axis ) );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( A2, Axis ) );
+        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
+            return false;
+
+        // Edge (B2, B0)
+        Axis = XMVector3Cross( N2, B0 - B2 );
+        Dist = XMVector3Dot( Axis, B2 );
+
+        MinDist = XMVector3Dot( A0, Axis );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( A1, Axis ) );
+        MinDist = XMVectorMin( MinDist, XMVector3Dot( A2, Axis ) );
+        if( XMVector4GreaterOrEqual( MinDist, Dist ) )
+            return false;
+
+        return true;
+    }
+
+    //
+    // Find the single vertex of A and B (i.e. the vertex on the opposite side
+    // of the plane from the other two) and reorder the edges so we can compute
+    // the signed edge/edge distances.
+    //
+    // if ( (V0 >= 0 && V1 < 0 && V2 < 0) ||
+    //      (V0 > 0 && V1 <= 0 && V2 <= 0) ||
+    //      (V0 <= 0 && V1 > 0 && V2 > 0) ||
+    //      (V0 < 0 && V1 >= 0 && V2 >= 0) ) then V0 is singular;
+    //
+    // If our singular vertex is not on the positive side of the plane we reverse
+    // the triangle winding so that the overlap comparisons will compare the
+    // correct edges with the correct signs.
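+    //
+    // For example, signed distances of ( +0.5, -0.2, -0.1 ) make V0 the
+    // singular vertex, lying on the positive side of the other triangle's plane.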
+ // + XMVECTOR ADistIsLessEqual = XMVectorOrInt( ADistIsLess, ADistIsZero ); + XMVECTOR ADistIsGreaterEqual = XMVectorOrInt( ADistIsGreater, ADistIsZero ); + + XMVECTOR AA0, AA1, AA2; + bool bPositiveA; + + if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreaterEqual, ADistIsLess, Select0111 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreater, ADistIsLessEqual, Select0111 ) ) ) + { + // A0 is singular, crossing from positive to negative. + AA0 = A0; AA1 = A1; AA2 = A2; + bPositiveA = true; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLessEqual, ADistIsGreater, Select0111 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLess, ADistIsGreaterEqual, Select0111 ) ) ) + { + // A0 is singular, crossing from negative to positive. + AA0 = A0; AA1 = A2; AA2 = A1; + bPositiveA = false; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreaterEqual, ADistIsLess, Select1011 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreater, ADistIsLessEqual, Select1011 ) ) ) + { + // A1 is singular, crossing from positive to negative. + AA0 = A1; AA1 = A2; AA2 = A0; + bPositiveA = true; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLessEqual, ADistIsGreater, Select1011 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLess, ADistIsGreaterEqual, Select1011 ) ) ) + { + // A1 is singular, crossing from negative to positive. + AA0 = A1; AA1 = A0; AA2 = A2; + bPositiveA = false; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreaterEqual, ADistIsLess, Select1101 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreater, ADistIsLessEqual, Select1101 ) ) ) + { + // A2 is singular, crossing from positive to negative. + AA0 = A2; AA1 = A0; AA2 = A1; + bPositiveA = true; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLessEqual, ADistIsGreater, Select1101 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLess, ADistIsGreaterEqual, Select1101 ) ) ) + { + // A2 is singular, crossing from negative to positive. + AA0 = A2; AA1 = A1; AA2 = A0; + bPositiveA = false; + } + else + { + assert( false ); + return false; + } + + XMVECTOR BDistIsLessEqual = XMVectorOrInt( BDistIsLess, BDistIsZero ); + XMVECTOR BDistIsGreaterEqual = XMVectorOrInt( BDistIsGreater, BDistIsZero ); + + XMVECTOR BB0, BB1, BB2; + bool bPositiveB; + + if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreaterEqual, BDistIsLess, Select0111 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreater, BDistIsLessEqual, Select0111 ) ) ) + { + // B0 is singular, crossing from positive to negative. + BB0 = B0; BB1 = B1; BB2 = B2; + bPositiveB = true; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLessEqual, BDistIsGreater, Select0111 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLess, BDistIsGreaterEqual, Select0111 ) ) ) + { + // B0 is singular, crossing from negative to positive. + BB0 = B0; BB1 = B2; BB2 = B1; + bPositiveB = false; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreaterEqual, BDistIsLess, Select1011 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreater, BDistIsLessEqual, Select1011 ) ) ) + { + // B1 is singular, crossing from positive to negative. 
+        BB0 = B1; BB1 = B2; BB2 = B0;
+        bPositiveB = true;
+    }
+    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLessEqual, BDistIsGreater, Select1011 ) ) ||
+             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLess, BDistIsGreaterEqual, Select1011 ) ) )
+    {
+        // B1 is singular, crossing from negative to positive.
+        BB0 = B1; BB1 = B0; BB2 = B2;
+        bPositiveB = false;
+    }
+    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreaterEqual, BDistIsLess, Select1101 ) ) ||
+             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreater, BDistIsLessEqual, Select1101 ) ) )
+    {
+        // B2 is singular, crossing from positive to negative.
+        BB0 = B2; BB1 = B0; BB2 = B1;
+        bPositiveB = true;
+    }
+    else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLessEqual, BDistIsGreater, Select1101 ) ) ||
+             DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLess, BDistIsGreaterEqual, Select1101 ) ) )
+    {
+        // B2 is singular, crossing from negative to positive.
+        BB0 = B2; BB1 = B1; BB2 = B0;
+        bPositiveB = false;
+    }
+    else
+    {
+        assert( false );
+        return false;
+    }
+
+    XMVECTOR Delta0, Delta1;
+
+    // Reverse the direction of the test depending on whether the singular vertices are
+    // the same sign or different signs.
+    if( bPositiveA ^ bPositiveB )
+    {
+        Delta0 = ( BB0 - AA0 );
+        Delta1 = ( AA0 - BB0 );
+    }
+    else
+    {
+        Delta0 = ( AA0 - BB0 );
+        Delta1 = ( BB0 - AA0 );
+    }
+
+    // Check if the triangles overlap on the line of intersection between the
+    // planes of the two triangles by finding the signed line distances.
+    XMVECTOR Dist0 = XMVector3Dot( Delta0, XMVector3Cross( ( BB2 - BB0 ), ( AA2 - AA0 ) ) );
+    if( XMVector4Greater( Dist0, Zero ) )
+        return false;
+
+    XMVECTOR Dist1 = XMVector3Dot( Delta1, XMVector3Cross( ( BB1 - BB0 ), ( AA1 - AA0 ) ) );
+    if( XMVector4Greater( Dist1, Zero ) )
+        return false;
+
+    return true;
+}
+
+
+//-----------------------------------------------------------------------------
+// Plane-triangle test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PlaneIntersectionType XM_CALLCONV Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, GXMVECTOR Plane )
+{
+    XMVECTOR One = XMVectorSplatOne();
+
+    assert( DirectX::Internal::XMPlaneIsUnit( Plane ) );
+
+    // Set w of the points to one so we can dot4 with a plane.
+    XMVECTOR TV0 = XMVectorInsert<0, 0, 0, 0, 1>(V0, One);
+    XMVECTOR TV1 = XMVectorInsert<0, 0, 0, 0, 1>(V1, One);
+    XMVECTOR TV2 = XMVectorInsert<0, 0, 0, 0, 1>(V2, One);
+
+    XMVECTOR Outside, Inside;
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane, Outside, Inside );
+
+    // If the triangle is fully outside the plane it is in front of the plane.
+    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+        return FRONT;
+
+    // If the triangle is fully inside the plane it is behind the plane.
+    if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
+        return BACK;
+
+    // The triangle is neither fully inside nor fully outside the plane, so it intersects.
+    return INTERSECTING;
+}
+
+
+//-----------------------------------------------------------------------------
+// Test a triangle vs 6 planes (typically forming a frustum).
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType XM_CALLCONV ContainedBy( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2,
+                                                GXMVECTOR Plane0, HXMVECTOR Plane1, HXMVECTOR Plane2,
+                                                CXMVECTOR Plane3, CXMVECTOR Plane4, CXMVECTOR Plane5 )
+{
+    XMVECTOR One = XMVectorSplatOne();
+
+    // Set w of the points to one so we can dot4 with a plane.
+    XMVECTOR TV0 = XMVectorInsert<0, 0, 0, 0, 1>(V0, One);
+    XMVECTOR TV1 = XMVectorInsert<0, 0, 0, 0, 1>(V1, One);
+    XMVECTOR TV2 = XMVectorInsert<0, 0, 0, 0, 1>(V2, One);
+
+    XMVECTOR Outside, Inside;
+
+    // Test against each plane.
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane0, Outside, Inside );
+
+    XMVECTOR AnyOutside = Outside;
+    XMVECTOR AllInside = Inside;
+
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane1, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane2, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane3, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane4, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane5, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    // If the triangle is outside any plane it is outside.
+    if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) )
+        return DISJOINT;
+
+    // If the triangle is inside all planes it is inside.
+    if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) )
+        return CONTAINS;
+
+    // The triangle is not fully inside all planes nor fully outside any plane, so it may intersect.
+    return INTERSECTS;
+}
+
+}; // namespace TriangleTests
+
diff --git a/Inc/DirectXColors.h b/Inc/DirectXColors.h
index c0ca2b3..13e33e7 100644
--- a/Inc/DirectXColors.h
+++ b/Inc/DirectXColors.h
@@ -1,169 +1,169 @@
-//-------------------------------------------------------------------------------------
-// DirectXColors.h -- C++ Color Math library
-//
-// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
-// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
-// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
-// PARTICULAR PURPOSE.
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#pragma once - -#include "DirectXMath.h" - -namespace DirectX -{ - -namespace Colors -{ - // Standard colors (Red/Green/Blue/Alpha) - XMGLOBALCONST XMVECTORF32 AliceBlue = {0.941176534f, 0.972549081f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 AntiqueWhite = {0.980392218f, 0.921568692f, 0.843137324f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Aqua = {0.000000000f, 1.000000000f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Aquamarine = {0.498039246f, 1.000000000f, 0.831372619f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Azure = {0.941176534f, 1.000000000f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Beige = {0.960784376f, 0.960784376f, 0.862745166f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Bisque = {1.000000000f, 0.894117713f, 0.768627524f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Black = {0.000000000f, 0.000000000f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 BlanchedAlmond = {1.000000000f, 0.921568692f, 0.803921640f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Blue = {0.000000000f, 0.000000000f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 BlueViolet = {0.541176498f, 0.168627456f, 0.886274576f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Brown = {0.647058845f, 0.164705887f, 0.164705887f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 BurlyWood = {0.870588303f, 0.721568644f, 0.529411793f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 CadetBlue = {0.372549027f, 0.619607866f, 0.627451003f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Chartreuse = {0.498039246f, 1.000000000f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Chocolate = {0.823529482f, 0.411764741f, 0.117647067f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Coral = {1.000000000f, 0.498039246f, 0.313725501f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 CornflowerBlue = {0.392156899f, 0.584313750f, 0.929411829f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Cornsilk = {1.000000000f, 0.972549081f, 0.862745166f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Crimson = {0.862745166f, 0.078431375f, 0.235294133f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Cyan = {0.000000000f, 1.000000000f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkBlue = {0.000000000f, 0.000000000f, 0.545098066f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkCyan = {0.000000000f, 0.545098066f, 0.545098066f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkGoldenrod = {0.721568644f, 0.525490224f, 0.043137256f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkGray = {0.662745118f, 0.662745118f, 0.662745118f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkGreen = {0.000000000f, 0.392156899f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkKhaki = {0.741176486f, 0.717647076f, 0.419607878f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkMagenta = {0.545098066f, 0.000000000f, 0.545098066f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkOliveGreen = {0.333333343f, 0.419607878f, 0.184313729f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkOrange = {1.000000000f, 0.549019635f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkOrchid = {0.600000024f, 0.196078449f, 0.800000072f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkRed = {0.545098066f, 0.000000000f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkSalmon = {0.913725555f, 0.588235319f, 0.478431404f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkSeaGreen = {0.560784340f, 
0.737254918f, 0.545098066f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkSlateBlue = {0.282352954f, 0.239215702f, 0.545098066f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkSlateGray = {0.184313729f, 0.309803933f, 0.309803933f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkTurquoise = {0.000000000f, 0.807843208f, 0.819607913f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DarkViolet = {0.580392182f, 0.000000000f, 0.827451050f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DeepPink = {1.000000000f, 0.078431375f, 0.576470613f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DeepSkyBlue = {0.000000000f, 0.749019623f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DimGray = {0.411764741f, 0.411764741f, 0.411764741f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 DodgerBlue = {0.117647067f, 0.564705908f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Firebrick = {0.698039234f, 0.133333340f, 0.133333340f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 FloralWhite = {1.000000000f, 0.980392218f, 0.941176534f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 ForestGreen = {0.133333340f, 0.545098066f, 0.133333340f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Fuchsia = {1.000000000f, 0.000000000f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Gainsboro = {0.862745166f, 0.862745166f, 0.862745166f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 GhostWhite = {0.972549081f, 0.972549081f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Gold = {1.000000000f, 0.843137324f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Goldenrod = {0.854902029f, 0.647058845f, 0.125490203f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Gray = {0.501960814f, 0.501960814f, 0.501960814f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Green = {0.000000000f, 0.501960814f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 GreenYellow = {0.678431392f, 1.000000000f, 0.184313729f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Honeydew = {0.941176534f, 1.000000000f, 0.941176534f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 HotPink = {1.000000000f, 0.411764741f, 0.705882370f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 IndianRed = {0.803921640f, 0.360784322f, 0.360784322f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Indigo = {0.294117659f, 0.000000000f, 0.509803951f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Ivory = {1.000000000f, 1.000000000f, 0.941176534f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Khaki = {0.941176534f, 0.901960850f, 0.549019635f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Lavender = {0.901960850f, 0.901960850f, 0.980392218f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LavenderBlush = {1.000000000f, 0.941176534f, 0.960784376f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LawnGreen = {0.486274540f, 0.988235354f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LemonChiffon = {1.000000000f, 0.980392218f, 0.803921640f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightBlue = {0.678431392f, 0.847058892f, 0.901960850f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightCoral = {0.941176534f, 0.501960814f, 0.501960814f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightCyan = {0.878431439f, 1.000000000f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightGoldenrodYellow = {0.980392218f, 0.980392218f, 0.823529482f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightGreen = {0.564705908f, 0.933333397f, 0.564705908f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightGray = {0.827451050f, 0.827451050f, 0.827451050f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightPink = {1.000000000f, 0.713725507f, 
0.756862819f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightSalmon = {1.000000000f, 0.627451003f, 0.478431404f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightSeaGreen = {0.125490203f, 0.698039234f, 0.666666687f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightSkyBlue = {0.529411793f, 0.807843208f, 0.980392218f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightSlateGray = {0.466666698f, 0.533333361f, 0.600000024f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightSteelBlue = {0.690196097f, 0.768627524f, 0.870588303f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LightYellow = {1.000000000f, 1.000000000f, 0.878431439f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Lime = {0.000000000f, 1.000000000f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 LimeGreen = {0.196078449f, 0.803921640f, 0.196078449f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Linen = {0.980392218f, 0.941176534f, 0.901960850f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Magenta = {1.000000000f, 0.000000000f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Maroon = {0.501960814f, 0.000000000f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MediumAquamarine = {0.400000036f, 0.803921640f, 0.666666687f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MediumBlue = {0.000000000f, 0.000000000f, 0.803921640f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MediumOrchid = {0.729411781f, 0.333333343f, 0.827451050f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MediumPurple = {0.576470613f, 0.439215720f, 0.858823597f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MediumSeaGreen = {0.235294133f, 0.701960802f, 0.443137288f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MediumSlateBlue = {0.482352972f, 0.407843173f, 0.933333397f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MediumSpringGreen = {0.000000000f, 0.980392218f, 0.603921592f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MediumTurquoise = {0.282352954f, 0.819607913f, 0.800000072f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MediumVioletRed = {0.780392230f, 0.082352944f, 0.521568656f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MidnightBlue = {0.098039225f, 0.098039225f, 0.439215720f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MintCream = {0.960784376f, 1.000000000f, 0.980392218f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 MistyRose = {1.000000000f, 0.894117713f, 0.882353008f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Moccasin = {1.000000000f, 0.894117713f, 0.709803939f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 NavajoWhite = {1.000000000f, 0.870588303f, 0.678431392f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Navy = {0.000000000f, 0.000000000f, 0.501960814f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 OldLace = {0.992156923f, 0.960784376f, 0.901960850f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Olive = {0.501960814f, 0.501960814f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 OliveDrab = {0.419607878f, 0.556862772f, 0.137254909f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Orange = {1.000000000f, 0.647058845f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 OrangeRed = {1.000000000f, 0.270588249f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Orchid = {0.854902029f, 0.439215720f, 0.839215755f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 PaleGoldenrod = {0.933333397f, 0.909803987f, 0.666666687f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 PaleGreen = {0.596078455f, 0.984313786f, 0.596078455f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 PaleTurquoise = {0.686274529f, 0.933333397f, 0.933333397f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 PaleVioletRed = 
{0.858823597f, 0.439215720f, 0.576470613f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 PapayaWhip = {1.000000000f, 0.937254965f, 0.835294187f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 PeachPuff = {1.000000000f, 0.854902029f, 0.725490212f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Peru = {0.803921640f, 0.521568656f, 0.247058839f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Pink = {1.000000000f, 0.752941251f, 0.796078503f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Plum = {0.866666734f, 0.627451003f, 0.866666734f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 PowderBlue = {0.690196097f, 0.878431439f, 0.901960850f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Purple = {0.501960814f, 0.000000000f, 0.501960814f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Red = {1.000000000f, 0.000000000f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 RosyBrown = {0.737254918f, 0.560784340f, 0.560784340f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 RoyalBlue = {0.254901975f, 0.411764741f, 0.882353008f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 SaddleBrown = {0.545098066f, 0.270588249f, 0.074509807f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Salmon = {0.980392218f, 0.501960814f, 0.447058856f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 SandyBrown = {0.956862807f, 0.643137276f, 0.376470625f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 SeaGreen = {0.180392161f, 0.545098066f, 0.341176480f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 SeaShell = {1.000000000f, 0.960784376f, 0.933333397f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Sienna = {0.627451003f, 0.321568638f, 0.176470593f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Silver = {0.752941251f, 0.752941251f, 0.752941251f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 SkyBlue = {0.529411793f, 0.807843208f, 0.921568692f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 SlateBlue = {0.415686309f, 0.352941185f, 0.803921640f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 SlateGray = {0.439215720f, 0.501960814f, 0.564705908f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Snow = {1.000000000f, 0.980392218f, 0.980392218f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 SpringGreen = {0.000000000f, 1.000000000f, 0.498039246f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 SteelBlue = {0.274509817f, 0.509803951f, 0.705882370f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Tan = {0.823529482f, 0.705882370f, 0.549019635f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Teal = {0.000000000f, 0.501960814f, 0.501960814f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Thistle = {0.847058892f, 0.749019623f, 0.847058892f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Tomato = {1.000000000f, 0.388235331f, 0.278431386f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Transparent = {0.000000000f, 0.000000000f, 0.000000000f, 0.000000000f}; - XMGLOBALCONST XMVECTORF32 Turquoise = {0.250980407f, 0.878431439f, 0.815686345f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Violet = {0.933333397f, 0.509803951f, 0.933333397f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Wheat = {0.960784376f, 0.870588303f, 0.701960802f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 White = {1.000000000f, 1.000000000f, 1.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 WhiteSmoke = {0.960784376f, 0.960784376f, 0.960784376f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 Yellow = {1.000000000f, 1.000000000f, 0.000000000f, 1.000000000f}; - XMGLOBALCONST XMVECTORF32 YellowGreen = {0.603921592f, 0.803921640f, 0.196078449f, 1.000000000f}; - -}; // namespace Colors - -}; // namespace DirectX - 
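These Colors constants are XMVECTORF32 values, so each converts implicitly to an XMVECTOR for math work and to a const float* for APIs that take a FLOAT[4]. A minimal usage sketch, assuming a hypothetical D3D11 device context and render-target view:

#include <d3d11.h>
#include <DirectXColors.h>

// Clear the back buffer to a named color; XMVECTORF32's
// operator const float*() supplies the FLOAT[4] the API expects.
void ClearBackBuffer(ID3D11DeviceContext* context, ID3D11RenderTargetView* rtv)
{
    context->ClearRenderTargetView(rtv, DirectX::Colors::CornflowerBlue);
}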
+//------------------------------------------------------------------------------------- +// DirectXColors.h -- C++ Color Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#include "DirectXMath.h" + +namespace DirectX +{ + +namespace Colors +{ + // Standard colors (Red/Green/Blue/Alpha) + XMGLOBALCONST XMVECTORF32 AliceBlue = {0.941176534f, 0.972549081f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 AntiqueWhite = {0.980392218f, 0.921568692f, 0.843137324f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Aqua = {0.000000000f, 1.000000000f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Aquamarine = {0.498039246f, 1.000000000f, 0.831372619f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Azure = {0.941176534f, 1.000000000f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Beige = {0.960784376f, 0.960784376f, 0.862745166f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Bisque = {1.000000000f, 0.894117713f, 0.768627524f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Black = {0.000000000f, 0.000000000f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 BlanchedAlmond = {1.000000000f, 0.921568692f, 0.803921640f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Blue = {0.000000000f, 0.000000000f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 BlueViolet = {0.541176498f, 0.168627456f, 0.886274576f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Brown = {0.647058845f, 0.164705887f, 0.164705887f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 BurlyWood = {0.870588303f, 0.721568644f, 0.529411793f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 CadetBlue = {0.372549027f, 0.619607866f, 0.627451003f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Chartreuse = {0.498039246f, 1.000000000f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Chocolate = {0.823529482f, 0.411764741f, 0.117647067f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Coral = {1.000000000f, 0.498039246f, 0.313725501f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 CornflowerBlue = {0.392156899f, 0.584313750f, 0.929411829f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Cornsilk = {1.000000000f, 0.972549081f, 0.862745166f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Crimson = {0.862745166f, 0.078431375f, 0.235294133f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Cyan = {0.000000000f, 1.000000000f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkBlue = {0.000000000f, 0.000000000f, 0.545098066f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkCyan = {0.000000000f, 0.545098066f, 0.545098066f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkGoldenrod = {0.721568644f, 0.525490224f, 0.043137256f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkGray = {0.662745118f, 0.662745118f, 0.662745118f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkGreen = {0.000000000f, 0.392156899f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkKhaki = {0.741176486f, 0.717647076f, 0.419607878f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkMagenta = {0.545098066f, 0.000000000f, 0.545098066f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkOliveGreen = {0.333333343f, 0.419607878f, 0.184313729f, 1.000000000f}; + 
XMGLOBALCONST XMVECTORF32 DarkOrange = {1.000000000f, 0.549019635f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkOrchid = {0.600000024f, 0.196078449f, 0.800000072f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkRed = {0.545098066f, 0.000000000f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkSalmon = {0.913725555f, 0.588235319f, 0.478431404f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkSeaGreen = {0.560784340f, 0.737254918f, 0.545098066f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkSlateBlue = {0.282352954f, 0.239215702f, 0.545098066f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkSlateGray = {0.184313729f, 0.309803933f, 0.309803933f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkTurquoise = {0.000000000f, 0.807843208f, 0.819607913f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DarkViolet = {0.580392182f, 0.000000000f, 0.827451050f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DeepPink = {1.000000000f, 0.078431375f, 0.576470613f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DeepSkyBlue = {0.000000000f, 0.749019623f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DimGray = {0.411764741f, 0.411764741f, 0.411764741f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 DodgerBlue = {0.117647067f, 0.564705908f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Firebrick = {0.698039234f, 0.133333340f, 0.133333340f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 FloralWhite = {1.000000000f, 0.980392218f, 0.941176534f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 ForestGreen = {0.133333340f, 0.545098066f, 0.133333340f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Fuchsia = {1.000000000f, 0.000000000f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Gainsboro = {0.862745166f, 0.862745166f, 0.862745166f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 GhostWhite = {0.972549081f, 0.972549081f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Gold = {1.000000000f, 0.843137324f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Goldenrod = {0.854902029f, 0.647058845f, 0.125490203f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Gray = {0.501960814f, 0.501960814f, 0.501960814f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Green = {0.000000000f, 0.501960814f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 GreenYellow = {0.678431392f, 1.000000000f, 0.184313729f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Honeydew = {0.941176534f, 1.000000000f, 0.941176534f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 HotPink = {1.000000000f, 0.411764741f, 0.705882370f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 IndianRed = {0.803921640f, 0.360784322f, 0.360784322f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Indigo = {0.294117659f, 0.000000000f, 0.509803951f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Ivory = {1.000000000f, 1.000000000f, 0.941176534f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Khaki = {0.941176534f, 0.901960850f, 0.549019635f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Lavender = {0.901960850f, 0.901960850f, 0.980392218f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LavenderBlush = {1.000000000f, 0.941176534f, 0.960784376f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LawnGreen = {0.486274540f, 0.988235354f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LemonChiffon = {1.000000000f, 0.980392218f, 0.803921640f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightBlue = {0.678431392f, 0.847058892f, 0.901960850f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightCoral = {0.941176534f, 0.501960814f, 0.501960814f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 
LightCyan = {0.878431439f, 1.000000000f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightGoldenrodYellow = {0.980392218f, 0.980392218f, 0.823529482f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightGreen = {0.564705908f, 0.933333397f, 0.564705908f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightGray = {0.827451050f, 0.827451050f, 0.827451050f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightPink = {1.000000000f, 0.713725507f, 0.756862819f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightSalmon = {1.000000000f, 0.627451003f, 0.478431404f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightSeaGreen = {0.125490203f, 0.698039234f, 0.666666687f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightSkyBlue = {0.529411793f, 0.807843208f, 0.980392218f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightSlateGray = {0.466666698f, 0.533333361f, 0.600000024f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightSteelBlue = {0.690196097f, 0.768627524f, 0.870588303f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LightYellow = {1.000000000f, 1.000000000f, 0.878431439f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Lime = {0.000000000f, 1.000000000f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 LimeGreen = {0.196078449f, 0.803921640f, 0.196078449f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Linen = {0.980392218f, 0.941176534f, 0.901960850f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Magenta = {1.000000000f, 0.000000000f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Maroon = {0.501960814f, 0.000000000f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MediumAquamarine = {0.400000036f, 0.803921640f, 0.666666687f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MediumBlue = {0.000000000f, 0.000000000f, 0.803921640f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MediumOrchid = {0.729411781f, 0.333333343f, 0.827451050f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MediumPurple = {0.576470613f, 0.439215720f, 0.858823597f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MediumSeaGreen = {0.235294133f, 0.701960802f, 0.443137288f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MediumSlateBlue = {0.482352972f, 0.407843173f, 0.933333397f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MediumSpringGreen = {0.000000000f, 0.980392218f, 0.603921592f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MediumTurquoise = {0.282352954f, 0.819607913f, 0.800000072f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MediumVioletRed = {0.780392230f, 0.082352944f, 0.521568656f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MidnightBlue = {0.098039225f, 0.098039225f, 0.439215720f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MintCream = {0.960784376f, 1.000000000f, 0.980392218f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 MistyRose = {1.000000000f, 0.894117713f, 0.882353008f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Moccasin = {1.000000000f, 0.894117713f, 0.709803939f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 NavajoWhite = {1.000000000f, 0.870588303f, 0.678431392f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Navy = {0.000000000f, 0.000000000f, 0.501960814f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 OldLace = {0.992156923f, 0.960784376f, 0.901960850f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Olive = {0.501960814f, 0.501960814f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 OliveDrab = {0.419607878f, 0.556862772f, 0.137254909f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Orange = {1.000000000f, 0.647058845f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 OrangeRed = {1.000000000f, 0.270588249f, 0.000000000f, 1.000000000f}; 
+ XMGLOBALCONST XMVECTORF32 Orchid = {0.854902029f, 0.439215720f, 0.839215755f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 PaleGoldenrod = {0.933333397f, 0.909803987f, 0.666666687f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 PaleGreen = {0.596078455f, 0.984313786f, 0.596078455f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 PaleTurquoise = {0.686274529f, 0.933333397f, 0.933333397f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 PaleVioletRed = {0.858823597f, 0.439215720f, 0.576470613f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 PapayaWhip = {1.000000000f, 0.937254965f, 0.835294187f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 PeachPuff = {1.000000000f, 0.854902029f, 0.725490212f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Peru = {0.803921640f, 0.521568656f, 0.247058839f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Pink = {1.000000000f, 0.752941251f, 0.796078503f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Plum = {0.866666734f, 0.627451003f, 0.866666734f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 PowderBlue = {0.690196097f, 0.878431439f, 0.901960850f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Purple = {0.501960814f, 0.000000000f, 0.501960814f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Red = {1.000000000f, 0.000000000f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 RosyBrown = {0.737254918f, 0.560784340f, 0.560784340f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 RoyalBlue = {0.254901975f, 0.411764741f, 0.882353008f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 SaddleBrown = {0.545098066f, 0.270588249f, 0.074509807f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Salmon = {0.980392218f, 0.501960814f, 0.447058856f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 SandyBrown = {0.956862807f, 0.643137276f, 0.376470625f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 SeaGreen = {0.180392161f, 0.545098066f, 0.341176480f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 SeaShell = {1.000000000f, 0.960784376f, 0.933333397f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Sienna = {0.627451003f, 0.321568638f, 0.176470593f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Silver = {0.752941251f, 0.752941251f, 0.752941251f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 SkyBlue = {0.529411793f, 0.807843208f, 0.921568692f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 SlateBlue = {0.415686309f, 0.352941185f, 0.803921640f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 SlateGray = {0.439215720f, 0.501960814f, 0.564705908f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Snow = {1.000000000f, 0.980392218f, 0.980392218f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 SpringGreen = {0.000000000f, 1.000000000f, 0.498039246f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 SteelBlue = {0.274509817f, 0.509803951f, 0.705882370f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Tan = {0.823529482f, 0.705882370f, 0.549019635f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Teal = {0.000000000f, 0.501960814f, 0.501960814f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Thistle = {0.847058892f, 0.749019623f, 0.847058892f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Tomato = {1.000000000f, 0.388235331f, 0.278431386f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Transparent = {0.000000000f, 0.000000000f, 0.000000000f, 0.000000000f}; + XMGLOBALCONST XMVECTORF32 Turquoise = {0.250980407f, 0.878431439f, 0.815686345f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Violet = {0.933333397f, 0.509803951f, 0.933333397f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Wheat = {0.960784376f, 0.870588303f, 0.701960802f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 White = {1.000000000f, 1.000000000f, 
1.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 WhiteSmoke = {0.960784376f, 0.960784376f, 0.960784376f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Yellow = {1.000000000f, 1.000000000f, 0.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 YellowGreen = {0.603921592f, 0.803921640f, 0.196078449f, 1.000000000f};
+
+}; // namespace Colors
+
+}; // namespace DirectX
+
diff --git a/Inc/DirectXMath.h b/Inc/DirectXMath.h
index a9a0d1b..36b6c0d 100644
--- a/Inc/DirectXMath.h
+++ b/Inc/DirectXMath.h
@@ -1,1992 +1,1992 @@
-//-------------------------------------------------------------------------------------
-// DirectXMath.h -- SIMD C++ Math library
-//
-// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
-// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
-// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
-// PARTICULAR PURPOSE.
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#pragma once
-
-#ifndef __cplusplus
-#error DirectX Math requires C++
-#endif
-
-#define DIRECTX_MATH_VERSION 309
-
-#if defined(_MSC_VER) && (_MSC_VER < 1800)
-#error DirectX Math requires Visual C++ 2013 or later.
-#endif
-
-#if defined(_MSC_VER) && !defined(_M_ARM) && !defined(_M_ARM64) && (!_MANAGED) && (!_M_CEE) && (!defined(_M_IX86_FP) || (_M_IX86_FP > 1)) && !defined(_XM_NO_INTRINSICS_) && !defined(_XM_VECTORCALL_)
-#define _XM_VECTORCALL_ 1
-#endif
-
-#if _XM_VECTORCALL_
-#define XM_CALLCONV __vectorcall
-#else
-#define XM_CALLCONV __fastcall
-#endif
-
-#if defined(_MSC_VER) && (_MSC_VER < 1800)
-#define XM_CTOR_DEFAULT {}
-#else
-#define XM_CTOR_DEFAULT =default;
-#endif
-
-#if defined(_MSC_VER) && (_MSC_VER < 1900)
-#define XM_CONSTEXPR const
-#else
-#define XM_CONSTEXPR constexpr
-#endif
-
-#ifndef XM_DEPRECATED
-#define XM_DEPRECATED __declspec(deprecated("This is deprecated and will be removed in a future version."))
-#endif
-
-#if !defined(_XM_F16C_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_)
-#define _XM_F16C_INTRINSICS_
-#endif
-
-#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_)
-#define _XM_AVX_INTRINSICS_
-#endif
-
-#if !defined(_XM_AVX_INTRINSICS_) && defined(__AVX__) && !defined(_XM_NO_INTRINSICS_)
-#define _XM_AVX_INTRINSICS_
-#endif
-
-#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_SSE4_INTRINSICS_)
-#define _XM_SSE4_INTRINSICS_
-#endif
-
-#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_SSE3_INTRINSICS_)
-#define _XM_SSE3_INTRINSICS_
-#endif
-
-#if defined(_XM_SSE3_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_)
-#define _XM_SSE_INTRINSICS_
-#endif
-
-#if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-#if defined(_M_IX86) || defined(_M_X64)
-#define _XM_SSE_INTRINSICS_
-#elif defined(_M_ARM) || defined(_M_ARM64)
-#define _XM_ARM_NEON_INTRINSICS_
-#elif !defined(_XM_NO_INTRINSICS_)
-#error DirectX Math does not support this target
-#endif
-#endif // !_XM_ARM_NEON_INTRINSICS_ && !_XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
-
-#pragma warning(push)
-#pragma warning(disable:4514 4820)
-// C4514/4820: Off by default noise
-#include <math.h>
-#include <float.h>
-#include <malloc.h>
-#pragma warning(pop)
-
-#ifndef _XM_NO_INTRINSICS_
-#pragma warning(push)
-#pragma warning(disable : 4987)
-// C4987: Off by default noise
-#include <intrin.h>
-#pragma warning(pop)
-
-#ifdef _XM_SSE_INTRINSICS_
-#include <xmmintrin.h>
-#include <emmintrin.h>
-
-#ifdef _XM_SSE3_INTRINSICS_
-#include <pmmintrin.h>
-#endif
-
-#ifdef _XM_SSE4_INTRINSICS_
-#include <smmintrin.h>
-#endif
-
-#ifdef _XM_AVX_INTRINSICS_
-#include <immintrin.h>
-#endif
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#ifdef _M_ARM64
-#include <arm64_neon.h>
-#else
-#include <arm_neon.h>
-#endif
-#endif
-#endif // !_XM_NO_INTRINSICS_
-
-#include <sal.h>
-#include <assert.h>
-
-#ifndef _XM_NO_ROUNDF_
-#ifdef _MSC_VER
-#include <yvals.h>
-#if defined(_CPPLIB_VER) && ( _CPPLIB_VER < 610 )
-#define _XM_NO_ROUNDF_
-#endif
-#endif
-#endif
-
-#pragma warning(push)
-#pragma warning(disable : 4005 4668)
-// C4005/4668: Old header issue
-#include <stdint.h>
-#pragma warning(pop)
-
-/****************************************************************************
- *
- * Conditional intrinsics
- *
- ****************************************************************************/
-
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-
-#if defined(_XM_NO_MOVNT_)
-#define XM_STREAM_PS( p, a ) _mm_store_ps( p, a )
-#define XM_SFENCE()
-#else
-#define XM_STREAM_PS( p, a ) _mm_stream_ps( p, a )
-#define XM_SFENCE() _mm_sfence()
-#endif
-
-#if defined(_XM_AVX_INTRINSICS_)
-#define XM_PERMUTE_PS( v, c ) _mm_permute_ps( v, c )
-#else
-#define XM_PERMUTE_PS( v, c ) _mm_shuffle_ps( v, v, c )
-#endif
-
-#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
-
-namespace DirectX
-{
-
-/****************************************************************************
- *
- * Constant definitions
- *
- ****************************************************************************/
-
-#if defined(__XNAMATH_H__) && defined(XM_PI)
-#undef XM_PI
-#undef XM_2PI
-#undef XM_1DIVPI
-#undef XM_1DIV2PI
-#undef XM_PIDIV2
-#undef XM_PIDIV4
-#undef XM_SELECT_0
-#undef XM_SELECT_1
-#undef XM_PERMUTE_0X
-#undef XM_PERMUTE_0Y
-#undef XM_PERMUTE_0Z
-#undef XM_PERMUTE_0W
-#undef XM_PERMUTE_1X
-#undef XM_PERMUTE_1Y
-#undef XM_PERMUTE_1Z
-#undef XM_PERMUTE_1W
-#undef XM_CRMASK_CR6
-#undef XM_CRMASK_CR6TRUE
-#undef XM_CRMASK_CR6FALSE
-#undef XM_CRMASK_CR6BOUNDS
-#undef XM_CACHE_LINE_SIZE
-#endif
-
-XM_CONSTEXPR float XM_PI = 3.141592654f;
-XM_CONSTEXPR float XM_2PI = 6.283185307f;
-XM_CONSTEXPR float XM_1DIVPI = 0.318309886f;
-XM_CONSTEXPR float XM_1DIV2PI = 0.159154943f;
-XM_CONSTEXPR float XM_PIDIV2 = 1.570796327f;
-XM_CONSTEXPR float XM_PIDIV4 = 0.785398163f;
-
-XM_CONSTEXPR uint32_t XM_SELECT_0 = 0x00000000;
-XM_CONSTEXPR uint32_t XM_SELECT_1 = 0xFFFFFFFF;
-
-XM_CONSTEXPR uint32_t XM_PERMUTE_0X = 0;
-XM_CONSTEXPR uint32_t XM_PERMUTE_0Y = 1;
-XM_CONSTEXPR uint32_t XM_PERMUTE_0Z = 2;
-XM_CONSTEXPR uint32_t XM_PERMUTE_0W = 3;
-XM_CONSTEXPR uint32_t XM_PERMUTE_1X = 4;
-XM_CONSTEXPR uint32_t XM_PERMUTE_1Y = 5;
-XM_CONSTEXPR uint32_t XM_PERMUTE_1Z = 6;
-XM_CONSTEXPR uint32_t XM_PERMUTE_1W = 7;
-
-XM_CONSTEXPR uint32_t XM_SWIZZLE_X = 0;
-XM_CONSTEXPR uint32_t XM_SWIZZLE_Y = 1;
-XM_CONSTEXPR uint32_t XM_SWIZZLE_Z = 2;
-XM_CONSTEXPR uint32_t XM_SWIZZLE_W = 3;
-
-XM_CONSTEXPR uint32_t XM_CRMASK_CR6 = 0x000000F0;
-XM_CONSTEXPR uint32_t XM_CRMASK_CR6TRUE = 0x00000080;
-XM_CONSTEXPR uint32_t XM_CRMASK_CR6FALSE = 0x00000020;
-XM_CONSTEXPR uint32_t XM_CRMASK_CR6BOUNDS = XM_CRMASK_CR6FALSE;
-
-XM_CONSTEXPR size_t XM_CACHE_LINE_SIZE = 64;
-
-
-/****************************************************************************
- *
- * Macros
- *
- ****************************************************************************/
-
-#if defined(__XNAMATH_H__) && defined(XMComparisonAllTrue)
-#undef XMComparisonAllTrue
-#undef XMComparisonAnyTrue
-#undef XMComparisonAllFalse
-#undef XMComparisonAnyFalse
-#undef XMComparisonMixed
-#undef XMComparisonAllInBounds
-#undef XMComparisonAnyOutOfBounds
-#endif
-
-// Unit conversion
-
-inline XM_CONSTEXPR float XMConvertToRadians(float fDegrees) { return fDegrees * (XM_PI / 180.0f); }
-inline XM_CONSTEXPR float XMConvertToDegrees(float fRadians) { return fRadians * (180.0f / XM_PI); }
-
-// Condition register evaluation following a recording (R) comparison
-
-inline bool XMComparisonAllTrue(uint32_t CR) { return (((CR) & XM_CRMASK_CR6TRUE) == XM_CRMASK_CR6TRUE); }
-inline bool XMComparisonAnyTrue(uint32_t CR) { return (((CR) & XM_CRMASK_CR6FALSE) != XM_CRMASK_CR6FALSE); }
-inline bool XMComparisonAllFalse(uint32_t CR) { return (((CR) & XM_CRMASK_CR6FALSE) == XM_CRMASK_CR6FALSE); }
-inline bool XMComparisonAnyFalse(uint32_t CR) { return (((CR) & XM_CRMASK_CR6TRUE) != XM_CRMASK_CR6TRUE); }
-inline bool XMComparisonMixed(uint32_t CR) { return (((CR) & XM_CRMASK_CR6) == 0); }
-inline bool XMComparisonAllInBounds(uint32_t CR) { return (((CR) & XM_CRMASK_CR6BOUNDS) == XM_CRMASK_CR6BOUNDS); }
-inline bool XMComparisonAnyOutOfBounds(uint32_t CR) { return (((CR) & XM_CRMASK_CR6BOUNDS) != XM_CRMASK_CR6BOUNDS); }
-
-
-/****************************************************************************
- *
- * Data types
- *
- ****************************************************************************/
-
-#pragma warning(push)
-#pragma warning(disable:4068 4201 4365 4324 4820)
-// C4068: ignore unknown pragmas
-// C4201: nonstandard extension used : nameless struct/union
-// C4365: Off by default noise
-// C4324/4820: padding warnings
-
-#pragma prefast(push)
-#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes")
-
-//------------------------------------------------------------------------------
-#if defined(_XM_NO_INTRINSICS_)
-struct __vector4
-{
-    union
-    {
-        float vector4_f32[4];
-        uint32_t vector4_u32[4];
-    };
-};
-#endif // _XM_NO_INTRINSICS_
-
-//------------------------------------------------------------------------------
-// Vector intrinsic: Four 32 bit floating point components aligned on a 16 byte
-// boundary and mapped to hardware vector registers
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-typedef __m128 XMVECTOR;
-#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-typedef float32x4_t XMVECTOR;
-#else
-typedef __vector4 XMVECTOR;
-#endif
-
-// Fix-up for (1st-3rd) XMVECTOR parameters that are pass-in-register for x86, ARM, ARM64, and vector call; by reference otherwise
-#if ( defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64) || _XM_VECTORCALL_ ) && !defined(_XM_NO_INTRINSICS_)
-typedef const XMVECTOR FXMVECTOR;
-#else
-typedef const XMVECTOR& FXMVECTOR;
-#endif
-
-// Fix-up for (4th) XMVECTOR parameter to pass in-register for ARM, ARM64, and x64 vector call; by reference otherwise
-#if ( defined(_M_ARM) || defined(_M_ARM64) || (_XM_VECTORCALL_ && !defined(_M_IX86) ) ) && !defined(_XM_NO_INTRINSICS_)
-typedef const XMVECTOR GXMVECTOR;
-#else
-typedef const XMVECTOR& GXMVECTOR;
-#endif
-
-// Fix-up for (5th & 6th) XMVECTOR parameter to pass in-register for ARM64 and vector call; by reference otherwise
-#if ( defined(_M_ARM64) || _XM_VECTORCALL_ ) && !defined(_XM_NO_INTRINSICS_)
-typedef const XMVECTOR HXMVECTOR;
-#else
-typedef const XMVECTOR& HXMVECTOR;
-#endif
-
-// Fix-up for (7th+) XMVECTOR parameters to pass by reference
-typedef const XMVECTOR& CXMVECTOR;
-
-//------------------------------------------------------------------------------
-// Conversion types for constants
-__declspec(align(16)) struct XMVECTORF32
-{
-    union
-    {
-        float f[4];
-        XMVECTOR
v; - }; - - inline operator XMVECTOR() const { return v; } - inline operator const float*() const { return f; } -#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) - inline operator __m128i() const { return _mm_castps_si128(v); } - inline operator __m128d() const { return _mm_castps_pd(v); } -#endif -}; - -__declspec(align(16)) struct XMVECTORI32 -{ - union - { - int32_t i[4]; - XMVECTOR v; - }; - - inline operator XMVECTOR() const { return v; } -#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) - inline operator __m128i() const { return _mm_castps_si128(v); } - inline operator __m128d() const { return _mm_castps_pd(v); } -#endif -}; - -__declspec(align(16)) struct XMVECTORU8 -{ - union - { - uint8_t u[16]; - XMVECTOR v; - }; - - inline operator XMVECTOR() const { return v; } -#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) - inline operator __m128i() const { return _mm_castps_si128(v); } - inline operator __m128d() const { return _mm_castps_pd(v); } -#endif -}; - -__declspec(align(16)) struct XMVECTORU32 -{ - union - { - uint32_t u[4]; - XMVECTOR v; - }; - - inline operator XMVECTOR() const { return v; } -#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) - inline operator __m128i() const { return _mm_castps_si128(v); } - inline operator __m128d() const { return _mm_castps_pd(v); } -#endif -}; - -//------------------------------------------------------------------------------ -// Vector operators -XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V); -XMVECTOR XM_CALLCONV operator- (FXMVECTOR V); - -XMVECTOR& XM_CALLCONV operator+= (XMVECTOR& V1, FXMVECTOR V2); -XMVECTOR& XM_CALLCONV operator-= (XMVECTOR& V1, FXMVECTOR V2); -XMVECTOR& XM_CALLCONV operator*= (XMVECTOR& V1, FXMVECTOR V2); -XMVECTOR& XM_CALLCONV operator/= (XMVECTOR& V1, FXMVECTOR V2); - -XMVECTOR& operator*= (XMVECTOR& V, float S); -XMVECTOR& operator/= (XMVECTOR& V, float S); - -XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V1, FXMVECTOR V2); -XMVECTOR XM_CALLCONV operator- (FXMVECTOR V1, FXMVECTOR V2); -XMVECTOR XM_CALLCONV operator* (FXMVECTOR V1, FXMVECTOR V2); -XMVECTOR XM_CALLCONV operator/ (FXMVECTOR V1, FXMVECTOR V2); -XMVECTOR XM_CALLCONV operator* (FXMVECTOR V, float S); -XMVECTOR XM_CALLCONV operator* (float S, FXMVECTOR V); -XMVECTOR XM_CALLCONV operator/ (FXMVECTOR V, float S); - -//------------------------------------------------------------------------------ -// Matrix type: Sixteen 32 bit floating point components aligned on a -// 16 byte boundary and mapped to four hardware vector registers - -struct XMMATRIX; - -// Fix-up for (1st) XMMATRIX parameter to pass in-register for ARM64 and vector call; by reference otherwise -#if ( defined(_M_ARM64) || _XM_VECTORCALL_ ) && !defined(_XM_NO_INTRINSICS_) -typedef const XMMATRIX FXMMATRIX; -#else -typedef const XMMATRIX& FXMMATRIX; -#endif - -// Fix-up for (2nd+) XMMATRIX parameters to pass by reference -typedef const XMMATRIX& CXMMATRIX; - -#ifdef _XM_NO_INTRINSICS_ -struct XMMATRIX -#else -__declspec(align(16)) struct XMMATRIX -#endif -{ -#ifdef _XM_NO_INTRINSICS_ - union - { - XMVECTOR r[4]; - struct - { - float _11, _12, _13, _14; - float _21, _22, _23, _24; - float _31, _32, _33, _34; - float _41, _42, _43, _44; - }; - float m[4][4]; - }; -#else - XMVECTOR r[4]; -#endif - - XMMATRIX() XM_CTOR_DEFAULT -#if defined(_MSC_VER) && _MSC_VER >= 1900 - constexpr XMMATRIX(FXMVECTOR R0, FXMVECTOR R1, FXMVECTOR R2, CXMVECTOR R3) : r{ R0,R1,R2,R3 } {} -#else - XMMATRIX(FXMVECTOR R0, FXMVECTOR R1, FXMVECTOR R2, CXMVECTOR R3) { 
r[0] = R0; r[1] = R1; r[2] = R2; r[3] = R3; } -#endif - XMMATRIX(float m00, float m01, float m02, float m03, - float m10, float m11, float m12, float m13, - float m20, float m21, float m22, float m23, - float m30, float m31, float m32, float m33); - explicit XMMATRIX(_In_reads_(16) const float *pArray); - -#ifdef _XM_NO_INTRINSICS_ - float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } - float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } -#endif - - XMMATRIX& operator= (const XMMATRIX& M) { r[0] = M.r[0]; r[1] = M.r[1]; r[2] = M.r[2]; r[3] = M.r[3]; return *this; } - - XMMATRIX operator+ () const { return *this; } - XMMATRIX operator- () const; - - XMMATRIX& XM_CALLCONV operator+= (FXMMATRIX M); - XMMATRIX& XM_CALLCONV operator-= (FXMMATRIX M); - XMMATRIX& XM_CALLCONV operator*= (FXMMATRIX M); - XMMATRIX& operator*= (float S); - XMMATRIX& operator/= (float S); - - XMMATRIX XM_CALLCONV operator+ (FXMMATRIX M) const; - XMMATRIX XM_CALLCONV operator- (FXMMATRIX M) const; - XMMATRIX XM_CALLCONV operator* (FXMMATRIX M) const; - XMMATRIX operator* (float S) const; - XMMATRIX operator/ (float S) const; - - friend XMMATRIX XM_CALLCONV operator* (float S, FXMMATRIX M); -}; - -//------------------------------------------------------------------------------ -// 2D Vector; 32 bit floating point components -struct XMFLOAT2 -{ - float x; - float y; - - XMFLOAT2() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT2(float _x, float _y) : x(_x), y(_y) {} - explicit XMFLOAT2(_In_reads_(2) const float *pArray) : x(pArray[0]), y(pArray[1]) {} - - XMFLOAT2& operator= (const XMFLOAT2& Float2) { x = Float2.x; y = Float2.y; return *this; } -}; - -// 2D Vector; 32 bit floating point components aligned on a 16 byte boundary -__declspec(align(16)) struct XMFLOAT2A : public XMFLOAT2 -{ - XMFLOAT2A() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT2A(float _x, float _y) : XMFLOAT2(_x, _y) {} - explicit XMFLOAT2A(_In_reads_(2) const float *pArray) : XMFLOAT2(pArray) {} - - XMFLOAT2A& operator= (const XMFLOAT2A& Float2) { x = Float2.x; y = Float2.y; return *this; } -}; - -//------------------------------------------------------------------------------ -// 2D Vector; 32 bit signed integer components -struct XMINT2 -{ - int32_t x; - int32_t y; - - XMINT2() XM_CTOR_DEFAULT - XM_CONSTEXPR XMINT2(int32_t _x, int32_t _y) : x(_x), y(_y) {} - explicit XMINT2(_In_reads_(2) const int32_t *pArray) : x(pArray[0]), y(pArray[1]) {} - - XMINT2& operator= (const XMINT2& Int2) { x = Int2.x; y = Int2.y; return *this; } -}; - -// 2D Vector; 32 bit unsigned integer components -struct XMUINT2 -{ - uint32_t x; - uint32_t y; - - XMUINT2() XM_CTOR_DEFAULT - XM_CONSTEXPR XMUINT2(uint32_t _x, uint32_t _y) : x(_x), y(_y) {} - explicit XMUINT2(_In_reads_(2) const uint32_t *pArray) : x(pArray[0]), y(pArray[1]) {} - - XMUINT2& operator= (const XMUINT2& UInt2) { x = UInt2.x; y = UInt2.y; return *this; } -}; - -//------------------------------------------------------------------------------ -// 3D Vector; 32 bit floating point components -struct XMFLOAT3 -{ - float x; - float y; - float z; - - XMFLOAT3() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT3(float _x, float _y, float _z) : x(_x), y(_y), z(_z) {} - explicit XMFLOAT3(_In_reads_(3) const float *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} - - XMFLOAT3& operator= (const XMFLOAT3& Float3) { x = Float3.x; y = Float3.y; z = Float3.z; return *this; } -}; - -// 3D Vector; 32 bit floating point components aligned on a 16 byte boundary -__declspec(align(16)) struct XMFLOAT3A : 
public XMFLOAT3 -{ - XMFLOAT3A() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT3A(float _x, float _y, float _z) : XMFLOAT3(_x, _y, _z) {} - explicit XMFLOAT3A(_In_reads_(3) const float *pArray) : XMFLOAT3(pArray) {} - - XMFLOAT3A& operator= (const XMFLOAT3A& Float3) { x = Float3.x; y = Float3.y; z = Float3.z; return *this; } -}; - -//------------------------------------------------------------------------------ -// 3D Vector; 32 bit signed integer components -struct XMINT3 -{ - int32_t x; - int32_t y; - int32_t z; - - XMINT3() XM_CTOR_DEFAULT - XM_CONSTEXPR XMINT3(int32_t _x, int32_t _y, int32_t _z) : x(_x), y(_y), z(_z) {} - explicit XMINT3(_In_reads_(3) const int32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} - - XMINT3& operator= (const XMINT3& i3) { x = i3.x; y = i3.y; z = i3.z; return *this; } -}; - -// 3D Vector; 32 bit unsigned integer components -struct XMUINT3 -{ - uint32_t x; - uint32_t y; - uint32_t z; - - XMUINT3() XM_CTOR_DEFAULT - XM_CONSTEXPR XMUINT3(uint32_t _x, uint32_t _y, uint32_t _z) : x(_x), y(_y), z(_z) {} - explicit XMUINT3(_In_reads_(3) const uint32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} - - XMUINT3& operator= (const XMUINT3& u3) { x = u3.x; y = u3.y; z = u3.z; return *this; } -}; - -//------------------------------------------------------------------------------ -// 4D Vector; 32 bit floating point components -struct XMFLOAT4 -{ - float x; - float y; - float z; - float w; - - XMFLOAT4() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT4(float _x, float _y, float _z, float _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XMFLOAT4(_In_reads_(4) const float *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - - XMFLOAT4& operator= (const XMFLOAT4& Float4) { x = Float4.x; y = Float4.y; z = Float4.z; w = Float4.w; return *this; } -}; - -// 4D Vector; 32 bit floating point components aligned on a 16 byte boundary -__declspec(align(16)) struct XMFLOAT4A : public XMFLOAT4 -{ - XMFLOAT4A() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT4A(float _x, float _y, float _z, float _w) : XMFLOAT4(_x, _y, _z, _w) {} - explicit XMFLOAT4A(_In_reads_(4) const float *pArray) : XMFLOAT4(pArray) {} - - XMFLOAT4A& operator= (const XMFLOAT4A& Float4) { x = Float4.x; y = Float4.y; z = Float4.z; w = Float4.w; return *this; } -}; - -//------------------------------------------------------------------------------ -// 4D Vector; 32 bit signed integer components -struct XMINT4 -{ - int32_t x; - int32_t y; - int32_t z; - int32_t w; - - XMINT4() XM_CTOR_DEFAULT - XM_CONSTEXPR XMINT4(int32_t _x, int32_t _y, int32_t _z, int32_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XMINT4(_In_reads_(4) const int32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - - XMINT4& operator= (const XMINT4& Int4) { x = Int4.x; y = Int4.y; z = Int4.z; w = Int4.w; return *this; } -}; - -// 4D Vector; 32 bit unsigned integer components -struct XMUINT4 -{ - uint32_t x; - uint32_t y; - uint32_t z; - uint32_t w; - - XMUINT4() XM_CTOR_DEFAULT - XM_CONSTEXPR XMUINT4(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XMUINT4(_In_reads_(4) const uint32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - - XMUINT4& operator= (const XMUINT4& UInt4) { x = UInt4.x; y = UInt4.y; z = UInt4.z; w = UInt4.w; return *this; } -}; - -//------------------------------------------------------------------------------ -// 3x3 Matrix: 32 bit floating point components -struct XMFLOAT3X3 -{ - union - { - struct - { - float _11, _12, _13; - 
float _21, _22, _23; - float _31, _32, _33; - }; - float m[3][3]; - }; - - XMFLOAT3X3() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT3X3(float m00, float m01, float m02, - float m10, float m11, float m12, - float m20, float m21, float m22) - : _11(m00), _12(m01), _13(m02), - _21(m10), _22(m11), _23(m12), - _31(m20), _32(m21), _33(m22) {} - explicit XMFLOAT3X3(_In_reads_(9) const float *pArray); - - float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } - float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } - - XMFLOAT3X3& operator= (const XMFLOAT3X3& Float3x3); -}; - -//------------------------------------------------------------------------------ -// 4x3 Matrix: 32 bit floating point components -struct XMFLOAT4X3 -{ - union - { - struct - { - float _11, _12, _13; - float _21, _22, _23; - float _31, _32, _33; - float _41, _42, _43; - }; - float m[4][3]; - }; - - XMFLOAT4X3() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT4X3(float m00, float m01, float m02, - float m10, float m11, float m12, - float m20, float m21, float m22, - float m30, float m31, float m32) - : _11(m00), _12(m01), _13(m02), - _21(m10), _22(m11), _23(m12), - _31(m20), _32(m21), _33(m22), - _41(m30), _42(m31), _43(m32) {} - explicit XMFLOAT4X3(_In_reads_(12) const float *pArray); - - float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } - float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } - - XMFLOAT4X3& operator= (const XMFLOAT4X3& Float4x3); - -}; - -// 4x3 Matrix: 32 bit floating point components aligned on a 16 byte boundary -__declspec(align(16)) struct XMFLOAT4X3A : public XMFLOAT4X3 -{ - XMFLOAT4X3A() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT4X3A(float m00, float m01, float m02, - float m10, float m11, float m12, - float m20, float m21, float m22, - float m30, float m31, float m32) : - XMFLOAT4X3(m00,m01,m02,m10,m11,m12,m20,m21,m22,m30,m31,m32) {} - explicit XMFLOAT4X3A(_In_reads_(12) const float *pArray) : XMFLOAT4X3(pArray) {} - - float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } - float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } - - XMFLOAT4X3A& operator= (const XMFLOAT4X3A& Float4x3); -}; - -//------------------------------------------------------------------------------ -// 4x4 Matrix: 32 bit floating point components -struct XMFLOAT4X4 -{ - union - { - struct - { - float _11, _12, _13, _14; - float _21, _22, _23, _24; - float _31, _32, _33, _34; - float _41, _42, _43, _44; - }; - float m[4][4]; - }; - - XMFLOAT4X4() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT4X4(float m00, float m01, float m02, float m03, - float m10, float m11, float m12, float m13, - float m20, float m21, float m22, float m23, - float m30, float m31, float m32, float m33) - : _11(m00), _12(m01), _13(m02), _14(m03), - _21(m10), _22(m11), _23(m12), _24(m13), - _31(m20), _32(m21), _33(m22), _34(m23), - _41(m30), _42(m31), _43(m32), _44(m33) {} - explicit XMFLOAT4X4(_In_reads_(16) const float *pArray); - - float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } - float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } - - XMFLOAT4X4& operator= (const XMFLOAT4X4& Float4x4); -}; - -// 4x4 Matrix: 32 bit floating point components aligned on a 16 byte boundary -__declspec(align(16)) struct XMFLOAT4X4A : public XMFLOAT4X4 -{ - XMFLOAT4X4A() XM_CTOR_DEFAULT - XM_CONSTEXPR XMFLOAT4X4A(float m00, float m01, float m02, float m03, - float m10, float m11, float m12, float m13, - float m20, float m21, 
float m22, float m23,
-        float m30, float m31, float m32, float m33)
-        : XMFLOAT4X4(m00,m01,m02,m03,m10,m11,m12,m13,m20,m21,m22,m23,m30,m31,m32,m33) {}
-    explicit XMFLOAT4X4A(_In_reads_(16) const float *pArray) : XMFLOAT4X4(pArray) {}
-
-    float operator() (size_t Row, size_t Column) const { return m[Row][Column]; }
-    float& operator() (size_t Row, size_t Column) { return m[Row][Column]; }
-
-    XMFLOAT4X4A& operator= (const XMFLOAT4X4A& Float4x4);
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-#pragma prefast(pop)
-#pragma warning(pop)
-
-/****************************************************************************
- *
- * Data conversion operations
- *
- ****************************************************************************/
-
-XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat(FXMVECTOR VInt, uint32_t DivExponent);
-XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt(FXMVECTOR VFloat, uint32_t MulExponent);
-XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat(FXMVECTOR VUInt, uint32_t DivExponent);
-XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt(FXMVECTOR VFloat, uint32_t MulExponent);
-
-#if defined(__XNAMATH_H__) && defined(XMVectorSetBinaryConstant)
-#undef XMVectorSetBinaryConstant
-#undef XMVectorSplatConstant
-#undef XMVectorSplatConstantInt
-#endif
-
-XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3);
-XMVECTOR XM_CALLCONV XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent);
-XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant);
-
-/****************************************************************************
- *
- * Load operations
- *
- ****************************************************************************/
-
-XMVECTOR XM_CALLCONV XMLoadInt(_In_ const uint32_t* pSource);
-XMVECTOR XM_CALLCONV XMLoadFloat(_In_ const float* pSource);
-
-XMVECTOR XM_CALLCONV XMLoadInt2(_In_reads_(2) const uint32_t* pSource);
-XMVECTOR XM_CALLCONV XMLoadInt2A(_In_reads_(2) const uint32_t* pSource);
-XMVECTOR XM_CALLCONV XMLoadFloat2(_In_ const XMFLOAT2* pSource);
-XMVECTOR XM_CALLCONV XMLoadFloat2A(_In_ const XMFLOAT2A* pSource);
-XMVECTOR XM_CALLCONV XMLoadSInt2(_In_ const XMINT2* pSource);
-XMVECTOR XM_CALLCONV XMLoadUInt2(_In_ const XMUINT2* pSource);
-
-XMVECTOR XM_CALLCONV XMLoadInt3(_In_reads_(3) const uint32_t* pSource);
-XMVECTOR XM_CALLCONV XMLoadInt3A(_In_reads_(3) const uint32_t* pSource);
-XMVECTOR XM_CALLCONV XMLoadFloat3(_In_ const XMFLOAT3* pSource);
-XMVECTOR XM_CALLCONV XMLoadFloat3A(_In_ const XMFLOAT3A* pSource);
-XMVECTOR XM_CALLCONV XMLoadSInt3(_In_ const XMINT3* pSource);
-XMVECTOR XM_CALLCONV XMLoadUInt3(_In_ const XMUINT3* pSource);
-
-XMVECTOR XM_CALLCONV XMLoadInt4(_In_reads_(4) const uint32_t* pSource);
-XMVECTOR XM_CALLCONV XMLoadInt4A(_In_reads_(4) const uint32_t* pSource);
-XMVECTOR XM_CALLCONV XMLoadFloat4(_In_ const XMFLOAT4* pSource);
-XMVECTOR XM_CALLCONV XMLoadFloat4A(_In_ const XMFLOAT4A* pSource);
-XMVECTOR XM_CALLCONV XMLoadSInt4(_In_ const XMINT4* pSource);
-XMVECTOR XM_CALLCONV XMLoadUInt4(_In_ const XMUINT4* pSource);
-
-XMMATRIX XM_CALLCONV XMLoadFloat3x3(_In_ const XMFLOAT3X3* pSource);
-XMMATRIX XM_CALLCONV XMLoadFloat4x3(_In_ const XMFLOAT4X3* pSource);
-XMMATRIX XM_CALLCONV XMLoadFloat4x3A(_In_ const XMFLOAT4X3A* pSource);
-XMMATRIX XM_CALLCONV XMLoadFloat4x4(_In_ const XMFLOAT4X4* pSource);
-XMMATRIX XM_CALLCONV XMLoadFloat4x4A(_In_ const XMFLOAT4X4A* pSource);
-
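These load functions pair with the store functions declared below: typical DirectXMath code unpacks a storage type (XMFLOAT3, XMFLOAT4X4, ...) into the XMVECTOR/XMMATRIX register types, does all arithmetic there, then stores the result back. A minimal sketch of that round trip, using a hypothetical helper named NormalizeStored:

#include <DirectXMath.h>

DirectX::XMFLOAT3 NormalizeStored(const DirectX::XMFLOAT3& value)
{
    using namespace DirectX;
    XMVECTOR v = XMLoadFloat3(&value); // unpack 12-byte storage into a 16-byte register
    v = XMVector3Normalize(v);         // compute only on the register type
    XMFLOAT3 result;
    XMStoreFloat3(&result, v);         // pack the result back into the storage type
    return result;
}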
-/**************************************************************************** - * - * Store operations - * - ****************************************************************************/ - -void XM_CALLCONV XMStoreInt(_Out_ uint32_t* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreFloat(_Out_ float* pDestination, _In_ FXMVECTOR V); - -void XM_CALLCONV XMStoreInt2(_Out_writes_(2) uint32_t* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreInt2A(_Out_writes_(2) uint32_t* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreFloat2(_Out_ XMFLOAT2* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreFloat2A(_Out_ XMFLOAT2A* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreSInt2(_Out_ XMINT2* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUInt2(_Out_ XMUINT2* pDestination, _In_ FXMVECTOR V); - -void XM_CALLCONV XMStoreInt3(_Out_writes_(3) uint32_t* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreInt3A(_Out_writes_(3) uint32_t* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreFloat3(_Out_ XMFLOAT3* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreFloat3A(_Out_ XMFLOAT3A* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreSInt3(_Out_ XMINT3* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUInt3(_Out_ XMUINT3* pDestination, _In_ FXMVECTOR V); - -void XM_CALLCONV XMStoreInt4(_Out_writes_(4) uint32_t* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreInt4A(_Out_writes_(4) uint32_t* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreFloat4(_Out_ XMFLOAT4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreFloat4A(_Out_ XMFLOAT4A* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreSInt4(_Out_ XMINT4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUInt4(_Out_ XMUINT4* pDestination, _In_ FXMVECTOR V); - -void XM_CALLCONV XMStoreFloat3x3(_Out_ XMFLOAT3X3* pDestination, _In_ FXMMATRIX M); -void XM_CALLCONV XMStoreFloat4x3(_Out_ XMFLOAT4X3* pDestination, _In_ FXMMATRIX M); -void XM_CALLCONV XMStoreFloat4x3A(_Out_ XMFLOAT4X3A* pDestination, _In_ FXMMATRIX M); -void XM_CALLCONV XMStoreFloat4x4(_Out_ XMFLOAT4X4* pDestination, _In_ FXMMATRIX M); -void XM_CALLCONV XMStoreFloat4x4A(_Out_ XMFLOAT4X4A* pDestination, _In_ FXMMATRIX M); - -/**************************************************************************** - * - * General vector operations - * - ****************************************************************************/ - -XMVECTOR XM_CALLCONV XMVectorZero(); -XMVECTOR XM_CALLCONV XMVectorSet(float x, float y, float z, float w); -XMVECTOR XM_CALLCONV XMVectorSetInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w); -XMVECTOR XM_CALLCONV XMVectorReplicate(float Value); -XMVECTOR XM_CALLCONV XMVectorReplicatePtr(_In_ const float *pValue); -XMVECTOR XM_CALLCONV XMVectorReplicateInt(uint32_t Value); -XMVECTOR XM_CALLCONV XMVectorReplicateIntPtr(_In_ const uint32_t *pValue); -XMVECTOR XM_CALLCONV XMVectorTrueInt(); -XMVECTOR XM_CALLCONV XMVectorFalseInt(); -XMVECTOR XM_CALLCONV XMVectorSplatX(FXMVECTOR V); -XMVECTOR XM_CALLCONV XMVectorSplatY(FXMVECTOR V); -XMVECTOR XM_CALLCONV XMVectorSplatZ(FXMVECTOR V); -XMVECTOR XM_CALLCONV XMVectorSplatW(FXMVECTOR V); -XMVECTOR XM_CALLCONV XMVectorSplatOne(); -XMVECTOR XM_CALLCONV XMVectorSplatInfinity(); -XMVECTOR XM_CALLCONV XMVectorSplatQNaN(); -XMVECTOR XM_CALLCONV XMVectorSplatEpsilon(); -XMVECTOR XM_CALLCONV XMVectorSplatSignMask(); - -float XM_CALLCONV XMVectorGetByIndex(FXMVECTOR V, size_t i); -float XM_CALLCONV 
XMVectorGetX(FXMVECTOR V); -float XM_CALLCONV XMVectorGetY(FXMVECTOR V); -float XM_CALLCONV XMVectorGetZ(FXMVECTOR V); -float XM_CALLCONV XMVectorGetW(FXMVECTOR V); - -void XM_CALLCONV XMVectorGetByIndexPtr(_Out_ float *f, _In_ FXMVECTOR V, _In_ size_t i); -void XM_CALLCONV XMVectorGetXPtr(_Out_ float *x, _In_ FXMVECTOR V); -void XM_CALLCONV XMVectorGetYPtr(_Out_ float *y, _In_ FXMVECTOR V); -void XM_CALLCONV XMVectorGetZPtr(_Out_ float *z, _In_ FXMVECTOR V); -void XM_CALLCONV XMVectorGetWPtr(_Out_ float *w, _In_ FXMVECTOR V); - -uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i); -uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V); -uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V); -uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V); -uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V); - -void XM_CALLCONV XMVectorGetIntByIndexPtr(_Out_ uint32_t *x, _In_ FXMVECTOR V, _In_ size_t i); -void XM_CALLCONV XMVectorGetIntXPtr(_Out_ uint32_t *x, _In_ FXMVECTOR V); -void XM_CALLCONV XMVectorGetIntYPtr(_Out_ uint32_t *y, _In_ FXMVECTOR V); -void XM_CALLCONV XMVectorGetIntZPtr(_Out_ uint32_t *z, _In_ FXMVECTOR V); -void XM_CALLCONV XMVectorGetIntWPtr(_Out_ uint32_t *w, _In_ FXMVECTOR V); - -XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V,float f, size_t i); -XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x); -XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y); -XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z); -XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w); - -XMVECTOR XM_CALLCONV XMVectorSetByIndexPtr(_In_ FXMVECTOR V, _In_ const float *f, _In_ size_t i); -XMVECTOR XM_CALLCONV XMVectorSetXPtr(_In_ FXMVECTOR V, _In_ const float *x); -XMVECTOR XM_CALLCONV XMVectorSetYPtr(_In_ FXMVECTOR V, _In_ const float *y); -XMVECTOR XM_CALLCONV XMVectorSetZPtr(_In_ FXMVECTOR V, _In_ const float *z); -XMVECTOR XM_CALLCONV XMVectorSetWPtr(_In_ FXMVECTOR V, _In_ const float *w); - -XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i); -XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x); -XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y); -XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z); -XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w); - -XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(_In_ FXMVECTOR V, _In_ const uint32_t *x, _In_ size_t i); -XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(_In_ FXMVECTOR V, _In_ const uint32_t *x); -XMVECTOR XM_CALLCONV XMVectorSetIntYPtr(_In_ FXMVECTOR V, _In_ const uint32_t *y); -XMVECTOR XM_CALLCONV XMVectorSetIntZPtr(_In_ FXMVECTOR V, _In_ const uint32_t *z); -XMVECTOR XM_CALLCONV XMVectorSetIntWPtr(_In_ FXMVECTOR V, _In_ const uint32_t *w); - -#if defined(__XNAMATH_H__) && defined(XMVectorSwizzle) -#undef XMVectorSwizzle -#endif - -XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3); -XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW); -XMVECTOR XM_CALLCONV XMVectorSelectControl(uint32_t VectorIndex0, uint32_t VectorIndex1, uint32_t VectorIndex2, uint32_t VectorIndex3); -XMVECTOR XM_CALLCONV XMVectorSelect(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Control); -XMVECTOR XM_CALLCONV XMVectorMergeXY(FXMVECTOR V1, FXMVECTOR V2); -XMVECTOR XM_CALLCONV XMVectorMergeZW(FXMVECTOR V1, FXMVECTOR V2); - -#if defined(__XNAMATH_H__) && defined(XMVectorShiftLeft) -#undef XMVectorShiftLeft -#undef XMVectorRotateLeft -#undef XMVectorRotateRight -#undef 
-#endif
-
-XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements);
-XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements);
-XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements);
-XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, uint32_t VSLeftRotateElements,
-    uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3);
-
-XMVECTOR XM_CALLCONV XMVectorEqual(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorEqualR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorEqualInt(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorEqualIntR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V, _In_ FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorNearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon);
-XMVECTOR XM_CALLCONV XMVectorNotEqual(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorNotEqualInt(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorGreater(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorGreaterR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorGreaterOrEqualR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorLess(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorLessOrEqual(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorInBounds(FXMVECTOR V, FXMVECTOR Bounds);
-XMVECTOR XM_CALLCONV XMVectorInBoundsR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V, _In_ FXMVECTOR Bounds);
-
-XMVECTOR XM_CALLCONV XMVectorIsNaN(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorIsInfinite(FXMVECTOR V);
-
-XMVECTOR XM_CALLCONV XMVectorMin(FXMVECTOR V1,FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorMax(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorRound(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorTruncate(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorFloor(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorCeiling(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorClamp(FXMVECTOR V, FXMVECTOR Min, FXMVECTOR Max);
-XMVECTOR XM_CALLCONV XMVectorSaturate(FXMVECTOR V);
-
-XMVECTOR XM_CALLCONV XMVectorAndInt(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorAndCInt(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorOrInt(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorNorInt(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorXorInt(FXMVECTOR V1, FXMVECTOR V2);
-
-XMVECTOR XM_CALLCONV XMVectorNegate(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorAdd(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorSum(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorAddAngles(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorSubtract(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorSubtractAngles(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorMultiply(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorMultiplyAdd(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3);
-XMVECTOR XM_CALLCONV XMVectorDivide(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3);
-XMVECTOR XM_CALLCONV XMVectorScale(FXMVECTOR V, float ScaleFactor);
-XMVECTOR XM_CALLCONV XMVectorReciprocalEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorReciprocal(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorSqrtEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorSqrt(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorExp2(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorExpE(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorExp(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorLog2(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorLogE(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorLog(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorPow(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorAbs(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorMod(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVectorModAngles(FXMVECTOR Angles);
-XMVECTOR XM_CALLCONV XMVectorSin(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorSinEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorCos(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorCosEst(FXMVECTOR V);
-void XM_CALLCONV XMVectorSinCos(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, _In_ FXMVECTOR V);
-void XM_CALLCONV XMVectorSinCosEst(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, _In_ FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorTan(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorTanEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorSinH(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorCosH(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorTanH(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorASin(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorASinEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorACos(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorACosEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorATan(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorATanEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVectorATan2(FXMVECTOR Y, FXMVECTOR X);
-XMVECTOR XM_CALLCONV XMVectorATan2Est(FXMVECTOR Y, FXMVECTOR X);
-XMVECTOR XM_CALLCONV XMVectorLerp(FXMVECTOR V0, FXMVECTOR V1, float t);
-XMVECTOR XM_CALLCONV XMVectorLerpV(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR T);
-XMVECTOR XM_CALLCONV XMVectorHermite(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, GXMVECTOR Tangent1, float t);
-XMVECTOR XM_CALLCONV XMVectorHermiteV(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, GXMVECTOR Tangent1, HXMVECTOR T);
-XMVECTOR XM_CALLCONV XMVectorCatmullRom(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, float t);
-XMVECTOR XM_CALLCONV XMVectorCatmullRomV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, HXMVECTOR T);
-XMVECTOR XM_CALLCONV XMVectorBaryCentric(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, float f, float g);
-XMVECTOR XM_CALLCONV XMVectorBaryCentricV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR F, HXMVECTOR G);
-
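The R-suffixed comparisons above return a per-component mask and additionally record summary flags through pCR; the XMComparison* helpers (declared later in this diff) decode them. A hedged sketch of the intended pattern (not part of the patch; v1/v2 are hypothetical):

    uint32_t cr = 0;
    XMVECTOR mask = XMVectorGreaterR(&cr, v1, v2); // lane masks plus CR6 summary bits

    if (XMComparisonAllTrue(cr))
    {
        // every component of v1 compared greater than the matching component of v2
    }
    else if (XMComparisonAnyTrue(cr))
    {
        // at least one component did; consult 'mask' per lane
    }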
-/****************************************************************************
- *
- * 2D vector operations
- *
- ****************************************************************************/
-
-bool XM_CALLCONV XMVector2Equal(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector2EqualR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector2EqualInt(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector2EqualIntR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector2NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon);
-bool XM_CALLCONV XMVector2NotEqual(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector2NotEqualInt(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector2Greater(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector2GreaterR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector2GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector2GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector2Less(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector2LessOrEqual(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector2InBounds(FXMVECTOR V, FXMVECTOR Bounds);
-
-bool XM_CALLCONV XMVector2IsNaN(FXMVECTOR V);
-bool XM_CALLCONV XMVector2IsInfinite(FXMVECTOR V);
-
-XMVECTOR XM_CALLCONV XMVector2Dot(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVector2Cross(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVector2LengthSq(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector2ReciprocalLength(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector2LengthEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector2Length(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector2NormalizeEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector2Normalize(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector2ClampLength(FXMVECTOR V, float LengthMin, float LengthMax);
-XMVECTOR XM_CALLCONV XMVector2ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax);
-XMVECTOR XM_CALLCONV XMVector2Reflect(FXMVECTOR Incident, FXMVECTOR Normal);
-XMVECTOR XM_CALLCONV XMVector2Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex);
-XMVECTOR XM_CALLCONV XMVector2RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex);
-XMVECTOR XM_CALLCONV XMVector2Orthogonal(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2);
-XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2);
-XMVECTOR XM_CALLCONV XMVector2AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVector2LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point);
-XMVECTOR XM_CALLCONV XMVector2IntersectLine(FXMVECTOR Line1Point1, FXMVECTOR Line1Point2, FXMVECTOR Line2Point1, GXMVECTOR Line2Point2);
-XMVECTOR XM_CALLCONV XMVector2Transform(FXMVECTOR V, FXMMATRIX M);
-XMFLOAT4* XM_CALLCONV XMVector2TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) const XMFLOAT2* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M);
-XMVECTOR XM_CALLCONV XMVector2TransformCoord(FXMVECTOR V, FXMMATRIX M);
-XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream(_Out_writes_bytes_(sizeof(XMFLOAT2)+OutputStride*(VectorCount-1)) XMFLOAT2* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) const XMFLOAT2* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M);
-XMVECTOR XM_CALLCONV XMVector2TransformNormal(FXMVECTOR V, FXMMATRIX M);
-XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream(_Out_writes_bytes_(sizeof(XMFLOAT2)+OutputStride*(VectorCount-1)) XMFLOAT2* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) const XMFLOAT2* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M);
-
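The *Stream variants above walk arrays whose element spacing is given in bytes, so interleaved vertex data can be processed in place. A minimal sketch with tightly packed, hypothetical buffers (illustrative only):

    XMFLOAT2 input[16] = {};   // hypothetical source positions
    XMFLOAT4 output[16];
    XMMATRIX m = XMMatrixIdentity();

    // Stride = sizeof(element) means "tightly packed"; a larger stride would
    // step over interleaved attributes between consecutive vectors.
    XMVector2TransformStream(output, sizeof(XMFLOAT4),
                             input, sizeof(XMFLOAT2), 16, m);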
-/****************************************************************************
- *
- * 3D vector operations
- *
- ****************************************************************************/
-
-bool XM_CALLCONV XMVector3Equal(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector3EqualR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector3EqualInt(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector3EqualIntR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector3NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon);
-bool XM_CALLCONV XMVector3NotEqual(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector3NotEqualInt(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector3Greater(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector3GreaterR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector3GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector3GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector3Less(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector3LessOrEqual(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector3InBounds(FXMVECTOR V, FXMVECTOR Bounds);
-
-bool XM_CALLCONV XMVector3IsNaN(FXMVECTOR V);
-bool XM_CALLCONV XMVector3IsInfinite(FXMVECTOR V);
-
-XMVECTOR XM_CALLCONV XMVector3Dot(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVector3Cross(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVector3LengthSq(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector3ReciprocalLength(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector3LengthEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector3Length(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector3NormalizeEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector3Normalize(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector3ClampLength(FXMVECTOR V, float LengthMin, float LengthMax);
-XMVECTOR XM_CALLCONV XMVector3ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax);
-XMVECTOR XM_CALLCONV XMVector3Reflect(FXMVECTOR Incident, FXMVECTOR Normal);
-XMVECTOR XM_CALLCONV XMVector3Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex);
-XMVECTOR XM_CALLCONV XMVector3RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex);
-XMVECTOR XM_CALLCONV XMVector3Orthogonal(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2);
-XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2);
-XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVector3LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point);
-void XM_CALLCONV XMVector3ComponentsFromNormal(_Out_ XMVECTOR* pParallel, _Out_ XMVECTOR* pPerpendicular, _In_ FXMVECTOR V, _In_ FXMVECTOR Normal);
-XMVECTOR XM_CALLCONV XMVector3Rotate(FXMVECTOR V, FXMVECTOR RotationQuaternion);
-XMVECTOR XM_CALLCONV XMVector3InverseRotate(FXMVECTOR V, FXMVECTOR RotationQuaternion);
-XMVECTOR XM_CALLCONV XMVector3Transform(FXMVECTOR V, FXMMATRIX M);
-XMFLOAT4* XM_CALLCONV XMVector3TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M);
-XMVECTOR XM_CALLCONV XMVector3TransformCoord(FXMVECTOR V, FXMMATRIX M);
-XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M);
-XMVECTOR XM_CALLCONV XMVector3TransformNormal(FXMVECTOR V, FXMMATRIX M);
-XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M);
-XMVECTOR XM_CALLCONV XMVector3Project(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float ViewportHeight, float ViewportMinZ, float ViewportMaxZ,
-    FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World);
-XMFLOAT3* XM_CALLCONV XMVector3ProjectStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount,
-    _In_ float ViewportX, _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight, _In_ float ViewportMinZ, _In_ float ViewportMaxZ,
-    _In_ FXMMATRIX Projection, _In_ CXMMATRIX View, _In_ CXMMATRIX World);
-XMVECTOR XM_CALLCONV XMVector3Unproject(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float ViewportHeight, float ViewportMinZ, float ViewportMaxZ,
-    FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World);
-XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount,
-    _In_ float ViewportX, _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight, _In_ float ViewportMinZ, _In_ float ViewportMaxZ,
-    _In_ FXMMATRIX Projection, _In_ CXMMATRIX View, _In_ CXMMATRIX World);
-
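The three 3D transform flavors above differ only in how the missing w component is treated; a short illustrative contrast (not part of the patch; matrix m is hypothetical):

    XMVECTOR p = XMVectorSet(1.0f, 2.0f, 3.0f, 0.0f);

    XMVECTOR raw    = XMVector3Transform(p, m);       // w treated as 1; raw 4-vector result
    XMVECTOR coord  = XMVector3TransformCoord(p, m);  // w treated as 1; result divided by w
    XMVECTOR normal = XMVector3TransformNormal(p, m); // w treated as 0; rotation/scale only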
-/****************************************************************************
- *
- * 4D vector operations
- *
- ****************************************************************************/
-
-bool XM_CALLCONV XMVector4Equal(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector4EqualR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector4EqualInt(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector4EqualIntR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector4NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon);
-bool XM_CALLCONV XMVector4NotEqual(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector4NotEqualInt(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector4Greater(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector4GreaterR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector4GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2);
-uint32_t XM_CALLCONV XMVector4GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector4Less(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector4LessOrEqual(FXMVECTOR V1, FXMVECTOR V2);
-bool XM_CALLCONV XMVector4InBounds(FXMVECTOR V, FXMVECTOR Bounds);
-
-bool XM_CALLCONV XMVector4IsNaN(FXMVECTOR V);
-bool XM_CALLCONV XMVector4IsInfinite(FXMVECTOR V);
-
-XMVECTOR XM_CALLCONV XMVector4Dot(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVector4Cross(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3);
-XMVECTOR XM_CALLCONV XMVector4LengthSq(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector4ReciprocalLength(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector4LengthEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector4Length(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector4NormalizeEst(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector4Normalize(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector4ClampLength(FXMVECTOR V, float LengthMin, float LengthMax);
-XMVECTOR XM_CALLCONV XMVector4ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax);
-XMVECTOR XM_CALLCONV XMVector4Reflect(FXMVECTOR Incident, FXMVECTOR Normal);
-XMVECTOR XM_CALLCONV XMVector4Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex);
-XMVECTOR XM_CALLCONV XMVector4RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex);
-XMVECTOR XM_CALLCONV XMVector4Orthogonal(FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2);
-XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2);
-XMVECTOR XM_CALLCONV XMVector4AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2);
-XMVECTOR XM_CALLCONV XMVector4Transform(FXMVECTOR V, FXMMATRIX M);
-XMFLOAT4* XM_CALLCONV XMVector4TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT4)+InputStride*(VectorCount-1)) const XMFLOAT4* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M);
-
-/****************************************************************************
- *
- * Matrix operations
- *
- ****************************************************************************/
-
-bool XM_CALLCONV XMMatrixIsNaN(FXMMATRIX M);
-bool XM_CALLCONV XMMatrixIsInfinite(FXMMATRIX M);
-bool XM_CALLCONV XMMatrixIsIdentity(FXMMATRIX M);
-
-XMMATRIX XM_CALLCONV XMMatrixMultiply(FXMMATRIX M1, CXMMATRIX M2);
-XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose(FXMMATRIX M1, CXMMATRIX M2);
-XMMATRIX XM_CALLCONV XMMatrixTranspose(FXMMATRIX M);
-XMMATRIX XM_CALLCONV XMMatrixInverse(_Out_opt_ XMVECTOR* pDeterminant, _In_ FXMMATRIX M);
-XMVECTOR XM_CALLCONV XMMatrixDeterminant(FXMMATRIX M);
-_Success_(return)
-bool XM_CALLCONV XMMatrixDecompose(_Out_ XMVECTOR *outScale, _Out_ XMVECTOR *outRotQuat, _Out_ XMVECTOR *outTrans, _In_ FXMMATRIX M);
-
-XMMATRIX XM_CALLCONV XMMatrixIdentity();
-XMMATRIX XM_CALLCONV XMMatrixSet(float m00, float m01, float m02, float m03,
-    float m10, float m11, float m12, float m13,
-    float m20, float m21, float m22, float m23,
-    float m30, float m31, float m32, float m33);
-XMMATRIX XM_CALLCONV XMMatrixTranslation(float OffsetX, float OffsetY, float OffsetZ);
-XMMATRIX XM_CALLCONV XMMatrixTranslationFromVector(FXMVECTOR Offset);
-XMMATRIX XM_CALLCONV XMMatrixScaling(float ScaleX, float ScaleY, float ScaleZ);
-XMMATRIX XM_CALLCONV XMMatrixScalingFromVector(FXMVECTOR Scale);
-XMMATRIX XM_CALLCONV XMMatrixRotationX(float Angle);
-XMMATRIX XM_CALLCONV XMMatrixRotationY(float Angle);
-XMMATRIX XM_CALLCONV XMMatrixRotationZ(float Angle);
-XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw(float Pitch, float Yaw, float Roll);
-XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYawFromVector(FXMVECTOR Angles);
-XMMATRIX XM_CALLCONV XMMatrixRotationNormal(FXMVECTOR NormalAxis, float Angle);
-XMMATRIX XM_CALLCONV XMMatrixRotationAxis(FXMVECTOR Axis, float Angle);
-XMMATRIX XM_CALLCONV XMMatrixRotationQuaternion(FXMVECTOR Quaternion);
-XMMATRIX XM_CALLCONV XMMatrixTransformation2D(FXMVECTOR ScalingOrigin, float ScalingOrientation, FXMVECTOR Scaling,
-    FXMVECTOR RotationOrigin, float Rotation, GXMVECTOR Translation);
-XMMATRIX XM_CALLCONV XMMatrixTransformation(FXMVECTOR ScalingOrigin, FXMVECTOR ScalingOrientationQuaternion, FXMVECTOR Scaling,
-    GXMVECTOR RotationOrigin, HXMVECTOR RotationQuaternion, HXMVECTOR Translation);
-XMMATRIX XM_CALLCONV XMMatrixAffineTransformation2D(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, float Rotation, FXMVECTOR Translation);
-XMMATRIX XM_CALLCONV XMMatrixAffineTransformation(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, FXMVECTOR RotationQuaternion, GXMVECTOR Translation);
-XMMATRIX XM_CALLCONV XMMatrixReflect(FXMVECTOR ReflectionPlane);
-XMMATRIX XM_CALLCONV XMMatrixShadow(FXMVECTOR ShadowPlane, FXMVECTOR LightPosition);
-
-XMMATRIX XM_CALLCONV XMMatrixLookAtLH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection);
-XMMATRIX XM_CALLCONV XMMatrixLookAtRH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection);
-XMMATRIX XM_CALLCONV XMMatrixLookToLH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection);
-XMMATRIX XM_CALLCONV XMMatrixLookToRH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection);
-XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ);
-XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ);
-XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH(float FovAngleY, float AspectRatio, float NearZ, float FarZ);
-XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH(float FovAngleY, float AspectRatio, float NearZ, float FarZ);
-XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ);
-XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ);
-XMMATRIX XM_CALLCONV XMMatrixOrthographicLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ);
-XMMATRIX XM_CALLCONV XMMatrixOrthographicRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ);
-XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ);
-XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ);
-
-
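A hedged sketch of composing a scale/rotate/translate matrix from the builders above and recovering its parts with XMMatrixDecompose (illustrative only; the values are hypothetical):

    XMMATRIX world = XMMatrixMultiply(
        XMMatrixScaling(2.0f, 2.0f, 2.0f),
        XMMatrixMultiply(XMMatrixRotationY(XM_PIDIV2),
                         XMMatrixTranslation(10.0f, 0.0f, 0.0f)));

    XMVECTOR scale, rotQuat, translation;
    if (XMMatrixDecompose(&scale, &rotQuat, &translation, world))
    {
        // scale ~ (2,2,2), rotQuat ~ quarter-turn about Y, translation ~ (10,0,0)
    }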
-/****************************************************************************
- *
- * Quaternion operations
- *
- ****************************************************************************/
-
-bool XM_CALLCONV XMQuaternionEqual(FXMVECTOR Q1, FXMVECTOR Q2);
-bool XM_CALLCONV XMQuaternionNotEqual(FXMVECTOR Q1, FXMVECTOR Q2);
-
-bool XM_CALLCONV XMQuaternionIsNaN(FXMVECTOR Q);
-bool XM_CALLCONV XMQuaternionIsInfinite(FXMVECTOR Q);
-bool XM_CALLCONV XMQuaternionIsIdentity(FXMVECTOR Q);
-
-XMVECTOR XM_CALLCONV XMQuaternionDot(FXMVECTOR Q1, FXMVECTOR Q2);
-XMVECTOR XM_CALLCONV XMQuaternionMultiply(FXMVECTOR Q1, FXMVECTOR Q2);
-XMVECTOR XM_CALLCONV XMQuaternionLengthSq(FXMVECTOR Q);
-XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength(FXMVECTOR Q);
-XMVECTOR XM_CALLCONV XMQuaternionLength(FXMVECTOR Q);
-XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst(FXMVECTOR Q);
-XMVECTOR XM_CALLCONV XMQuaternionNormalize(FXMVECTOR Q);
-XMVECTOR XM_CALLCONV XMQuaternionConjugate(FXMVECTOR Q);
-XMVECTOR XM_CALLCONV XMQuaternionInverse(FXMVECTOR Q);
-XMVECTOR XM_CALLCONV XMQuaternionLn(FXMVECTOR Q);
-XMVECTOR XM_CALLCONV XMQuaternionExp(FXMVECTOR Q);
-XMVECTOR XM_CALLCONV XMQuaternionSlerp(FXMVECTOR Q0, FXMVECTOR Q1, float t);
-XMVECTOR XM_CALLCONV XMQuaternionSlerpV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR T);
-XMVECTOR XM_CALLCONV XMQuaternionSquad(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, float t);
-XMVECTOR XM_CALLCONV XMQuaternionSquadV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, HXMVECTOR T);
-void XM_CALLCONV XMQuaternionSquadSetup(_Out_ XMVECTOR* pA, _Out_ XMVECTOR* pB, _Out_ XMVECTOR* pC, _In_ FXMVECTOR Q0, _In_ FXMVECTOR Q1, _In_ FXMVECTOR Q2, _In_ GXMVECTOR Q3);
-XMVECTOR XM_CALLCONV XMQuaternionBaryCentric(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, float f, float g);
-XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR F, HXMVECTOR G);
-
-XMVECTOR XM_CALLCONV XMQuaternionIdentity();
-XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYaw(float Pitch, float Yaw, float Roll);
-XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYawFromVector(FXMVECTOR Angles);
-XMVECTOR XM_CALLCONV XMQuaternionRotationNormal(FXMVECTOR NormalAxis, float Angle);
-XMVECTOR XM_CALLCONV XMQuaternionRotationAxis(FXMVECTOR Axis, float Angle);
-XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix(FXMMATRIX M);
-
-void XM_CALLCONV XMQuaternionToAxisAngle(_Out_ XMVECTOR* pAxis, _Out_ float* pAngle, _In_ FXMVECTOR Q);
-
-/****************************************************************************
- *
- * Plane operations
- *
- ****************************************************************************/
-
-bool XM_CALLCONV XMPlaneEqual(FXMVECTOR P1, FXMVECTOR P2);
-bool XM_CALLCONV XMPlaneNearEqual(FXMVECTOR P1, FXMVECTOR P2, FXMVECTOR Epsilon);
-bool XM_CALLCONV XMPlaneNotEqual(FXMVECTOR P1, FXMVECTOR P2);
-
-bool XM_CALLCONV XMPlaneIsNaN(FXMVECTOR P);
-bool XM_CALLCONV XMPlaneIsInfinite(FXMVECTOR P);
-
-XMVECTOR XM_CALLCONV XMPlaneDot(FXMVECTOR P, FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMPlaneDotCoord(FXMVECTOR P, FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMPlaneDotNormal(FXMVECTOR P, FXMVECTOR V);
-XMVECTOR XM_CALLCONV XMPlaneNormalizeEst(FXMVECTOR P);
-XMVECTOR XM_CALLCONV XMPlaneNormalize(FXMVECTOR P);
-XMVECTOR XM_CALLCONV XMPlaneIntersectLine(FXMVECTOR P, FXMVECTOR LinePoint1, FXMVECTOR LinePoint2);
-void XM_CALLCONV XMPlaneIntersectPlane(_Out_ XMVECTOR* pLinePoint1, _Out_ XMVECTOR* pLinePoint2, _In_ FXMVECTOR P1, _In_ FXMVECTOR P2);
-XMVECTOR XM_CALLCONV XMPlaneTransform(FXMVECTOR P, FXMMATRIX M);
-XMFLOAT4* XM_CALLCONV XMPlaneTransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(PlaneCount-1)) XMFLOAT4* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT4)+InputStride*(PlaneCount-1)) const XMFLOAT4* pInputStream,
-    _In_ size_t InputStride, _In_ size_t PlaneCount, _In_ FXMMATRIX M);
-
-XMVECTOR XM_CALLCONV XMPlaneFromPointNormal(FXMVECTOR Point, FXMVECTOR Normal);
-XMVECTOR XM_CALLCONV XMPlaneFromPoints(FXMVECTOR Point1, FXMVECTOR Point2, FXMVECTOR Point3);
-
-/****************************************************************************
- *
- * Color operations
- *
- ****************************************************************************/
-
-bool XM_CALLCONV XMColorEqual(FXMVECTOR C1, FXMVECTOR C2);
-bool XM_CALLCONV XMColorNotEqual(FXMVECTOR C1, FXMVECTOR C2);
-bool XM_CALLCONV XMColorGreater(FXMVECTOR C1, FXMVECTOR C2);
-bool XM_CALLCONV XMColorGreaterOrEqual(FXMVECTOR C1, FXMVECTOR C2);
-bool XM_CALLCONV XMColorLess(FXMVECTOR C1, FXMVECTOR C2);
-bool XM_CALLCONV XMColorLessOrEqual(FXMVECTOR C1, FXMVECTOR C2);
-
-bool XM_CALLCONV XMColorIsNaN(FXMVECTOR C);
-bool XM_CALLCONV XMColorIsInfinite(FXMVECTOR C);
-
-XMVECTOR XM_CALLCONV XMColorNegative(FXMVECTOR C);
-XMVECTOR XM_CALLCONV XMColorModulate(FXMVECTOR C1, FXMVECTOR C2);
-XMVECTOR XM_CALLCONV XMColorAdjustSaturation(FXMVECTOR C, float Saturation);
-XMVECTOR XM_CALLCONV XMColorAdjustContrast(FXMVECTOR C, float Contrast);
-
-XMVECTOR XM_CALLCONV XMColorRGBToHSL( FXMVECTOR rgb );
-XMVECTOR XM_CALLCONV XMColorHSLToRGB( FXMVECTOR hsl );
-
-XMVECTOR XM_CALLCONV XMColorRGBToHSV( FXMVECTOR rgb );
-XMVECTOR XM_CALLCONV XMColorHSVToRGB( FXMVECTOR hsv );
-
-XMVECTOR XM_CALLCONV XMColorRGBToYUV( FXMVECTOR rgb );
-XMVECTOR XM_CALLCONV XMColorYUVToRGB( FXMVECTOR yuv );
-
-XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD( FXMVECTOR rgb );
-XMVECTOR XM_CALLCONV XMColorYUVToRGB_HD( FXMVECTOR yuv );
-
-XMVECTOR XM_CALLCONV XMColorRGBToXYZ( FXMVECTOR rgb );
-XMVECTOR XM_CALLCONV XMColorXYZToRGB( FXMVECTOR xyz );
-
-XMVECTOR XM_CALLCONV XMColorXYZToSRGB( FXMVECTOR xyz );
-XMVECTOR XM_CALLCONV XMColorSRGBToXYZ( FXMVECTOR srgb );
-
-XMVECTOR XM_CALLCONV XMColorRGBToSRGB( FXMVECTOR rgb );
-XMVECTOR XM_CALLCONV XMColorSRGBToRGB( FXMVECTOR srgb );
-
-
-/****************************************************************************
- *
- * Miscellaneous operations
- *
- ****************************************************************************/
-
-bool XMVerifyCPUSupport();
-
-XMVECTOR XM_CALLCONV XMFresnelTerm(FXMVECTOR CosIncidentAngle, FXMVECTOR RefractionIndex);
-
-bool XMScalarNearEqual(float S1, float S2, float Epsilon);
-float XMScalarModAngle(float Value);
-
-float XMScalarSin(float Value);
-float XMScalarSinEst(float Value);
-
-float XMScalarCos(float Value);
-float XMScalarCosEst(float Value);
-
-void XMScalarSinCos(_Out_ float* pSin, _Out_ float* pCos, float Value);
-void XMScalarSinCosEst(_Out_ float* pSin, _Out_ float* pCos, float Value);
-
-float XMScalarASin(float Value);
-float XMScalarASinEst(float Value);
-
-float XMScalarACos(float Value);
-float XMScalarACosEst(float Value);
-
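The scalar helpers above mirror their vector counterparts; the Est forms trade accuracy for speed. Illustrative use (not part of the patch):

    float s, c;
    XMScalarSinCos(&s, &c, XM_PIDIV4);          // s ~ 0.7071f, c ~ 0.7071f

    float a = XMScalarModAngle(XM_2PI + 1.0f);  // wraps into -XM_PI..XM_PI, ~1.0f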
-/****************************************************************************
- *
- * Templates
- *
- ****************************************************************************/
-
-#if defined(__XNAMATH_H__) && defined(XMMin)
-#undef XMMin
-#undef XMMax
-#endif
-
-template<class T> inline T XMMin(T a, T b) { return (a < b) ? a : b; }
-template<class T> inline T XMMax(T a, T b) { return (a > b) ? a : b; }
-
-//------------------------------------------------------------------------------
-
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-
-// PermuteHelper internal template (SSE only)
-namespace Internal
-{
-    // Slow path fallback for permutes that do not map to a single SSE shuffle opcode.
-    template<uint32_t Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW> struct PermuteHelper
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2)
-        {
-            static const XMVECTORU32 selectMask =
-            {
-                WhichX ? 0xFFFFFFFF : 0,
-                WhichY ? 0xFFFFFFFF : 0,
-                WhichZ ? 0xFFFFFFFF : 0,
-                WhichW ? 0xFFFFFFFF : 0,
-            };
-
-            XMVECTOR shuffled1 = XM_PERMUTE_PS(v1, Shuffle);
-            XMVECTOR shuffled2 = XM_PERMUTE_PS(v2, Shuffle);
-
-            XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1);
-            XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2);
-
-            return _mm_or_ps(masked1, masked2);
-        }
-    };
-
-    // Fast path for permutes that only read from the first vector.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, false, false>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return XM_PERMUTE_PS(v1, Shuffle); }
-    };
-
-    // Fast path for permutes that only read from the second vector.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, true, true>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2){ (v1); return XM_PERMUTE_PS(v2, Shuffle); }
-    };
-
-    // Fast path for permutes that read XY from the first vector, ZW from the second.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, true, true>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v1, v2, Shuffle); }
-    };
-
-    // Fast path for permutes that read XY from the second vector, ZW from the first.
-    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, false, false>
-    {
-        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v2, v1, Shuffle); }
-    };
-};
-
-#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
-
-// General permute template
-template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW>
-    inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2)
-{
-    static_assert(PermuteX <= 7, "PermuteX template parameter out of range");
-    static_assert(PermuteY <= 7, "PermuteY template parameter out of range");
-    static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range");
-    static_assert(PermuteW <= 7, "PermuteW template parameter out of range");
-
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    const uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3);
-
-    const bool WhichX = PermuteX > 3;
-    const bool WhichY = PermuteY > 3;
-    const bool WhichZ = PermuteZ > 3;
-    const bool WhichW = PermuteW > 3;
-
-    return Internal::PermuteHelper<Shuffle, WhichX, WhichY, WhichZ, WhichW>::Permute(V1, V2);
-#else
-
-    return XMVectorPermute( V1, V2, PermuteX, PermuteY, PermuteZ, PermuteW );
-
-#endif
-}
-
-// Special-case permute templates
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
-
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_movelh_ps(V1,V2); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<6,7,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_movehl_ps(V1,V2); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,4,1,5>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_unpacklo_ps(V1,V2); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,6,3,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_unpackhi_ps(V1,V2); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(V1), _mm_castps_pd(V2))); }
-#endif
-
-#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); }
-#endif
-
-#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-
-// If the indices are all in the range 0-3 or 4-7, then use XMVectorSwizzle instead
-// The mirror cases are not spelled out here as the programmer can always swap the arguments
-// (i.e. prefer permutes where the X element comes from the V1 vector instead of the V2 vector)
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vget_low_f32(V2) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,0,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vget_low_f32(V2) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vrev64_f32( vget_low_f32(V2) ) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,0,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vrev64_f32( vget_low_f32(V2) ) ); }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_high_f32(V1), vget_high_f32(V2) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,2,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vget_high_f32(V2) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_high_f32(V1), vrev64_f32( vget_high_f32(V2) ) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,2,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vrev64_f32( vget_high_f32(V2) ) ); }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vget_high_f32(V2) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,0,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vget_high_f32(V2) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vrev64_f32( vget_high_f32(V2) ) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,0,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vrev64_f32( vget_high_f32(V2) ) ); }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,2,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vget_low_f32(V2) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_high_f32(V1), vrev64_f32( vget_low_f32(V2) ) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,2,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vrev64_f32( vget_low_f32(V2) ) ); }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,4,2,6>(FXMVECTOR V1, FXMVECTOR V2) { return vtrnq_f32(V1,V2).val[0]; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,5,3,7>(FXMVECTOR V1, FXMVECTOR V2) { return vtrnq_f32(V1,V2).val[1]; }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,4,1,5>(FXMVECTOR V1, FXMVECTOR V2) { return vzipq_f32(V1,V2).val[0]; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,6,3,7>(FXMVECTOR V1, FXMVECTOR V2) { return vzipq_f32(V1,V2).val[1]; }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,2,4,6>(FXMVECTOR V1, FXMVECTOR V2) { return vuzpq_f32(V1,V2).val[0]; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,3,5,7>(FXMVECTOR V1, FXMVECTOR V2) { return vuzpq_f32(V1,V2).val[1]; }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,2,3,4>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 1); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 2); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,4,5,6>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 3); }
-
-#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
-
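Unlike the runtime overload, the template form fixes the indices at compile time, so the specializations above can lower a well-known pattern to a single shuffle, unpack, or blend instruction. Illustrative (not part of the patch; v1/v2 are hypothetical):

    XMVECTOR lo = XMVectorPermute<0, 1, 4, 5>(v1, v2); // -> _mm_movelh_ps on SSE
    XMVECTOR hi = XMVectorPermute<2, 6, 3, 7>(v1, v2); // -> _mm_unpackhi_ps on SSE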
-//------------------------------------------------------------------------------
-
-// General swizzle template
-template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW>
-    inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V)
-{
-    static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
-    static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
-    static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
-    static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
-
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    return XM_PERMUTE_PS( V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) );
-#else
-
-    return XMVectorSwizzle( V, SwizzleX, SwizzleY, SwizzleZ, SwizzleW );
-
-#endif
-}
-
-// Specialized swizzles
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; }
-
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,0,1>(FXMVECTOR V) { return _mm_movelh_ps(V,V); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,3,2,3>(FXMVECTOR V) { return _mm_movehl_ps(V,V); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,1,1>(FXMVECTOR V) { return _mm_unpacklo_ps(V,V); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,2,3,3>(FXMVECTOR V) { return _mm_unpackhi_ps(V,V); }
-#endif
-
-#if defined(_XM_SSE3_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); }
-#endif
-
-#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return vdupq_lane_f32( vget_low_f32(V), 0); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,1,1>(FXMVECTOR V) { return vdupq_lane_f32( vget_low_f32(V), 1); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,2,2,2>(FXMVECTOR V) { return vdupq_lane_f32( vget_high_f32(V), 0); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,3,3,3>(FXMVECTOR V) { return vdupq_lane_f32( vget_high_f32(V), 1); }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,0,3,2>(FXMVECTOR V) { return vrev64q_f32(V); }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,0,1>(FXMVECTOR V) { float32x2_t vt = vget_low_f32(V); return vcombine_f32( vt, vt ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,3,2,3>(FXMVECTOR V) { float32x2_t vt = vget_high_f32(V); return vcombine_f32( vt, vt ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,0,1,0>(FXMVECTOR V) { float32x2_t vt = vrev64_f32( vget_low_f32(V) ); return vcombine_f32( vt, vt ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,2,3,2>(FXMVECTOR V) { float32x2_t vt = vrev64_f32( vget_high_f32(V) ); return vcombine_f32( vt, vt ); }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,3,2>(FXMVECTOR V) { return vcombine_f32( vget_low_f32(V), vrev64_f32( vget_high_f32(V) ) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,0,2,3>(FXMVECTOR V) { return vcombine_f32( vrev64_f32( vget_low_f32(V) ), vget_high_f32(V) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,3,1,0>(FXMVECTOR V) { return vcombine_f32( vget_high_f32(V), vrev64_f32( vget_low_f32(V) ) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,2,0,1>(FXMVECTOR V) { return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vget_low_f32(V) ); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,2,1,0>(FXMVECTOR V) { return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vrev64_f32( vget_low_f32(V) ) ); }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return vtrnq_f32(V,V).val[0]; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return vtrnq_f32(V,V).val[1]; }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,1,1>(FXMVECTOR V) { return vzipq_f32(V,V).val[0]; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,2,3,3>(FXMVECTOR V) { return vzipq_f32(V,V).val[1]; }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,2,0,2>(FXMVECTOR V) { return vuzpq_f32(V,V).val[0]; }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,3,1,3>(FXMVECTOR V) { return vuzpq_f32(V,V).val[1]; }
-
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,2,3,0>(FXMVECTOR V) { return vextq_f32(V, V, 1); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,3,0,1>(FXMVECTOR V) { return vextq_f32(V, V, 2); }
-template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,0,1,2>(FXMVECTOR V) { return vextq_f32(V, V, 3); }
-
-#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
-
-//------------------------------------------------------------------------------
-
-template<uint32_t Elements>
-    inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2)
-{
-    static_assert( Elements < 4, "Elements template parameter out of range" );
-    return XMVectorPermute<Elements, (Elements + 1), (Elements + 2), (Elements + 3)>(V1, V2);
-}
-
-template<uint32_t Elements>
-    inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V)
-{
-    static_assert( Elements < 4, "Elements template parameter out of range" );
-    return XMVectorSwizzle<Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3>(V);
-}
-
-template<uint32_t Elements>
-    inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V)
-{
-    static_assert( Elements < 4, "Elements template parameter out of range" );
-    return XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V);
-}
-
-template<uint32_t VSLeftRotateElements, uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3>
-    inline XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS)
-{
-    XMVECTOR Control = XMVectorSelectControl(Select0&1, Select1&1, Select2&1, Select3&1);
-    return XMVectorSelect( VD, XMVectorRotateLeft<VSLeftRotateElements & 3>(VS), Control );
-}
-
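XMVectorInsert rotates VS left by VSLeftRotateElements, then keeps VD in the lanes whose Select flag is 0 and takes the rotated VS where it is 1. Illustrative (not part of the patch; v1/v2 are hypothetical):

    // Replace the y and w lanes of v1 with the y and w lanes of v2:
    XMVECTOR r = XMVectorInsert<0, 0, 1, 0, 1>(v1, v2); // {v1.x, v2.y, v1.z, v2.w}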
-/****************************************************************************
- *
- * Globals
- *
- ****************************************************************************/
-
-// The purpose of the following global constants is to prevent redundant
-// reloading of the constants when they are referenced by more than one
-// separate inline math routine called within the same function. Declaring
-// a constant locally within a routine is sufficient to prevent redundant
-// reloads of that constant when that single routine is called multiple
-// times in a function, but if the constant is used (and declared) in a
-// separate math routine it would be reloaded.
-
-#ifndef XMGLOBALCONST
-#define XMGLOBALCONST extern const __declspec(selectany)
-#endif
-
-XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients0 = {-0.16666667f, +0.0083333310f, -0.00019840874f, +2.7525562e-06f};
-XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients1 = {-2.3889859e-08f, -0.16665852f /*Est1*/, +0.0083139502f /*Est2*/, -0.00018524670f /*Est3*/};
-XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients0 = {-0.5f, +0.041666638f, -0.0013888378f, +2.4760495e-05f};
-XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients1 = {-2.6051615e-07f, -0.49992746f /*Est1*/, +0.041493919f /*Est2*/, -0.0012712436f /*Est3*/};
-XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients0 = {1.0f, 0.333333333f, 0.133333333f, 5.396825397e-2f};
-XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients1 = {2.186948854e-2f, 8.863235530e-3f, 3.592128167e-3f, 1.455834485e-3f};
-XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients2 = {5.900274264e-4f, 2.391290764e-4f, 9.691537707e-5f, 3.927832950e-5f};
-XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients0 = {+1.5707963050f, -0.2145988016f, +0.0889789874f, -0.0501743046f};
-XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients1 = {+0.0308918810f, -0.0170881256f, +0.0066700901f, -0.0012624911f};
-XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients0 = {-0.3333314528f, +0.1999355085f, -0.1420889944f, +0.1065626393f};
-XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients1 = {-0.0752896400f, +0.0429096138f, -0.0161657367f, +0.0028662257f};
-XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients0 = {+0.999866f, +0.999866f, +0.999866f, +0.999866f};
-XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients1 = {-0.3302995f, +0.180141f, -0.085133f, +0.0208351f};
-XMGLOBALCONST XMVECTORF32 g_XMTanEstCoefficients = {2.484f, -1.954923183e-1f, 2.467401101f, XM_1DIVPI};
-XMGLOBALCONST XMVECTORF32 g_XMArcEstCoefficients = {+1.5707288f,-0.2121144f,+0.0742610f,-0.0187293f};
-XMGLOBALCONST XMVECTORF32 g_XMPiConstants0 = {XM_PI, XM_2PI, XM_1DIVPI, XM_1DIV2PI};
-XMGLOBALCONST XMVECTORF32 g_XMIdentityR0 = {1.0f, 0.0f, 0.0f, 0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMIdentityR1 = {0.0f, 1.0f, 0.0f, 0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMIdentityR2 = {0.0f, 0.0f, 1.0f, 0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMIdentityR3 = {0.0f, 0.0f, 0.0f, 1.0f};
-XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR0 = {-1.0f,0.0f, 0.0f, 0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR1 = {0.0f,-1.0f, 0.0f, 0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR2 = {0.0f, 0.0f,-1.0f, 0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR3 = {0.0f, 0.0f, 0.0f,-1.0f};
-XMGLOBALCONST XMVECTORU32 g_XMNegativeZero = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-XMGLOBALCONST XMVECTORU32 g_XMNegate3 = {0x80000000, 0x80000000, 0x80000000, 0x00000000};
-XMGLOBALCONST XMVECTORU32 g_XMMaskXY = {0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000};
-XMGLOBALCONST XMVECTORU32 g_XMMask3 = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000};
-XMGLOBALCONST XMVECTORU32 g_XMMaskX = {0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000};
-XMGLOBALCONST XMVECTORU32 g_XMMaskY = {0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000};
-XMGLOBALCONST XMVECTORU32 g_XMMaskZ = {0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000};
-XMGLOBALCONST XMVECTORU32 g_XMMaskW = {0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF};
-XMGLOBALCONST XMVECTORF32 g_XMOne = { 1.0f, 1.0f, 1.0f, 1.0f};
-XMGLOBALCONST XMVECTORF32 g_XMOne3 = { 1.0f, 1.0f, 1.0f, 0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMZero = { 0.0f, 0.0f, 0.0f, 0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMTwo = { 2.f, 2.f, 2.f, 2.f };
-XMGLOBALCONST XMVECTORF32 g_XMFour = { 4.f, 4.f, 4.f, 4.f };
-XMGLOBALCONST XMVECTORF32 g_XMSix = { 6.f, 6.f, 6.f, 6.f };
-XMGLOBALCONST XMVECTORF32 g_XMNegativeOne = {-1.0f,-1.0f,-1.0f,-1.0f};
-XMGLOBALCONST XMVECTORF32 g_XMOneHalf = { 0.5f, 0.5f, 0.5f, 0.5f};
-XMGLOBALCONST XMVECTORF32 g_XMNegativeOneHalf = {-0.5f,-0.5f,-0.5f,-0.5f};
-XMGLOBALCONST XMVECTORF32 g_XMNegativeTwoPi = {-XM_2PI, -XM_2PI, -XM_2PI, -XM_2PI};
-XMGLOBALCONST XMVECTORF32 g_XMNegativePi = {-XM_PI, -XM_PI, -XM_PI, -XM_PI};
-XMGLOBALCONST XMVECTORF32 g_XMHalfPi = {XM_PIDIV2, XM_PIDIV2, XM_PIDIV2, XM_PIDIV2};
-XMGLOBALCONST XMVECTORF32 g_XMPi = {XM_PI, XM_PI, XM_PI, XM_PI};
-XMGLOBALCONST XMVECTORF32 g_XMReciprocalPi = {XM_1DIVPI, XM_1DIVPI, XM_1DIVPI, XM_1DIVPI};
-XMGLOBALCONST XMVECTORF32 g_XMTwoPi = {XM_2PI, XM_2PI, XM_2PI, XM_2PI};
-XMGLOBALCONST XMVECTORF32 g_XMReciprocalTwoPi = {XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI};
-XMGLOBALCONST XMVECTORF32 g_XMEpsilon = {1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f};
-XMGLOBALCONST XMVECTORI32 g_XMInfinity = {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
-XMGLOBALCONST XMVECTORI32 g_XMQNaN = {0x7FC00000, 0x7FC00000, 0x7FC00000, 0x7FC00000};
-XMGLOBALCONST XMVECTORI32 g_XMQNaNTest = {0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF};
-XMGLOBALCONST XMVECTORI32 g_XMAbsMask = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
-XMGLOBALCONST XMVECTORI32 g_XMFltMin = {0x00800000, 0x00800000, 0x00800000, 0x00800000};
-XMGLOBALCONST XMVECTORI32 g_XMFltMax = {0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF};
-XMGLOBALCONST XMVECTORU32 g_XMNegOneMask = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};
-XMGLOBALCONST XMVECTORU32 g_XMMaskA8R8G8B8 = {0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000};
-XMGLOBALCONST XMVECTORU32 g_XMFlipA8R8G8B8 = {0x00000000, 0x00000000, 0x00000000, 0x80000000};
-XMGLOBALCONST XMVECTORF32 g_XMFixAA8R8G8B8 = {0.0f,0.0f,0.0f,(float)(0x80000000U)};
-XMGLOBALCONST XMVECTORF32 g_XMNormalizeA8R8G8B8 = {1.0f/(255.0f*(float)(0x10000)),1.0f/(255.0f*(float)(0x100)),1.0f/255.0f,1.0f/(255.0f*(float)(0x1000000))};
-XMGLOBALCONST XMVECTORU32 g_XMMaskA2B10G10R10 = {0x000003FF, 0x000FFC00, 0x3FF00000, 0xC0000000};
-XMGLOBALCONST XMVECTORU32 g_XMFlipA2B10G10R10 = {0x00000200, 0x00080000, 0x20000000, 0x80000000};
-XMGLOBALCONST XMVECTORF32 g_XMFixAA2B10G10R10 = {-512.0f,-512.0f*(float)(0x400),-512.0f*(float)(0x100000),(float)(0x80000000U)};
-XMGLOBALCONST XMVECTORF32 g_XMNormalizeA2B10G10R10 = {1.0f/511.0f,1.0f/(511.0f*(float)(0x400)),1.0f/(511.0f*(float)(0x100000)),1.0f/(3.0f*(float)(0x40000000))};
-XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16 = {0x0000FFFF, 0xFFFF0000, 0x00000000, 0x00000000};
-XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16 = {0x00008000, 0x00000000, 0x00000000, 0x00000000};
-XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16 = {-32768.0f,0.0f,0.0f,0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16 = {1.0f/32767.0f,1.0f/(32767.0f*65536.0f),0.0f,0.0f};
-XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16Z16W16 = {0x0000FFFF, 0x0000FFFF, 0xFFFF0000, 0xFFFF0000};
-XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16Z16W16 = {0x00008000, 0x00008000, 0x00000000, 0x00000000};
-XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16Z16W16 = {-32768.0f,-32768.0f,0.0f,0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16Z16W16 = {1.0f/32767.0f,1.0f/32767.0f,1.0f/(32767.0f*65536.0f),1.0f/(32767.0f*65536.0f)};
-XMGLOBALCONST XMVECTORF32 g_XMNoFraction = {8388608.0f,8388608.0f,8388608.0f,8388608.0f};
-XMGLOBALCONST XMVECTORI32 g_XMMaskByte = {0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF};
-XMGLOBALCONST XMVECTORF32 g_XMNegateX = {-1.0f, 1.0f, 1.0f, 1.0f};
-XMGLOBALCONST XMVECTORF32 g_XMNegateY = { 1.0f,-1.0f, 1.0f, 1.0f};
-XMGLOBALCONST XMVECTORF32 g_XMNegateZ = { 1.0f, 1.0f,-1.0f, 1.0f};
-XMGLOBALCONST XMVECTORF32 g_XMNegateW = { 1.0f, 1.0f, 1.0f,-1.0f};
-XMGLOBALCONST XMVECTORU32 g_XMSelect0101 = {XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1};
-XMGLOBALCONST XMVECTORU32 g_XMSelect1010 = {XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0};
-XMGLOBALCONST XMVECTORI32 g_XMOneHalfMinusEpsilon = { 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD};
-XMGLOBALCONST XMVECTORU32 g_XMSelect1000 = {XM_SELECT_1, XM_SELECT_0, XM_SELECT_0, XM_SELECT_0};
-XMGLOBALCONST XMVECTORU32 g_XMSelect1100 = {XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0};
-XMGLOBALCONST XMVECTORU32 g_XMSelect1110 = {XM_SELECT_1, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0};
-XMGLOBALCONST XMVECTORU32 g_XMSelect1011 = { XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1 };
-XMGLOBALCONST XMVECTORF32 g_XMFixupY16 = {1.0f,1.0f/65536.0f,0.0f,0.0f};
-XMGLOBALCONST XMVECTORF32 g_XMFixupY16W16 = {1.0f,1.0f,1.0f/65536.0f,1.0f/65536.0f};
-XMGLOBALCONST XMVECTORU32 g_XMFlipY = {0,0x80000000,0,0};
-XMGLOBALCONST XMVECTORU32 g_XMFlipZ = {0,0,0x80000000,0};
-XMGLOBALCONST XMVECTORU32 g_XMFlipW = {0,0,0,0x80000000};
-XMGLOBALCONST XMVECTORU32 g_XMFlipYZ = {0,0x80000000,0x80000000,0};
-XMGLOBALCONST XMVECTORU32 g_XMFlipZW = {0,0,0x80000000,0x80000000};
-XMGLOBALCONST XMVECTORU32 g_XMFlipYW = {0,0x80000000,0,0x80000000};
-XMGLOBALCONST XMVECTORI32 g_XMMaskDec4 = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30};
-XMGLOBALCONST XMVECTORI32 g_XMXorDec4 = {0x200,0x200<<10,0x200<<20,0};
-XMGLOBALCONST XMVECTORF32 g_XMAddUDec4 = {0,0,0,32768.0f*65536.0f};
-XMGLOBALCONST XMVECTORF32 g_XMAddDec4 = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,0};
-XMGLOBALCONST XMVECTORF32 g_XMMulDec4 = {1.0f,1.0f/1024.0f,1.0f/(1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)};
-XMGLOBALCONST XMVECTORU32 g_XMMaskByte4 = {0xFF,0xFF00,0xFF0000,0xFF000000};
-XMGLOBALCONST XMVECTORI32 g_XMXorByte4 = {0x80,0x8000,0x800000,0x00000000};
-XMGLOBALCONST XMVECTORF32 g_XMAddByte4 = {-128.0f,-128.0f*256.0f,-128.0f*65536.0f,0};
-XMGLOBALCONST XMVECTORF32 g_XMFixUnsigned = {32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f};
-XMGLOBALCONST XMVECTORF32 g_XMMaxInt = {65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f};
-XMGLOBALCONST XMVECTORF32 g_XMMaxUInt = {65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f};
-XMGLOBALCONST XMVECTORF32 g_XMUnsignedFix = {32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f};
-XMGLOBALCONST XMVECTORF32 g_XMsrgbScale = { 12.92f, 12.92f, 12.92f, 1.0f };
-XMGLOBALCONST XMVECTORF32 g_XMsrgbA = { 0.055f, 0.055f, 0.055f, 0.0f };
-XMGLOBALCONST XMVECTORF32 g_XMsrgbA1 = { 1.055f, 1.055f, 1.055f, 1.0f };
-XMGLOBALCONST XMVECTORI32 g_XMExponentBias = {127, 127, 127, 127};
-XMGLOBALCONST XMVECTORI32 g_XMSubnormalExponent = {-126, -126, -126, -126};
-XMGLOBALCONST XMVECTORI32 g_XMNumTrailing = {23, 23, 23, 23};
-XMGLOBALCONST XMVECTORI32 g_XMMinNormal = {0x00800000, 0x00800000, 0x00800000, 0x00800000};
-XMGLOBALCONST XMVECTORU32 g_XMNegInfinity = {0xFF800000, 0xFF800000, 0xFF800000, 0xFF800000};
-XMGLOBALCONST XMVECTORU32 g_XMNegQNaN = {0xFFC00000, 0xFFC00000, 0xFFC00000, 0xFFC00000};
-XMGLOBALCONST XMVECTORI32 g_XMBin128 = {0x43000000, 0x43000000, 0x43000000, 0x43000000};
-XMGLOBALCONST XMVECTORU32 g_XMBinNeg150 = {0xC3160000, 0xC3160000, 0xC3160000, 0xC3160000};
-XMGLOBALCONST XMVECTORI32 g_XM253 = {253, 253, 253, 253};
-XMGLOBALCONST XMVECTORF32 g_XMExpEst1 = {-6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f};
-XMGLOBALCONST XMVECTORF32 g_XMExpEst2 = {+2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f};
-XMGLOBALCONST XMVECTORF32 g_XMExpEst3 = {-5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f};
-XMGLOBALCONST XMVECTORF32 g_XMExpEst4 = {+9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f};
-XMGLOBALCONST XMVECTORF32 g_XMExpEst5 = {-1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f};
-XMGLOBALCONST XMVECTORF32 g_XMExpEst6 = {+1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f};
-XMGLOBALCONST XMVECTORF32 g_XMExpEst7 = {-1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst0 = {+1.442693f, +1.442693f, +1.442693f, +1.442693f};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst1 = {-0.721242f, -0.721242f, -0.721242f, -0.721242f};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst2 = {+0.479384f, +0.479384f, +0.479384f, +0.479384f};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst3 = {-0.350295f, -0.350295f, -0.350295f, -0.350295f};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst4 = {+0.248590f, +0.248590f, +0.248590f, +0.248590f};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst5 = {-0.145700f, -0.145700f, -0.145700f, -0.145700f};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst6 = {+0.057148f, +0.057148f, +0.057148f, +0.057148f};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst7 = {-0.010578f, -0.010578f, -0.010578f, -0.010578f};
-XMGLOBALCONST XMVECTORF32 g_XMLgE = {+1.442695f, +1.442695f, +1.442695f, +1.442695f};
-XMGLOBALCONST XMVECTORF32 g_XMInvLgE = {+6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f};
-XMGLOBALCONST XMVECTORF32 g_UByteMax = {255.0f, 255.0f, 255.0f, 255.0f};
-XMGLOBALCONST XMVECTORF32 g_ByteMin = {-127.0f, -127.0f, -127.0f, -127.0f};
-XMGLOBALCONST XMVECTORF32 g_ByteMax = {127.0f, 127.0f, 127.0f, 127.0f};
-XMGLOBALCONST XMVECTORF32 g_ShortMin = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
-XMGLOBALCONST XMVECTORF32 g_ShortMax = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
-XMGLOBALCONST XMVECTORF32 g_UShortMax = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
-
-/****************************************************************************
- *
- * Implementation
- *
- ****************************************************************************/
-
-#pragma warning(push)
-#pragma warning(disable:4068 4214 4204 4365 4616 4640 6001 6101)
-// C4068/4616: ignore unknown pragmas
-// C4214/4204: nonstandard extension used
-// C4365/4640: Off by default noise
-// C6001/6101: False positives
-
-#pragma prefast(push)
-#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes")
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORU32 vResult;
-    vResult.u[0] = (0-(C0&1)) & 0x3F800000;
-    vResult.u[1] = (0-(C1&1)) & 0x3F800000;
-    vResult.u[2] = (0-(C2&1)) & 0x3F800000;
-    vResult.u[3] = (0-(C3&1)) & 0x3F800000;
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMVECTORU32 vResult;
-    vResult.u[0] = (0-(C0&1)) & 0x3F800000;
-    vResult.u[1] = (0-(C1&1)) & 0x3F800000;
-    vResult.u[2] = (0-(C2&1)) & 0x3F800000;
-    vResult.u[3] = (0-(C3&1)) & 0x3F800000;
-    return vResult.v;
-#else // XM_SSE_INTRINSICS_
-    static const XMVECTORU32 g_vMask1 = {1,1,1,1};
-    // Move the parms to a vector
-    __m128i vTemp = _mm_set_epi32(C3,C2,C1,C0);
-    // Mask off the low bits
-    vTemp = _mm_and_si128(vTemp,g_vMask1);
-    // 0xFFFFFFFF on true bits
-    vTemp = _mm_cmpeq_epi32(vTemp,g_vMask1);
-    // 0xFFFFFFFF -> 1.0f, 0x00000000 -> 0.0f
-    vTemp = _mm_and_si128(vTemp,g_XMOne);
-    return _mm_castsi128_ps(vTemp);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent)
-{
-    assert( IntConstant >= -16 && IntConstant <= 15 );
-    assert( DivExponent < 32 );
-#if defined(_XM_NO_INTRINSICS_)
-
-    using DirectX::XMConvertVectorIntToFloat;
-
-    XMVECTORI32 V = { IntConstant, IntConstant, IntConstant, IntConstant };
-    return XMConvertVectorIntToFloat( V.v, DivExponent);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Splat the int
-    int32x4_t vScale = vdupq_n_s32(IntConstant);
-    // Convert to a float
-    XMVECTOR vResult = vcvtq_f32_s32(vScale);
-    // Convert DivExponent into 1.0f/(1<<DivExponent)
-    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
-    // Splat the scalar value (it's really a float)
-    vScale = vdupq_n_u32(uScale);
-    // Multiply by the reciprocal (vResult = vResult * (1.0f/(1<<DivExponent)))
-    vResult = vmulq_f32(vResult, reinterpret_cast<const XMVECTOR*>(&vScale)[0]);
-    return vResult;
-#else // XM_SSE_INTRINSICS_
-    // Splat the int
-    __m128i vScale = _mm_set1_epi32(IntConstant);
-    // Convert to a float
-    XMVECTOR vResult = _mm_cvtepi32_ps(vScale);
-    // Convert DivExponent into 1.0f/(1<<DivExponent)
-    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
-    // Splat the scalar value (it's really a float)
-    vScale = _mm_set1_epi32(static_cast<int>(uScale));
-    // Multiply by the reciprocal (vResult = vResult * (1.0f/(1<<DivExponent)))
-    vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale));
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant)
-{
-    assert( IntConstant >= -16 && IntConstant <= 15 );
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORI32 V = { IntConstant, IntConstant, IntConstant, IntConstant };
-    return V.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    int32x4_t V = vdupq_n_s32( IntConstant );
-    return reinterpret_cast<XMVECTOR*>(&V)[0];
-#else // XM_SSE_INTRINSICS_
-    __m128i V = _mm_set1_epi32( IntConstant );
-    return _mm_castsi128_ps(V);
-#endif
-}
-
-#include "DirectXMathConvert.inl"
-#include "DirectXMathVector.inl"
-#include "DirectXMathMatrix.inl"
-#include "DirectXMathMisc.inl"
-
-#pragma prefast(pop)
-#pragma warning(pop)
-
-}; // namespace DirectX
-
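XMVectorSplatConstant above builds IntConstant / 2^DivExponent without a divide: subtracting DivExponent << 23 from 0x3F800000 (the bit pattern of 1.0f) lowers the float exponent, producing the reciprocal scale directly. Illustrative (not part of the patch):

    XMVECTOR eighth = XMVectorSplatConstant(1, 3); // 1 / 2^3 = 0.125f in every lane
    XMVECTOR threes = XMVectorSplatConstantInt(3); // integer 3 in every lane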
+#endif + +#if defined(_MSC_VER) && !defined(_M_ARM) && !defined(_M_ARM64) && (!_MANAGED) && (!_M_CEE) && (!defined(_M_IX86_FP) || (_M_IX86_FP > 1)) && !defined(_XM_NO_INTRINSICS_) && !defined(_XM_VECTORCALL_) +#define _XM_VECTORCALL_ 1 +#endif + +#if _XM_VECTORCALL_ +#define XM_CALLCONV __vectorcall +#else +#define XM_CALLCONV __fastcall +#endif + +#if defined(_MSC_VER) && (_MSC_VER < 1800) +#define XM_CTOR_DEFAULT {} +#else +#define XM_CTOR_DEFAULT =default; +#endif + +#if defined(_MSC_VER) && (_MSC_VER < 1900) +#define XM_CONST const +#define XM_CONSTEXPR +#else +#define XM_CONST constexpr +#define XM_CONSTEXPR constexpr +#endif + +#ifndef XM_DEPRECATED +#define XM_DEPRECATED __declspec(deprecated("This is deprecated and will be removed in a future version.")) +#endif + +#if !defined(_XM_F16C_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_) +#define _XM_F16C_INTRINSICS_ +#endif + +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_) +#define _XM_AVX_INTRINSICS_ +#endif + +#if !defined(_XM_AVX_INTRINSICS_) && defined(__AVX__) && !defined(_XM_NO_INTRINSICS_) +#define _XM_AVX_INTRINSICS_ +#endif + +#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_SSE4_INTRINSICS_) +#define _XM_SSE4_INTRINSICS_ +#endif + +#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_SSE3_INTRINSICS_) +#define _XM_SSE3_INTRINSICS_ +#endif + +#if defined(_XM_SSE3_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) +#define _XM_SSE_INTRINSICS_ +#endif + +#if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +#if defined(_M_IX86) || defined(_M_X64) +#define _XM_SSE_INTRINSICS_ +#elif defined(_M_ARM) || defined(_M_ARM64) +#define _XM_ARM_NEON_INTRINSICS_ +#elif !defined(_XM_NO_INTRINSICS_) +#error DirectX Math does not support this target +#endif +#endif // !_XM_ARM_NEON_INTRINSICS_ && !_XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_ + +#pragma warning(push) +#pragma warning(disable:4514 4820) +// C4514/4820: Off by default noise +#include <math.h> +#include <float.h> +#include <malloc.h> +#pragma warning(pop) + +#ifndef _XM_NO_INTRINSICS_ +#pragma warning(push) +#pragma warning(disable : 4987) +// C4987: Off by default noise +#include <intrin.h> +#pragma warning(pop) + +#ifdef _XM_SSE_INTRINSICS_ +#include <xmmintrin.h> +#include <emmintrin.h> + +#ifdef _XM_SSE3_INTRINSICS_ +#include <pmmintrin.h> +#endif + +#ifdef _XM_SSE4_INTRINSICS_ +#include <smmintrin.h> +#endif + +#ifdef _XM_AVX_INTRINSICS_ +#include <immintrin.h> +#endif + +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#ifdef _M_ARM64 +#include <arm64_neon.h> +#else +#include <arm_neon.h> +#endif +#endif +#endif // !_XM_NO_INTRINSICS_ + +#include <sal.h> +#include <assert.h> + +#ifndef _XM_NO_ROUNDF_ +#ifdef _MSC_VER +#include <yvals.h> +#if defined(_CPPLIB_VER) && ( _CPPLIB_VER < 610 ) +#define _XM_NO_ROUNDF_ +#endif +#endif +#endif + +#pragma warning(push) +#pragma warning(disable : 4005 4668) +// C4005/4668: Old header issue +#include <stdint.h> +#pragma warning(pop) + +/**************************************************************************** + * + * Conditional intrinsics + * + ****************************************************************************/ + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + +#if defined(_XM_NO_MOVNT_) +#define XM_STREAM_PS( p, a ) _mm_store_ps( p, a ) +#define XM_SFENCE() +#else +#define XM_STREAM_PS( p, a ) _mm_stream_ps( p, a ) +#define XM_SFENCE() _mm_sfence() +#endif + +#if defined(_XM_AVX_INTRINSICS_) +#define XM_PERMUTE_PS( v, c ) _mm_permute_ps( v, c ) +#else +#define XM_PERMUTE_PS( v, c ) _mm_shuffle_ps( v, v, c ) +#endif + +#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_ + +namespace DirectX +{ +
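As a quick illustration of the conditional intrinsics above: XM_PERMUTE_PS is the header's portable lane shuffle, expanding to the one-register _mm_permute_ps when AVX is available and to _mm_shuffle_ps of a vector with itself otherwise, so both forms take the same _MM_SHUFFLE immediate. A minimal caller-side sketch (SplatZ is a hypothetical helper, not part of this header):

#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
// Broadcast lane 2 (z) of v into all four lanes; a single shuffle either way.
inline __m128 SplatZ(__m128 v)
{
    return XM_PERMUTE_PS(v, _MM_SHUFFLE(2, 2, 2, 2));
}
#endif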
+/**************************************************************************** + * + * Constant definitions + * + ****************************************************************************/ + +#if defined(__XNAMATH_H__) && defined(XM_PI) +#undef XM_PI +#undef XM_2PI +#undef XM_1DIVPI +#undef XM_1DIV2PI +#undef XM_PIDIV2 +#undef XM_PIDIV4 +#undef XM_SELECT_0 +#undef XM_SELECT_1 +#undef XM_PERMUTE_0X +#undef XM_PERMUTE_0Y +#undef XM_PERMUTE_0Z +#undef XM_PERMUTE_0W +#undef XM_PERMUTE_1X +#undef XM_PERMUTE_1Y +#undef XM_PERMUTE_1Z +#undef XM_PERMUTE_1W +#undef XM_CRMASK_CR6 +#undef XM_CRMASK_CR6TRUE +#undef XM_CRMASK_CR6FALSE +#undef XM_CRMASK_CR6BOUNDS +#undef XM_CACHE_LINE_SIZE +#endif + +XM_CONST float XM_PI = 3.141592654f; +XM_CONST float XM_2PI = 6.283185307f; +XM_CONST float XM_1DIVPI = 0.318309886f; +XM_CONST float XM_1DIV2PI = 0.159154943f; +XM_CONST float XM_PIDIV2 = 1.570796327f; +XM_CONST float XM_PIDIV4 = 0.785398163f; + +XM_CONST uint32_t XM_SELECT_0 = 0x00000000; +XM_CONST uint32_t XM_SELECT_1 = 0xFFFFFFFF; + +XM_CONST uint32_t XM_PERMUTE_0X = 0; +XM_CONST uint32_t XM_PERMUTE_0Y = 1; +XM_CONST uint32_t XM_PERMUTE_0Z = 2; +XM_CONST uint32_t XM_PERMUTE_0W = 3; +XM_CONST uint32_t XM_PERMUTE_1X = 4; +XM_CONST uint32_t XM_PERMUTE_1Y = 5; +XM_CONST uint32_t XM_PERMUTE_1Z = 6; +XM_CONST uint32_t XM_PERMUTE_1W = 7; + +XM_CONST uint32_t XM_SWIZZLE_X = 0; +XM_CONST uint32_t XM_SWIZZLE_Y = 1; +XM_CONST uint32_t XM_SWIZZLE_Z = 2; +XM_CONST uint32_t XM_SWIZZLE_W = 3; + +XM_CONST uint32_t XM_CRMASK_CR6 = 0x000000F0; +XM_CONST uint32_t XM_CRMASK_CR6TRUE = 0x00000080; +XM_CONST uint32_t XM_CRMASK_CR6FALSE = 0x00000020; +XM_CONST uint32_t XM_CRMASK_CR6BOUNDS = XM_CRMASK_CR6FALSE; + +XM_CONST size_t XM_CACHE_LINE_SIZE = 64; + + +/**************************************************************************** + * + * Macros + * + ****************************************************************************/ + +#if defined(__XNAMATH_H__) && defined(XMComparisonAllTrue) +#undef XMComparisonAllTrue +#undef XMComparisonAnyTrue +#undef XMComparisonAllFalse +#undef XMComparisonAnyFalse +#undef XMComparisonMixed +#undef XMComparisonAllInBounds +#undef XMComparisonAnyOutOfBounds +#endif + +// Unit conversion + +inline XM_CONSTEXPR float XMConvertToRadians(float fDegrees) { return fDegrees * (XM_PI / 180.0f); } +inline XM_CONSTEXPR float XMConvertToDegrees(float fRadians) { return fRadians * (180.0f / XM_PI); } + +// Condition register evaluation proceeding a recording (R) comparison + +inline bool XMComparisonAllTrue(uint32_t CR) { return (((CR) & XM_CRMASK_CR6TRUE) == XM_CRMASK_CR6TRUE); } +inline bool XMComparisonAnyTrue(uint32_t CR) { return (((CR) & XM_CRMASK_CR6FALSE) != XM_CRMASK_CR6FALSE); } +inline bool XMComparisonAllFalse(uint32_t CR) { return (((CR) & XM_CRMASK_CR6FALSE) == XM_CRMASK_CR6FALSE); } +inline bool XMComparisonAnyFalse(uint32_t CR) { return (((CR) & XM_CRMASK_CR6TRUE) != XM_CRMASK_CR6TRUE); } +inline bool XMComparisonMixed(uint32_t CR) { return (((CR) & XM_CRMASK_CR6) == 0); } +inline bool XMComparisonAllInBounds(uint32_t CR) { return (((CR) & XM_CRMASK_CR6BOUNDS) == XM_CRMASK_CR6BOUNDS); } +inline bool XMComparisonAnyOutOfBounds(uint32_t CR) { return (((CR) & XM_CRMASK_CR6BOUNDS) != XM_CRMASK_CR6BOUNDS); } + + +/**************************************************************************** + * + * Data types + * +
****************************************************************************/ + +#pragma warning(push) +#pragma warning(disable:4068 4201 4365 4324 4820) +// C4068: ignore unknown pragmas +// C4201: nonstandard extension used : nameless struct/union +// C4365: Off by default noise +// C4324/4820: padding warnings + +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") + +//------------------------------------------------------------------------------ +#if defined(_XM_NO_INTRINSICS_) +struct __vector4 +{ + union + { + float vector4_f32[4]; + uint32_t vector4_u32[4]; + }; +}; +#endif // _XM_NO_INTRINSICS_ + +//------------------------------------------------------------------------------ +// Vector intrinsic: Four 32 bit floating point components aligned on a 16 byte +// boundary and mapped to hardware vector registers +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +typedef __m128 XMVECTOR; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +typedef float32x4_t XMVECTOR; +#else +typedef __vector4 XMVECTOR; +#endif + +// Fix-up for (1st-3rd) XMVECTOR parameters that are pass-in-register for x86, ARM, ARM64, and vector call; by reference otherwise +#if ( defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64) || _XM_VECTORCALL_ ) && !defined(_XM_NO_INTRINSICS_) +typedef const XMVECTOR FXMVECTOR; +#else +typedef const XMVECTOR& FXMVECTOR; +#endif + +// Fix-up for (4th) XMVECTOR parameter to pass in-register for ARM, ARM64, and x64 vector call; by reference otherwise +#if ( defined(_M_ARM) || defined(_M_ARM64) || (_XM_VECTORCALL_ && !defined(_M_IX86) ) ) && !defined(_XM_NO_INTRINSICS_) +typedef const XMVECTOR GXMVECTOR; +#else +typedef const XMVECTOR& GXMVECTOR; +#endif + +// Fix-up for (5th & 6th) XMVECTOR parameter to pass in-register for ARM64 and vector call; by reference otherwise +#if ( defined(_M_ARM64) || _XM_VECTORCALL_ ) && !defined(_XM_NO_INTRINSICS_) +typedef const XMVECTOR HXMVECTOR; +#else +typedef const XMVECTOR& HXMVECTOR; +#endif + +// Fix-up for (7th+) XMVECTOR parameters to pass by reference +typedef const XMVECTOR& CXMVECTOR; + +//------------------------------------------------------------------------------ +// Conversion types for constants +__declspec(align(16)) struct XMVECTORF32 +{ + union + { + float f[4]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const { return v; } + inline operator const float*() const { return f; } +#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const { return _mm_castps_si128(v); } + inline operator __m128d() const { return _mm_castps_pd(v); } +#endif +}; + +__declspec(align(16)) struct XMVECTORI32 +{ + union + { + int32_t i[4]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const { return v; } +#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const { return _mm_castps_si128(v); } + inline operator __m128d() const { return _mm_castps_pd(v); } +#endif +}; + +__declspec(align(16)) struct XMVECTORU8 +{ + union + { + uint8_t u[16]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const { return v; } +#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const { return _mm_castps_si128(v); } + inline operator __m128d() const { return _mm_castps_pd(v); } +#endif +}; + +__declspec(align(16)) struct XMVECTORU32 +{ + union + { + uint32_t u[4]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const { return v; } +#if !defined(_XM_NO_INTRINSICS_) && 
defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const { return _mm_castps_si128(v); } + inline operator __m128d() const { return _mm_castps_pd(v); } +#endif +}; + +//------------------------------------------------------------------------------ +// Vector operators +XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V); +XMVECTOR XM_CALLCONV operator- (FXMVECTOR V); + +XMVECTOR& XM_CALLCONV operator+= (XMVECTOR& V1, FXMVECTOR V2); +XMVECTOR& XM_CALLCONV operator-= (XMVECTOR& V1, FXMVECTOR V2); +XMVECTOR& XM_CALLCONV operator*= (XMVECTOR& V1, FXMVECTOR V2); +XMVECTOR& XM_CALLCONV operator/= (XMVECTOR& V1, FXMVECTOR V2); + +XMVECTOR& operator*= (XMVECTOR& V, float S); +XMVECTOR& operator/= (XMVECTOR& V, float S); + +XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV operator- (FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV operator* (FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV operator/ (FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV operator* (FXMVECTOR V, float S); +XMVECTOR XM_CALLCONV operator* (float S, FXMVECTOR V); +XMVECTOR XM_CALLCONV operator/ (FXMVECTOR V, float S); + +//------------------------------------------------------------------------------ +// Matrix type: Sixteen 32 bit floating point components aligned on a +// 16 byte boundary and mapped to four hardware vector registers + +struct XMMATRIX; + +// Fix-up for (1st) XMMATRIX parameter to pass in-register for ARM64 and vector call; by reference otherwise +#if ( defined(_M_ARM64) || _XM_VECTORCALL_ ) && !defined(_XM_NO_INTRINSICS_) +typedef const XMMATRIX FXMMATRIX; +#else +typedef const XMMATRIX& FXMMATRIX; +#endif + +// Fix-up for (2nd+) XMMATRIX parameters to pass by reference +typedef const XMMATRIX& CXMMATRIX; + +#ifdef _XM_NO_INTRINSICS_ +struct XMMATRIX +#else +__declspec(align(16)) struct XMMATRIX +#endif +{ +#ifdef _XM_NO_INTRINSICS_ + union + { + XMVECTOR r[4]; + struct + { + float _11, _12, _13, _14; + float _21, _22, _23, _24; + float _31, _32, _33, _34; + float _41, _42, _43, _44; + }; + float m[4][4]; + }; +#else + XMVECTOR r[4]; +#endif + + XMMATRIX() XM_CTOR_DEFAULT +#if defined(_MSC_VER) && _MSC_VER >= 1900 + constexpr XMMATRIX(FXMVECTOR R0, FXMVECTOR R1, FXMVECTOR R2, CXMVECTOR R3) : r{ R0,R1,R2,R3 } {} +#else + XMMATRIX(FXMVECTOR R0, FXMVECTOR R1, FXMVECTOR R2, CXMVECTOR R3) { r[0] = R0; r[1] = R1; r[2] = R2; r[3] = R3; } +#endif + XMMATRIX(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33); + explicit XMMATRIX(_In_reads_(16) const float *pArray); + +#ifdef _XM_NO_INTRINSICS_ + float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } +#endif + + XMMATRIX& operator= (const XMMATRIX& M) { r[0] = M.r[0]; r[1] = M.r[1]; r[2] = M.r[2]; r[3] = M.r[3]; return *this; } + + XMMATRIX operator+ () const { return *this; } + XMMATRIX operator- () const; + + XMMATRIX& XM_CALLCONV operator+= (FXMMATRIX M); + XMMATRIX& XM_CALLCONV operator-= (FXMMATRIX M); + XMMATRIX& XM_CALLCONV operator*= (FXMMATRIX M); + XMMATRIX& operator*= (float S); + XMMATRIX& operator/= (float S); + + XMMATRIX XM_CALLCONV operator+ (FXMMATRIX M) const; + XMMATRIX XM_CALLCONV operator- (FXMMATRIX M) const; + XMMATRIX XM_CALLCONV operator* (FXMMATRIX M) const; + XMMATRIX operator* (float S) const; + XMMATRIX operator/ (float S) const; + + friend XMMATRIX XM_CALLCONV operator* 
(float S, FXMMATRIX M); +}; + +//------------------------------------------------------------------------------ +// 2D Vector; 32 bit floating point components +struct XMFLOAT2 +{ + float x; + float y; + + XMFLOAT2() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT2(float _x, float _y) : x(_x), y(_y) {} + explicit XMFLOAT2(_In_reads_(2) const float *pArray) : x(pArray[0]), y(pArray[1]) {} + + XMFLOAT2& operator= (const XMFLOAT2& Float2) { x = Float2.x; y = Float2.y; return *this; } +}; + +// 2D Vector; 32 bit floating point components aligned on a 16 byte boundary +__declspec(align(16)) struct XMFLOAT2A : public XMFLOAT2 +{ + XMFLOAT2A() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT2A(float _x, float _y) : XMFLOAT2(_x, _y) {} + explicit XMFLOAT2A(_In_reads_(2) const float *pArray) : XMFLOAT2(pArray) {} + + XMFLOAT2A& operator= (const XMFLOAT2A& Float2) { x = Float2.x; y = Float2.y; return *this; } +}; + +//------------------------------------------------------------------------------ +// 2D Vector; 32 bit signed integer components +struct XMINT2 +{ + int32_t x; + int32_t y; + + XMINT2() XM_CTOR_DEFAULT + XM_CONSTEXPR XMINT2(int32_t _x, int32_t _y) : x(_x), y(_y) {} + explicit XMINT2(_In_reads_(2) const int32_t *pArray) : x(pArray[0]), y(pArray[1]) {} + + XMINT2& operator= (const XMINT2& Int2) { x = Int2.x; y = Int2.y; return *this; } +}; + +// 2D Vector; 32 bit unsigned integer components +struct XMUINT2 +{ + uint32_t x; + uint32_t y; + + XMUINT2() XM_CTOR_DEFAULT + XM_CONSTEXPR XMUINT2(uint32_t _x, uint32_t _y) : x(_x), y(_y) {} + explicit XMUINT2(_In_reads_(2) const uint32_t *pArray) : x(pArray[0]), y(pArray[1]) {} + + XMUINT2& operator= (const XMUINT2& UInt2) { x = UInt2.x; y = UInt2.y; return *this; } +}; + +//------------------------------------------------------------------------------ +// 3D Vector; 32 bit floating point components +struct XMFLOAT3 +{ + float x; + float y; + float z; + + XMFLOAT3() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT3(float _x, float _y, float _z) : x(_x), y(_y), z(_z) {} + explicit XMFLOAT3(_In_reads_(3) const float *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + + XMFLOAT3& operator= (const XMFLOAT3& Float3) { x = Float3.x; y = Float3.y; z = Float3.z; return *this; } +}; + +// 3D Vector; 32 bit floating point components aligned on a 16 byte boundary +__declspec(align(16)) struct XMFLOAT3A : public XMFLOAT3 +{ + XMFLOAT3A() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT3A(float _x, float _y, float _z) : XMFLOAT3(_x, _y, _z) {} + explicit XMFLOAT3A(_In_reads_(3) const float *pArray) : XMFLOAT3(pArray) {} + + XMFLOAT3A& operator= (const XMFLOAT3A& Float3) { x = Float3.x; y = Float3.y; z = Float3.z; return *this; } +}; + +//------------------------------------------------------------------------------ +// 3D Vector; 32 bit signed integer components +struct XMINT3 +{ + int32_t x; + int32_t y; + int32_t z; + + XMINT3() XM_CTOR_DEFAULT + XM_CONSTEXPR XMINT3(int32_t _x, int32_t _y, int32_t _z) : x(_x), y(_y), z(_z) {} + explicit XMINT3(_In_reads_(3) const int32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + + XMINT3& operator= (const XMINT3& i3) { x = i3.x; y = i3.y; z = i3.z; return *this; } +}; + +// 3D Vector; 32 bit unsigned integer components +struct XMUINT3 +{ + uint32_t x; + uint32_t y; + uint32_t z; + + XMUINT3() XM_CTOR_DEFAULT + XM_CONSTEXPR XMUINT3(uint32_t _x, uint32_t _y, uint32_t _z) : x(_x), y(_y), z(_z) {} + explicit XMUINT3(_In_reads_(3) const uint32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + + XMUINT3& operator= (const XMUINT3& u3) { x = 
u3.x; y = u3.y; z = u3.z; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D Vector; 32 bit floating point components +struct XMFLOAT4 +{ + float x; + float y; + float z; + float w; + + XMFLOAT4() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT4(float _x, float _y, float _z, float _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMFLOAT4(_In_reads_(4) const float *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + + XMFLOAT4& operator= (const XMFLOAT4& Float4) { x = Float4.x; y = Float4.y; z = Float4.z; w = Float4.w; return *this; } +}; + +// 4D Vector; 32 bit floating point components aligned on a 16 byte boundary +__declspec(align(16)) struct XMFLOAT4A : public XMFLOAT4 +{ + XMFLOAT4A() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT4A(float _x, float _y, float _z, float _w) : XMFLOAT4(_x, _y, _z, _w) {} + explicit XMFLOAT4A(_In_reads_(4) const float *pArray) : XMFLOAT4(pArray) {} + + XMFLOAT4A& operator= (const XMFLOAT4A& Float4) { x = Float4.x; y = Float4.y; z = Float4.z; w = Float4.w; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D Vector; 32 bit signed integer components +struct XMINT4 +{ + int32_t x; + int32_t y; + int32_t z; + int32_t w; + + XMINT4() XM_CTOR_DEFAULT + XM_CONSTEXPR XMINT4(int32_t _x, int32_t _y, int32_t _z, int32_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMINT4(_In_reads_(4) const int32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + + XMINT4& operator= (const XMINT4& Int4) { x = Int4.x; y = Int4.y; z = Int4.z; w = Int4.w; return *this; } +}; + +// 4D Vector; 32 bit unsigned integer components +struct XMUINT4 +{ + uint32_t x; + uint32_t y; + uint32_t z; + uint32_t w; + + XMUINT4() XM_CTOR_DEFAULT + XM_CONSTEXPR XMUINT4(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUINT4(_In_reads_(4) const uint32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + + XMUINT4& operator= (const XMUINT4& UInt4) { x = UInt4.x; y = UInt4.y; z = UInt4.z; w = UInt4.w; return *this; } +}; + +//------------------------------------------------------------------------------ +// 3x3 Matrix: 32 bit floating point components +struct XMFLOAT3X3 +{ + union + { + struct + { + float _11, _12, _13; + float _21, _22, _23; + float _31, _32, _33; + }; + float m[3][3]; + }; + + XMFLOAT3X3() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT3X3(float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22) + : _11(m00), _12(m01), _13(m02), + _21(m10), _22(m11), _23(m12), + _31(m20), _32(m21), _33(m22) {} + explicit XMFLOAT3X3(_In_reads_(9) const float *pArray); + + float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } + + XMFLOAT3X3& operator= (const XMFLOAT3X3& Float3x3); +}; + +//------------------------------------------------------------------------------ +// 4x3 Matrix: 32 bit floating point components +struct XMFLOAT4X3 +{ + union + { + struct + { + float _11, _12, _13; + float _21, _22, _23; + float _31, _32, _33; + float _41, _42, _43; + }; + float m[4][3]; + }; + + XMFLOAT4X3() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT4X3(float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22, + float m30, float m31, float m32) + : _11(m00), _12(m01), _13(m02), + _21(m10), _22(m11), _23(m12), + _31(m20), _32(m21), _33(m22), + 
_41(m30), _42(m31), _43(m32) {} + explicit XMFLOAT4X3(_In_reads_(12) const float *pArray); + + float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } + + XMFLOAT4X3& operator= (const XMFLOAT4X3& Float4x3); + +}; + +// 4x3 Matrix: 32 bit floating point components aligned on a 16 byte boundary +__declspec(align(16)) struct XMFLOAT4X3A : public XMFLOAT4X3 +{ + XMFLOAT4X3A() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT4X3A(float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22, + float m30, float m31, float m32) : + XMFLOAT4X3(m00,m01,m02,m10,m11,m12,m20,m21,m22,m30,m31,m32) {} + explicit XMFLOAT4X3A(_In_reads_(12) const float *pArray) : XMFLOAT4X3(pArray) {} + + float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } + + XMFLOAT4X3A& operator= (const XMFLOAT4X3A& Float4x3); +}; + +//------------------------------------------------------------------------------ +// 4x4 Matrix: 32 bit floating point components +struct XMFLOAT4X4 +{ + union + { + struct + { + float _11, _12, _13, _14; + float _21, _22, _23, _24; + float _31, _32, _33, _34; + float _41, _42, _43, _44; + }; + float m[4][4]; + }; + + XMFLOAT4X4() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT4X4(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33) + : _11(m00), _12(m01), _13(m02), _14(m03), + _21(m10), _22(m11), _23(m12), _24(m13), + _31(m20), _32(m21), _33(m22), _34(m23), + _41(m30), _42(m31), _43(m32), _44(m33) {} + explicit XMFLOAT4X4(_In_reads_(16) const float *pArray); + + float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } + + XMFLOAT4X4& operator= (const XMFLOAT4X4& Float4x4); +}; + +// 4x4 Matrix: 32 bit floating point components aligned on a 16 byte boundary +__declspec(align(16)) struct XMFLOAT4X4A : public XMFLOAT4X4 +{ + XMFLOAT4X4A() XM_CTOR_DEFAULT + XM_CONSTEXPR XMFLOAT4X4A(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33) + : XMFLOAT4X4(m00,m01,m02,m03,m10,m11,m12,m13,m20,m21,m22,m23,m30,m31,m32,m33) {} + explicit XMFLOAT4X4A(_In_reads_(16) const float *pArray) : XMFLOAT4X4(pArray) {} + + float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } + + XMFLOAT4X4A& operator= (const XMFLOAT4X4A& Float4x4); +}; + +//////////////////////////////////////////////////////////////////////////////// + +#pragma prefast(pop) +#pragma warning(pop) + +/**************************************************************************** + * + * Data conversion operations + * + ****************************************************************************/ + +XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat(FXMVECTOR VInt, uint32_t DivExponent); +XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt(FXMVECTOR VFloat, uint32_t MulExponent); +XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat(FXMVECTOR VUInt, uint32_t DivExponent); +XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt(FXMVECTOR VFloat, uint32_t MulExponent); + +#if defined(__XNAMATH_H__) && 
defined(XMVectorSetBinaryConstant) +#undef XMVectorSetBinaryConstant +#undef XMVectorSplatConstant +#undef XMVectorSplatConstantInt +#endif + +XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3); +XMVECTOR XM_CALLCONV XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent); +XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant); + +/**************************************************************************** + * + * Load operations + * + ****************************************************************************/ + +XMVECTOR XM_CALLCONV XMLoadInt(_In_ const uint32_t* pSource); +XMVECTOR XM_CALLCONV XMLoadFloat(_In_ const float* pSource); + +XMVECTOR XM_CALLCONV XMLoadInt2(_In_reads_(2) const uint32_t* pSource); +XMVECTOR XM_CALLCONV XMLoadInt2A(_In_reads_(2) const uint32_t* PSource); +XMVECTOR XM_CALLCONV XMLoadFloat2(_In_ const XMFLOAT2* pSource); +XMVECTOR XM_CALLCONV XMLoadFloat2A(_In_ const XMFLOAT2A* pSource); +XMVECTOR XM_CALLCONV XMLoadSInt2(_In_ const XMINT2* pSource); +XMVECTOR XM_CALLCONV XMLoadUInt2(_In_ const XMUINT2* pSource); + +XMVECTOR XM_CALLCONV XMLoadInt3(_In_reads_(3) const uint32_t* pSource); +XMVECTOR XM_CALLCONV XMLoadInt3A(_In_reads_(3) const uint32_t* pSource); +XMVECTOR XM_CALLCONV XMLoadFloat3(_In_ const XMFLOAT3* pSource); +XMVECTOR XM_CALLCONV XMLoadFloat3A(_In_ const XMFLOAT3A* pSource); +XMVECTOR XM_CALLCONV XMLoadSInt3(_In_ const XMINT3* pSource); +XMVECTOR XM_CALLCONV XMLoadUInt3(_In_ const XMUINT3* pSource); + +XMVECTOR XM_CALLCONV XMLoadInt4(_In_reads_(4) const uint32_t* pSource); +XMVECTOR XM_CALLCONV XMLoadInt4A(_In_reads_(4) const uint32_t* pSource); +XMVECTOR XM_CALLCONV XMLoadFloat4(_In_ const XMFLOAT4* pSource); +XMVECTOR XM_CALLCONV XMLoadFloat4A(_In_ const XMFLOAT4A* pSource); +XMVECTOR XM_CALLCONV XMLoadSInt4(_In_ const XMINT4* pSource); +XMVECTOR XM_CALLCONV XMLoadUInt4(_In_ const XMUINT4* pSource); + +XMMATRIX XM_CALLCONV XMLoadFloat3x3(_In_ const XMFLOAT3X3* pSource); +XMMATRIX XM_CALLCONV XMLoadFloat4x3(_In_ const XMFLOAT4X3* pSource); +XMMATRIX XM_CALLCONV XMLoadFloat4x3A(_In_ const XMFLOAT4X3A* pSource); +XMMATRIX XM_CALLCONV XMLoadFloat4x4(_In_ const XMFLOAT4X4* pSource); +XMMATRIX XM_CALLCONV XMLoadFloat4x4A(_In_ const XMFLOAT4X4A* pSource); + +/**************************************************************************** + * + * Store operations + * + ****************************************************************************/ + +void XM_CALLCONV XMStoreInt(_Out_ uint32_t* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat(_Out_ float* pDestination, _In_ FXMVECTOR V); + +void XM_CALLCONV XMStoreInt2(_Out_writes_(2) uint32_t* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreInt2A(_Out_writes_(2) uint32_t* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat2(_Out_ XMFLOAT2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat2A(_Out_ XMFLOAT2A* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreSInt2(_Out_ XMINT2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUInt2(_Out_ XMUINT2* pDestination, _In_ FXMVECTOR V); + +void XM_CALLCONV XMStoreInt3(_Out_writes_(3) uint32_t* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreInt3A(_Out_writes_(3) uint32_t* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat3(_Out_ XMFLOAT3* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat3A(_Out_ XMFLOAT3A* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreSInt3(_Out_ XMINT3* pDestination, 
_In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUInt3(_Out_ XMUINT3* pDestination, _In_ FXMVECTOR V); + +void XM_CALLCONV XMStoreInt4(_Out_writes_(4) uint32_t* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreInt4A(_Out_writes_(4) uint32_t* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat4(_Out_ XMFLOAT4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat4A(_Out_ XMFLOAT4A* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreSInt4(_Out_ XMINT4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUInt4(_Out_ XMUINT4* pDestination, _In_ FXMVECTOR V); + +void XM_CALLCONV XMStoreFloat3x3(_Out_ XMFLOAT3X3* pDestination, _In_ FXMMATRIX M); +void XM_CALLCONV XMStoreFloat4x3(_Out_ XMFLOAT4X3* pDestination, _In_ FXMMATRIX M); +void XM_CALLCONV XMStoreFloat4x3A(_Out_ XMFLOAT4X3A* pDestination, _In_ FXMMATRIX M); +void XM_CALLCONV XMStoreFloat4x4(_Out_ XMFLOAT4X4* pDestination, _In_ FXMMATRIX M); +void XM_CALLCONV XMStoreFloat4x4A(_Out_ XMFLOAT4X4A* pDestination, _In_ FXMMATRIX M); + +/**************************************************************************** + * + * General vector operations + * + ****************************************************************************/ + +XMVECTOR XM_CALLCONV XMVectorZero(); +XMVECTOR XM_CALLCONV XMVectorSet(float x, float y, float z, float w); +XMVECTOR XM_CALLCONV XMVectorSetInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w); +XMVECTOR XM_CALLCONV XMVectorReplicate(float Value); +XMVECTOR XM_CALLCONV XMVectorReplicatePtr(_In_ const float *pValue); +XMVECTOR XM_CALLCONV XMVectorReplicateInt(uint32_t Value); +XMVECTOR XM_CALLCONV XMVectorReplicateIntPtr(_In_ const uint32_t *pValue); +XMVECTOR XM_CALLCONV XMVectorTrueInt(); +XMVECTOR XM_CALLCONV XMVectorFalseInt(); +XMVECTOR XM_CALLCONV XMVectorSplatX(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSplatY(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSplatZ(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSplatW(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSplatOne(); +XMVECTOR XM_CALLCONV XMVectorSplatInfinity(); +XMVECTOR XM_CALLCONV XMVectorSplatQNaN(); +XMVECTOR XM_CALLCONV XMVectorSplatEpsilon(); +XMVECTOR XM_CALLCONV XMVectorSplatSignMask(); + +float XM_CALLCONV XMVectorGetByIndex(FXMVECTOR V, size_t i); +float XM_CALLCONV XMVectorGetX(FXMVECTOR V); +float XM_CALLCONV XMVectorGetY(FXMVECTOR V); +float XM_CALLCONV XMVectorGetZ(FXMVECTOR V); +float XM_CALLCONV XMVectorGetW(FXMVECTOR V); + +void XM_CALLCONV XMVectorGetByIndexPtr(_Out_ float *f, _In_ FXMVECTOR V, _In_ size_t i); +void XM_CALLCONV XMVectorGetXPtr(_Out_ float *x, _In_ FXMVECTOR V); +void XM_CALLCONV XMVectorGetYPtr(_Out_ float *y, _In_ FXMVECTOR V); +void XM_CALLCONV XMVectorGetZPtr(_Out_ float *z, _In_ FXMVECTOR V); +void XM_CALLCONV XMVectorGetWPtr(_Out_ float *w, _In_ FXMVECTOR V); + +uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i); +uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V); +uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V); +uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V); +uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V); + +void XM_CALLCONV XMVectorGetIntByIndexPtr(_Out_ uint32_t *x, _In_ FXMVECTOR V, _In_ size_t i); +void XM_CALLCONV XMVectorGetIntXPtr(_Out_ uint32_t *x, _In_ FXMVECTOR V); +void XM_CALLCONV XMVectorGetIntYPtr(_Out_ uint32_t *y, _In_ FXMVECTOR V); +void XM_CALLCONV XMVectorGetIntZPtr(_Out_ uint32_t *z, _In_ FXMVECTOR V); +void XM_CALLCONV XMVectorGetIntWPtr(_Out_ uint32_t *w, _In_ FXMVECTOR V); + +XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V,float f, 
size_t i); +XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x); +XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y); +XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z); +XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w); + +XMVECTOR XM_CALLCONV XMVectorSetByIndexPtr(_In_ FXMVECTOR V, _In_ const float *f, _In_ size_t i); +XMVECTOR XM_CALLCONV XMVectorSetXPtr(_In_ FXMVECTOR V, _In_ const float *x); +XMVECTOR XM_CALLCONV XMVectorSetYPtr(_In_ FXMVECTOR V, _In_ const float *y); +XMVECTOR XM_CALLCONV XMVectorSetZPtr(_In_ FXMVECTOR V, _In_ const float *z); +XMVECTOR XM_CALLCONV XMVectorSetWPtr(_In_ FXMVECTOR V, _In_ const float *w); + +XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i); +XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x); +XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y); +XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z); +XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w); + +XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(_In_ FXMVECTOR V, _In_ const uint32_t *x, _In_ size_t i); +XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(_In_ FXMVECTOR V, _In_ const uint32_t *x); +XMVECTOR XM_CALLCONV XMVectorSetIntYPtr(_In_ FXMVECTOR V, _In_ const uint32_t *y); +XMVECTOR XM_CALLCONV XMVectorSetIntZPtr(_In_ FXMVECTOR V, _In_ const uint32_t *z); +XMVECTOR XM_CALLCONV XMVectorSetIntWPtr(_In_ FXMVECTOR V, _In_ const uint32_t *w); + +#if defined(__XNAMATH_H__) && defined(XMVectorSwizzle) +#undef XMVectorSwizzle +#endif + +XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3); +XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW); +XMVECTOR XM_CALLCONV XMVectorSelectControl(uint32_t VectorIndex0, uint32_t VectorIndex1, uint32_t VectorIndex2, uint32_t VectorIndex3); +XMVECTOR XM_CALLCONV XMVectorSelect(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Control); +XMVECTOR XM_CALLCONV XMVectorMergeXY(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorMergeZW(FXMVECTOR V1, FXMVECTOR V2); + +#if defined(__XNAMATH_H__) && defined(XMVectorShiftLeft) +#undef XMVectorShiftLeft +#undef XMVectorRotateLeft +#undef XMVectorRotateRight +#undef XMVectorInsert +#endif + +XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements); +XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements); +XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements); +XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, uint32_t VSLeftRotateElements, + uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3); + +XMVECTOR XM_CALLCONV XMVectorEqual(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorEqualR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorEqualInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorEqualIntR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V, _In_ FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorNearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon); +XMVECTOR XM_CALLCONV XMVectorNotEqual(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorNotEqualInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorGreater(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorGreaterR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV 
XMVectorGreaterOrEqualR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorLess(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorLessOrEqual(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorInBounds(FXMVECTOR V, FXMVECTOR Bounds); +XMVECTOR XM_CALLCONV XMVectorInBoundsR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V, _In_ FXMVECTOR Bounds); + +XMVECTOR XM_CALLCONV XMVectorIsNaN(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorIsInfinite(FXMVECTOR V); + +XMVECTOR XM_CALLCONV XMVectorMin(FXMVECTOR V1,FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorMax(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorRound(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorTruncate(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorFloor(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorCeiling(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorClamp(FXMVECTOR V, FXMVECTOR Min, FXMVECTOR Max); +XMVECTOR XM_CALLCONV XMVectorSaturate(FXMVECTOR V); + +XMVECTOR XM_CALLCONV XMVectorAndInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorAndCInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorOrInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorNorInt(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorXorInt(FXMVECTOR V1, FXMVECTOR V2); + +XMVECTOR XM_CALLCONV XMVectorNegate(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorAdd(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorSum(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorAddAngles(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorSubtract(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorSubtractAngles(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorMultiply(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorMultiplyAdd(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3); +XMVECTOR XM_CALLCONV XMVectorDivide(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3); +XMVECTOR XM_CALLCONV XMVectorScale(FXMVECTOR V, float ScaleFactor); +XMVECTOR XM_CALLCONV XMVectorReciprocalEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorReciprocal(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSqrtEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSqrt(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorExp2(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorExpE(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorExp(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorLog2(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorLogE(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorLog(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorPow(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorAbs(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorMod(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVectorModAngles(FXMVECTOR Angles); +XMVECTOR XM_CALLCONV XMVectorSin(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSinEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorCos(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorCosEst(FXMVECTOR V); +void XM_CALLCONV XMVectorSinCos(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, _In_ FXMVECTOR V); +void XM_CALLCONV XMVectorSinCosEst(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, _In_ FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorTan(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorTanEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorSinH(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorCosH(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorTanH(FXMVECTOR 
V); +XMVECTOR XM_CALLCONV XMVectorASin(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorASinEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorACos(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorACosEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorATan(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorATanEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVectorATan2(FXMVECTOR Y, FXMVECTOR X); +XMVECTOR XM_CALLCONV XMVectorATan2Est(FXMVECTOR Y, FXMVECTOR X); +XMVECTOR XM_CALLCONV XMVectorLerp(FXMVECTOR V0, FXMVECTOR V1, float t); +XMVECTOR XM_CALLCONV XMVectorLerpV(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR T); +XMVECTOR XM_CALLCONV XMVectorHermite(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, GXMVECTOR Tangent1, float t); +XMVECTOR XM_CALLCONV XMVectorHermiteV(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, GXMVECTOR Tangent1, HXMVECTOR T); +XMVECTOR XM_CALLCONV XMVectorCatmullRom(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, float t); +XMVECTOR XM_CALLCONV XMVectorCatmullRomV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, HXMVECTOR T); +XMVECTOR XM_CALLCONV XMVectorBaryCentric(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, float f, float g); +XMVECTOR XM_CALLCONV XMVectorBaryCentricV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR F, HXMVECTOR G); + +/**************************************************************************** + * + * 2D vector operations + * + ****************************************************************************/ + +bool XM_CALLCONV XMVector2Equal(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector2EqualR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2EqualInt(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector2EqualIntR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon); +bool XM_CALLCONV XMVector2NotEqual(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2NotEqualInt(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2Greater(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector2GreaterR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector2GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2Less(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2LessOrEqual(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector2InBounds(FXMVECTOR V, FXMVECTOR Bounds); + +bool XM_CALLCONV XMVector2IsNaN(FXMVECTOR V); +bool XM_CALLCONV XMVector2IsInfinite(FXMVECTOR V); + +XMVECTOR XM_CALLCONV XMVector2Dot(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector2Cross(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector2LengthSq(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2ReciprocalLength(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2LengthEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2Length(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2NormalizeEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2Normalize(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2ClampLength(FXMVECTOR V, float LengthMin, float LengthMax); +XMVECTOR XM_CALLCONV XMVector2ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax); +XMVECTOR XM_CALLCONV XMVector2Reflect(FXMVECTOR Incident, FXMVECTOR Normal); +XMVECTOR XM_CALLCONV XMVector2Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex); +XMVECTOR 
XM_CALLCONV XMVector2RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex); +XMVECTOR XM_CALLCONV XMVector2Orthogonal(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XM_CALLCONV XMVector2AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector2LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point); +XMVECTOR XM_CALLCONV XMVector2IntersectLine(FXMVECTOR Line1Point1, FXMVECTOR Line1Point2, FXMVECTOR Line2Point1, GXMVECTOR Line2Point2); +XMVECTOR XM_CALLCONV XMVector2Transform(FXMVECTOR V, FXMMATRIX M); +XMFLOAT4* XM_CALLCONV XMVector2TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) const XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M); +XMVECTOR XM_CALLCONV XMVector2TransformCoord(FXMVECTOR V, FXMMATRIX M); +XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream(_Out_writes_bytes_(sizeof(XMFLOAT2)+OutputStride*(VectorCount-1)) XMFLOAT2* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) const XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M); +XMVECTOR XM_CALLCONV XMVector2TransformNormal(FXMVECTOR V, FXMMATRIX M); +XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream(_Out_writes_bytes_(sizeof(XMFLOAT2)+OutputStride*(VectorCount-1)) XMFLOAT2* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) const XMFLOAT2* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M); + +/**************************************************************************** + * + * 3D vector operations + * + ****************************************************************************/ + +bool XM_CALLCONV XMVector3Equal(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector3EqualR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3EqualInt(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector3EqualIntR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon); +bool XM_CALLCONV XMVector3NotEqual(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3NotEqualInt(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3Greater(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector3GreaterR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector3GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3Less(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3LessOrEqual(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector3InBounds(FXMVECTOR V, FXMVECTOR Bounds); + +bool XM_CALLCONV XMVector3IsNaN(FXMVECTOR V); +bool XM_CALLCONV XMVector3IsInfinite(FXMVECTOR V); + +XMVECTOR XM_CALLCONV XMVector3Dot(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector3Cross(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector3LengthSq(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector3ReciprocalLength(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector3LengthEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector3Length(FXMVECTOR V); +XMVECTOR 
XM_CALLCONV XMVector3NormalizeEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector3Normalize(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector3ClampLength(FXMVECTOR V, float LengthMin, float LengthMax); +XMVECTOR XM_CALLCONV XMVector3ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax); +XMVECTOR XM_CALLCONV XMVector3Reflect(FXMVECTOR Incident, FXMVECTOR Normal); +XMVECTOR XM_CALLCONV XMVector3Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex); +XMVECTOR XM_CALLCONV XMVector3RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex); +XMVECTOR XM_CALLCONV XMVector3Orthogonal(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector3LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point); +void XM_CALLCONV XMVector3ComponentsFromNormal(_Out_ XMVECTOR* pParallel, _Out_ XMVECTOR* pPerpendicular, _In_ FXMVECTOR V, _In_ FXMVECTOR Normal); +XMVECTOR XM_CALLCONV XMVector3Rotate(FXMVECTOR V, FXMVECTOR RotationQuaternion); +XMVECTOR XM_CALLCONV XMVector3InverseRotate(FXMVECTOR V, FXMVECTOR RotationQuaternion); +XMVECTOR XM_CALLCONV XMVector3Transform(FXMVECTOR V, FXMMATRIX M); +XMFLOAT4* XM_CALLCONV XMVector3TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M); +XMVECTOR XM_CALLCONV XMVector3TransformCoord(FXMVECTOR V, FXMMATRIX M); +XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M); +XMVECTOR XM_CALLCONV XMVector3TransformNormal(FXMVECTOR V, FXMMATRIX M); +XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M); +XMVECTOR XM_CALLCONV XMVector3Project(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float ViewportHeight, float ViewportMinZ, float ViewportMaxZ, + FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World); +XMFLOAT3* XM_CALLCONV XMVector3ProjectStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, + _In_ float ViewportX, _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight, _In_ float ViewportMinZ, _In_ float ViewportMaxZ, + _In_ FXMMATRIX Projection, _In_ CXMMATRIX View, _In_ CXMMATRIX World); +XMVECTOR XM_CALLCONV XMVector3Unproject(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float ViewportHeight, float ViewportMinZ, float ViewportMaxZ, + FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World); +XMFLOAT3* XM_CALLCONV 
XMVector3UnprojectStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, + _In_ float ViewportX, _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight, _In_ float ViewportMinZ, _In_ float ViewportMaxZ, + _In_ FXMMATRIX Projection, _In_ CXMMATRIX View, _In_ CXMMATRIX World); + +/**************************************************************************** + * + * 4D vector operations + * + ****************************************************************************/ + +bool XM_CALLCONV XMVector4Equal(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector4EqualR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4EqualInt(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector4EqualIntR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon); +bool XM_CALLCONV XMVector4NotEqual(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4NotEqualInt(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4Greater(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector4GreaterR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2); +uint32_t XM_CALLCONV XMVector4GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4Less(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4LessOrEqual(FXMVECTOR V1, FXMVECTOR V2); +bool XM_CALLCONV XMVector4InBounds(FXMVECTOR V, FXMVECTOR Bounds); + +bool XM_CALLCONV XMVector4IsNaN(FXMVECTOR V); +bool XM_CALLCONV XMVector4IsInfinite(FXMVECTOR V); + +XMVECTOR XM_CALLCONV XMVector4Dot(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector4Cross(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3); +XMVECTOR XM_CALLCONV XMVector4LengthSq(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4ReciprocalLength(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4LengthEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4Length(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4NormalizeEst(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4Normalize(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4ClampLength(FXMVECTOR V, float LengthMin, float LengthMax); +XMVECTOR XM_CALLCONV XMVector4ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax); +XMVECTOR XM_CALLCONV XMVector4Reflect(FXMVECTOR Incident, FXMVECTOR Normal); +XMVECTOR XM_CALLCONV XMVector4Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex); +XMVECTOR XM_CALLCONV XMVector4RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex); +XMVECTOR XM_CALLCONV XMVector4Orthogonal(FXMVECTOR V); +XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2); +XMVECTOR XM_CALLCONV XMVector4AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR XM_CALLCONV XMVector4Transform(FXMVECTOR V, FXMMATRIX M); +XMFLOAT4* XM_CALLCONV XMVector4TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT4)+InputStride*(VectorCount-1)) const XMFLOAT4* pInputStream, + _In_ size_t InputStride, _In_ size_t VectorCount, _In_ FXMMATRIX M); + 
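Taken together, the load, operate, and store groups above form the library's basic calling pattern: application data lives in the XMFLOAT*/XMINT*/XMUINT* structs, is loaded into XMVECTOR registers, transformed there, and written back. A minimal sketch of that round trip, assuming only the declarations above (NormalizeInPlace is a hypothetical helper name, not part of the header):

#include <DirectXMath.h>
using namespace DirectX;

// Load a 3D vector from user memory, normalize it in registers,
// store the result back, and return the original length.
inline float NormalizeInPlace(XMFLOAT3& value)
{
    XMVECTOR v = XMLoadFloat3(&value);               // memory -> SIMD register
    float length = XMVectorGetX(XMVector3Length(v)); // length is replicated in every lane
    XMStoreFloat3(&value, XMVector3Normalize(v));    // register -> memory
    return length;
}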
+/**************************************************************************** + * + * Matrix operations + * + ****************************************************************************/ + +bool XM_CALLCONV XMMatrixIsNaN(FXMMATRIX M); +bool XM_CALLCONV XMMatrixIsInfinite(FXMMATRIX M); +bool XM_CALLCONV XMMatrixIsIdentity(FXMMATRIX M); + +XMMATRIX XM_CALLCONV XMMatrixMultiply(FXMMATRIX M1, CXMMATRIX M2); +XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose(FXMMATRIX M1, CXMMATRIX M2); +XMMATRIX XM_CALLCONV XMMatrixTranspose(FXMMATRIX M); +XMMATRIX XM_CALLCONV XMMatrixInverse(_Out_opt_ XMVECTOR* pDeterminant, _In_ FXMMATRIX M); +XMVECTOR XM_CALLCONV XMMatrixDeterminant(FXMMATRIX M); +_Success_(return) +bool XM_CALLCONV XMMatrixDecompose(_Out_ XMVECTOR *outScale, _Out_ XMVECTOR *outRotQuat, _Out_ XMVECTOR *outTrans, _In_ FXMMATRIX M); + +XMMATRIX XM_CALLCONV XMMatrixIdentity(); +XMMATRIX XM_CALLCONV XMMatrixSet(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33); +XMMATRIX XM_CALLCONV XMMatrixTranslation(float OffsetX, float OffsetY, float OffsetZ); +XMMATRIX XM_CALLCONV XMMatrixTranslationFromVector(FXMVECTOR Offset); +XMMATRIX XM_CALLCONV XMMatrixScaling(float ScaleX, float ScaleY, float ScaleZ); +XMMATRIX XM_CALLCONV XMMatrixScalingFromVector(FXMVECTOR Scale); +XMMATRIX XM_CALLCONV XMMatrixRotationX(float Angle); +XMMATRIX XM_CALLCONV XMMatrixRotationY(float Angle); +XMMATRIX XM_CALLCONV XMMatrixRotationZ(float Angle); +XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw(float Pitch, float Yaw, float Roll); +XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYawFromVector(FXMVECTOR Angles); +XMMATRIX XM_CALLCONV XMMatrixRotationNormal(FXMVECTOR NormalAxis, float Angle); +XMMATRIX XM_CALLCONV XMMatrixRotationAxis(FXMVECTOR Axis, float Angle); +XMMATRIX XM_CALLCONV XMMatrixRotationQuaternion(FXMVECTOR Quaternion); +XMMATRIX XM_CALLCONV XMMatrixTransformation2D(FXMVECTOR ScalingOrigin, float ScalingOrientation, FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, float Rotation, GXMVECTOR Translation); +XMMATRIX XM_CALLCONV XMMatrixTransformation(FXMVECTOR ScalingOrigin, FXMVECTOR ScalingOrientationQuaternion, FXMVECTOR Scaling, + GXMVECTOR RotationOrigin, HXMVECTOR RotationQuaternion, HXMVECTOR Translation); +XMMATRIX XM_CALLCONV XMMatrixAffineTransformation2D(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, float Rotation, FXMVECTOR Translation); +XMMATRIX XM_CALLCONV XMMatrixAffineTransformation(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, FXMVECTOR RotationQuaternion, GXMVECTOR Translation); +XMMATRIX XM_CALLCONV XMMatrixReflect(FXMVECTOR ReflectionPlane); +XMMATRIX XM_CALLCONV XMMatrixShadow(FXMVECTOR ShadowPlane, FXMVECTOR LightPosition); + +XMMATRIX XM_CALLCONV XMMatrixLookAtLH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection); +XMMATRIX XM_CALLCONV XMMatrixLookAtRH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection); +XMMATRIX XM_CALLCONV XMMatrixLookToLH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection); +XMMATRIX XM_CALLCONV XMMatrixLookToRH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection); +XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH(float FovAngleY, float AspectRatio, float NearZ, 
float FarZ); +XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH(float FovAngleY, float AspectRatio, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixOrthographicLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixOrthographicRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ); +XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ); + + +/**************************************************************************** + * + * Quaternion operations + * + ****************************************************************************/ + +bool XM_CALLCONV XMQuaternionEqual(FXMVECTOR Q1, FXMVECTOR Q2); +bool XM_CALLCONV XMQuaternionNotEqual(FXMVECTOR Q1, FXMVECTOR Q2); + +bool XM_CALLCONV XMQuaternionIsNaN(FXMVECTOR Q); +bool XM_CALLCONV XMQuaternionIsInfinite(FXMVECTOR Q); +bool XM_CALLCONV XMQuaternionIsIdentity(FXMVECTOR Q); + +XMVECTOR XM_CALLCONV XMQuaternionDot(FXMVECTOR Q1, FXMVECTOR Q2); +XMVECTOR XM_CALLCONV XMQuaternionMultiply(FXMVECTOR Q1, FXMVECTOR Q2); +XMVECTOR XM_CALLCONV XMQuaternionLengthSq(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionLength(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionNormalize(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionConjugate(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionInverse(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionLn(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionExp(FXMVECTOR Q); +XMVECTOR XM_CALLCONV XMQuaternionSlerp(FXMVECTOR Q0, FXMVECTOR Q1, float t); +XMVECTOR XM_CALLCONV XMQuaternionSlerpV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR T); +XMVECTOR XM_CALLCONV XMQuaternionSquad(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, float t); +XMVECTOR XM_CALLCONV XMQuaternionSquadV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, HXMVECTOR T); +void XM_CALLCONV XMQuaternionSquadSetup(_Out_ XMVECTOR* pA, _Out_ XMVECTOR* pB, _Out_ XMVECTOR* pC, _In_ FXMVECTOR Q0, _In_ FXMVECTOR Q1, _In_ FXMVECTOR Q2, _In_ GXMVECTOR Q3); +XMVECTOR XM_CALLCONV XMQuaternionBaryCentric(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, float f, float g); +XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR F, HXMVECTOR G); + +XMVECTOR XM_CALLCONV XMQuaternionIdentity(); +XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYaw(float Pitch, float Yaw, float Roll); +XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYawFromVector(FXMVECTOR Angles); +XMVECTOR XM_CALLCONV XMQuaternionRotationNormal(FXMVECTOR NormalAxis, float Angle); +XMVECTOR XM_CALLCONV XMQuaternionRotationAxis(FXMVECTOR Axis, float Angle); +XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix(FXMMATRIX M); + +void XM_CALLCONV XMQuaternionToAxisAngle(_Out_ XMVECTOR* pAxis, _Out_ float* pAngle, _In_ FXMVECTOR Q); + +/**************************************************************************** + * + * Plane operations + * + 
****************************************************************************/ + +bool XM_CALLCONV XMPlaneEqual(FXMVECTOR P1, FXMVECTOR P2); +bool XM_CALLCONV XMPlaneNearEqual(FXMVECTOR P1, FXMVECTOR P2, FXMVECTOR Epsilon); +bool XM_CALLCONV XMPlaneNotEqual(FXMVECTOR P1, FXMVECTOR P2); + +bool XM_CALLCONV XMPlaneIsNaN(FXMVECTOR P); +bool XM_CALLCONV XMPlaneIsInfinite(FXMVECTOR P); + +XMVECTOR XM_CALLCONV XMPlaneDot(FXMVECTOR P, FXMVECTOR V); +XMVECTOR XM_CALLCONV XMPlaneDotCoord(FXMVECTOR P, FXMVECTOR V); +XMVECTOR XM_CALLCONV XMPlaneDotNormal(FXMVECTOR P, FXMVECTOR V); +XMVECTOR XM_CALLCONV XMPlaneNormalizeEst(FXMVECTOR P); +XMVECTOR XM_CALLCONV XMPlaneNormalize(FXMVECTOR P); +XMVECTOR XM_CALLCONV XMPlaneIntersectLine(FXMVECTOR P, FXMVECTOR LinePoint1, FXMVECTOR LinePoint2); +void XM_CALLCONV XMPlaneIntersectPlane(_Out_ XMVECTOR* pLinePoint1, _Out_ XMVECTOR* pLinePoint2, _In_ FXMVECTOR P1, _In_ FXMVECTOR P2); +XMVECTOR XM_CALLCONV XMPlaneTransform(FXMVECTOR P, FXMMATRIX M); +XMFLOAT4* XM_CALLCONV XMPlaneTransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(PlaneCount-1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT4)+InputStride*(PlaneCount-1)) const XMFLOAT4* pInputStream, + _In_ size_t InputStride, _In_ size_t PlaneCount, _In_ FXMMATRIX M); + +XMVECTOR XM_CALLCONV XMPlaneFromPointNormal(FXMVECTOR Point, FXMVECTOR Normal); +XMVECTOR XM_CALLCONV XMPlaneFromPoints(FXMVECTOR Point1, FXMVECTOR Point2, FXMVECTOR Point3); + +/**************************************************************************** + * + * Color operations + * + ****************************************************************************/ + +bool XM_CALLCONV XMColorEqual(FXMVECTOR C1, FXMVECTOR C2); +bool XM_CALLCONV XMColorNotEqual(FXMVECTOR C1, FXMVECTOR C2); +bool XM_CALLCONV XMColorGreater(FXMVECTOR C1, FXMVECTOR C2); +bool XM_CALLCONV XMColorGreaterOrEqual(FXMVECTOR C1, FXMVECTOR C2); +bool XM_CALLCONV XMColorLess(FXMVECTOR C1, FXMVECTOR C2); +bool XM_CALLCONV XMColorLessOrEqual(FXMVECTOR C1, FXMVECTOR C2); + +bool XM_CALLCONV XMColorIsNaN(FXMVECTOR C); +bool XM_CALLCONV XMColorIsInfinite(FXMVECTOR C); + +XMVECTOR XM_CALLCONV XMColorNegative(FXMVECTOR C); +XMVECTOR XM_CALLCONV XMColorModulate(FXMVECTOR C1, FXMVECTOR C2); +XMVECTOR XM_CALLCONV XMColorAdjustSaturation(FXMVECTOR C, float Saturation); +XMVECTOR XM_CALLCONV XMColorAdjustContrast(FXMVECTOR C, float Contrast); + +XMVECTOR XM_CALLCONV XMColorRGBToHSL( FXMVECTOR rgb ); +XMVECTOR XM_CALLCONV XMColorHSLToRGB( FXMVECTOR hsl ); + +XMVECTOR XM_CALLCONV XMColorRGBToHSV( FXMVECTOR rgb ); +XMVECTOR XM_CALLCONV XMColorHSVToRGB( FXMVECTOR hsv ); + +XMVECTOR XM_CALLCONV XMColorRGBToYUV( FXMVECTOR rgb ); +XMVECTOR XM_CALLCONV XMColorYUVToRGB( FXMVECTOR yuv ); + +XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD( FXMVECTOR rgb ); +XMVECTOR XM_CALLCONV XMColorYUVToRGB_HD( FXMVECTOR yuv ); + +XMVECTOR XM_CALLCONV XMColorRGBToXYZ( FXMVECTOR rgb ); +XMVECTOR XM_CALLCONV XMColorXYZToRGB( FXMVECTOR xyz ); + +XMVECTOR XM_CALLCONV XMColorXYZToSRGB( FXMVECTOR xyz ); +XMVECTOR XM_CALLCONV XMColorSRGBToXYZ( FXMVECTOR srgb ); + +XMVECTOR XM_CALLCONV XMColorRGBToSRGB( FXMVECTOR rgb ); +XMVECTOR XM_CALLCONV XMColorSRGBToRGB( FXMVECTOR srgb ); + + +/**************************************************************************** + * + * Miscellaneous operations + * + ****************************************************************************/ + +bool XMVerifyCPUSupport(); + +XMVECTOR XM_CALLCONV XMFresnelTerm(FXMVECTOR CosIncidentAngle, FXMVECTOR 
RefractionIndex); + +bool XMScalarNearEqual(float S1, float S2, float Epsilon); +float XMScalarModAngle(float Value); + +float XMScalarSin(float Value); +float XMScalarSinEst(float Value); + +float XMScalarCos(float Value); +float XMScalarCosEst(float Value); + +void XMScalarSinCos(_Out_ float* pSin, _Out_ float* pCos, float Value); +void XMScalarSinCosEst(_Out_ float* pSin, _Out_ float* pCos, float Value); + +float XMScalarASin(float Value); +float XMScalarASinEst(float Value); + +float XMScalarACos(float Value); +float XMScalarACosEst(float Value); + +/**************************************************************************** + * + * Templates + * + ****************************************************************************/ + +#if defined(__XNAMATH_H__) && defined(XMMin) +#undef XMMin +#undef XMMax +#endif + +template<class T> inline T XMMin(T a, T b) { return (a < b) ? a : b; } +template<class T> inline T XMMax(T a, T b) { return (a > b) ? a : b; } + +//------------------------------------------------------------------------------ + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + +// PermuteHelper internal template (SSE only) +namespace Internal +{ + // Slow path fallback for permutes that do not map to a single SSE shuffle opcode. + template<uint32_t Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW> struct PermuteHelper + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) + { + static const XMVECTORU32 selectMask = + { + WhichX ? 0xFFFFFFFF : 0, + WhichY ? 0xFFFFFFFF : 0, + WhichZ ? 0xFFFFFFFF : 0, + WhichW ? 0xFFFFFFFF : 0, + }; + + XMVECTOR shuffled1 = XM_PERMUTE_PS(v1, Shuffle); + XMVECTOR shuffled2 = XM_PERMUTE_PS(v2, Shuffle); + + XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1); + XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2); + + return _mm_or_ps(masked1, masked2); + } + }; + + // Fast path for permutes that only read from the first vector. + template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, false, false> + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return XM_PERMUTE_PS(v1, Shuffle); } + }; + + // Fast path for permutes that only read from the second vector. + template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, true, true> + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2){ (v1); return XM_PERMUTE_PS(v2, Shuffle); } + }; + + // Fast path for permutes that read XY from the first vector, ZW from the second. + template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, true, true> + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v1, v2, Shuffle); } + }; + + // Fast path for permutes that read XY from the second vector, ZW from the first. + template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, false, false> + { + static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v2, v1, Shuffle); } + }; +}; + +#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_ +
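+// Usage sketch (illustrative, not part of the original header; assumes an SSE build): +// given a = (1,2,3,4) and b = (5,6,7,8), the compile-time permute below collapses to a +// single instruction via PermuteHelper: +// XMVectorPermute<0,1,4,5>(a, b) -> (1,2,5,6) // specialized to _mm_movelh_ps +// XMVectorPermute<3,0,7,4>(a, b) -> (4,1,8,5) // WhichZ/WhichW true: one _mm_shuffle_ps +// whereas the runtime XMVectorPermute(a, b, 3, 0, 7, 4) must build a control vector and +// blend two variable shuffles. +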
+// General permute template +template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW> + inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2) +{ + static_assert(PermuteX <= 7, "PermuteX template parameter out of range"); + static_assert(PermuteY <= 7, "PermuteY template parameter out of range"); + static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range"); + static_assert(PermuteW <= 7, "PermuteW template parameter out of range"); + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + const uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3); + + const bool WhichX = PermuteX > 3; + const bool WhichY = PermuteY > 3; + const bool WhichZ = PermuteZ > 3; + const bool WhichW = PermuteW > 3; + + return Internal::PermuteHelper<Shuffle, WhichX, WhichY, WhichZ, WhichW>::Permute(V1, V2); +#else + + return XMVectorPermute( V1, V2, PermuteX, PermuteY, PermuteZ, PermuteW ); + +#endif +} + +// Special-case permute templates +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; } + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_movelh_ps(V1,V2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<6,7,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_movehl_ps(V1,V2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,4,1,5>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_unpacklo_ps(V1,V2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,6,3,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_unpackhi_ps(V1,V2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(V1), _mm_castps_pd(V2))); } +#endif + +#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x1); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x3); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x4); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x5); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x6); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,6,3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x7); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x8); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0x9); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,2,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xA); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,5,2,7>(FXMVECTOR V1,
FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xB); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xC); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xD); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1,V2,0xE); } +#endif + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + +// If the indices are all in the range 0-3 or 4-7, then use XMVectorSwizzle instead +// The mirror cases are not spelled out here as the programmer can always swap the arguments +// (i.e. prefer permutes where the X element comes from the V1 vector instead of the V2 vector) + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vget_low_f32(V2) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,0,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vget_low_f32(V2) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vrev64_f32( vget_low_f32(V2) ) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,0,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vrev64_f32( vget_low_f32(V2) ) ); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_high_f32(V1), vget_high_f32(V2) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,2,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vget_high_f32(V2) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_high_f32(V1), vrev64_f32( vget_high_f32(V2) ) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,2,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vrev64_f32( vget_high_f32(V2) ) ); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vget_high_f32(V2) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,0,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vget_high_f32(V2) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,1,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vrev64_f32( vget_high_f32(V2) ) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,0,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vrev64_f32( vget_high_f32(V2) ) ); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,2,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vget_low_f32(V2) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_high_f32(V1), vrev64_f32( vget_low_f32(V2) ) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,2,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vrev64_f32( vget_low_f32(V2) ) ); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,4,2,6>(FXMVECTOR V1, FXMVECTOR V2) { return vtrnq_f32(V1,V2).val[0]; } +template<> inline XMVECTOR XM_CALLCONV 
XMVectorPermute<1,5,3,7>(FXMVECTOR V1, FXMVECTOR V2) { return vtrnq_f32(V1,V2).val[1]; } + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,4,1,5>(FXMVECTOR V1, FXMVECTOR V2) { return vzipq_f32(V1,V2).val[0]; } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,6,3,7>(FXMVECTOR V1, FXMVECTOR V2) { return vzipq_f32(V1,V2).val[1]; } + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0,2,4,6>(FXMVECTOR V1, FXMVECTOR V2) { return vuzpq_f32(V1,V2).val[0]; } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,3,5,7>(FXMVECTOR V1, FXMVECTOR V2) { return vuzpq_f32(V1,V2).val[1]; } + +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<1,2,3,4>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 1); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<2,3,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<3,4,5,6>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 3); } + +#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_ + +//------------------------------------------------------------------------------ + +// General swizzle template +template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW> + inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V) +{ + static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range"); + static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range"); + static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range"); + static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range"); + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + return XM_PERMUTE_PS( V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) ); +#else + + return XMVectorSwizzle( V, SwizzleX, SwizzleY, SwizzleZ, SwizzleW ); + +#endif +} + +// Specialized swizzles +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; } + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,0,1>(FXMVECTOR V) { return _mm_movelh_ps(V,V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,3,2,3>(FXMVECTOR V) { return _mm_movehl_ps(V,V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,1,1>(FXMVECTOR V) { return _mm_unpacklo_ps(V,V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,2,3,3>(FXMVECTOR V) { return _mm_unpackhi_ps(V,V); } +#endif + +#if defined(_XM_SSE3_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return _mm_moveldup_ps(V); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); } +#endif + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return vdupq_lane_f32( vget_low_f32(V), 0); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,1,1>(FXMVECTOR V) { return vdupq_lane_f32( vget_low_f32(V), 1); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,2,2,2>(FXMVECTOR V) { return vdupq_lane_f32( vget_high_f32(V), 0); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,3,3,3>(FXMVECTOR V) { return vdupq_lane_f32( vget_high_f32(V), 1); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,0,3,2>(FXMVECTOR V) { return vrev64q_f32(V); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,0,1>(FXMVECTOR V) { float32x2_t vt = vget_low_f32(V); return vcombine_f32( vt, vt ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,3,2,3>(FXMVECTOR V) { float32x2_t vt = vget_high_f32(V); return vcombine_f32( vt, vt ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,0,1,0>(FXMVECTOR V) { float32x2_t vt = vrev64_f32( vget_low_f32(V) ); return vcombine_f32( vt, vt ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,2,3,2>(FXMVECTOR V) { float32x2_t vt = vrev64_f32( vget_high_f32(V) ); return vcombine_f32( vt, vt ); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,1,3,2>(FXMVECTOR V) { return vcombine_f32( vget_low_f32(V), vrev64_f32( vget_high_f32(V) ) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,0,2,3>(FXMVECTOR V) { return vcombine_f32( vrev64_f32( vget_low_f32(V) ), vget_high_f32(V) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,3,1,0>(FXMVECTOR V) { return vcombine_f32( vget_high_f32(V), vrev64_f32( vget_low_f32(V) ) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,2,0,1>(FXMVECTOR V) { return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vget_low_f32(V) ); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,2,1,0>(FXMVECTOR V) { return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vrev64_f32( vget_low_f32(V) ) ); } + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return vtrnq_f32(V,V).val[0]; } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return vtrnq_f32(V,V).val[1]; } + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,1,1>(FXMVECTOR V) { return vzipq_f32(V,V).val[0]; } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,2,3,3>(FXMVECTOR V) { return vzipq_f32(V,V).val[1]; } + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,2,0,2>(FXMVECTOR V) { return vuzpq_f32(V,V).val[0]; } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,3,1,3>(FXMVECTOR V) { return vuzpq_f32(V,V).val[1]; } + +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,2,3,0>(FXMVECTOR V) { return vextq_f32(V, V, 1); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2,3,0,1>(FXMVECTOR V) { return vextq_f32(V, V, 2); } +template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3,0,1,2>(FXMVECTOR V) { return vextq_f32(V, V, 3); } + +#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
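+// Usage sketch (illustrative, not part of the original header): with v = (1,2,3,4), +// XMVectorSwizzle<3,2,1,0>(v) -> (4,3,2,1) // one XM_PERMUTE_PS on SSE +// XMVectorSwizzle<0,0,2,2>(v) -> (1,1,3,3) // _mm_moveldup_ps on SSE3, vtrnq_f32 on NEON +// The specializations above exist so common patterns map to cheaper opcodes than the +// generic shuffle.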
+ +//------------------------------------------------------------------------------ + +template<uint32_t Elements> + inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return XMVectorPermute<Elements, (Elements + 1), (Elements + 2), (Elements + 3)>(V1, V2); +} + +template<uint32_t Elements> + inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return XMVectorSwizzle<Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3>(V); +} + +template<uint32_t Elements> + inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); + return XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V); +} + +template<uint32_t VSLeftRotateElements, uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3> + inline XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS) +{ + XMVECTOR Control = XMVectorSelectControl(Select0&1, Select1&1, Select2&1, Select3&1); + return XMVectorSelect( VD, XMVectorRotateLeft<VSLeftRotateElements>(VS), Control ); +} + +/**************************************************************************** + * + * Globals + * + ****************************************************************************/ + +// The purpose of the following global constants is to
prevent redundant +// reloading of the constants when they are referenced by more than one +// separate inline math routine called within the same function. Declaring +// a constant locally within a routine is sufficient to prevent redundant +// reloads of that constant when that single routine is called multiple +// times in a function, but if the constant is used (and declared) in a +// separate math routine it would be reloaded. + +#ifndef XMGLOBALCONST +#define XMGLOBALCONST extern const __declspec(selectany) +#endif + +XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients0 = {-0.16666667f, +0.0083333310f, -0.00019840874f, +2.7525562e-06f}; +XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients1 = {-2.3889859e-08f, -0.16665852f /*Est1*/, +0.0083139502f /*Est2*/, -0.00018524670f /*Est3*/}; +XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients0 = {-0.5f, +0.041666638f, -0.0013888378f, +2.4760495e-05f}; +XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients1 = {-2.6051615e-07f, -0.49992746f /*Est1*/, +0.041493919f /*Est2*/, -0.0012712436f /*Est3*/}; +XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients0 = {1.0f, 0.333333333f, 0.133333333f, 5.396825397e-2f}; +XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients1 = {2.186948854e-2f, 8.863235530e-3f, 3.592128167e-3f, 1.455834485e-3f}; +XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients2 = {5.900274264e-4f, 2.391290764e-4f, 9.691537707e-5f, 3.927832950e-5f}; +XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients0 = {+1.5707963050f, -0.2145988016f, +0.0889789874f, -0.0501743046f}; +XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients1 = {+0.0308918810f, -0.0170881256f, +0.0066700901f, -0.0012624911f}; +XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients0 = {-0.3333314528f, +0.1999355085f, -0.1420889944f, +0.1065626393f}; +XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients1 = {-0.0752896400f, +0.0429096138f, -0.0161657367f, +0.0028662257f}; +XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients0 = {+0.999866f, +0.999866f, +0.999866f, +0.999866f}; +XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients1 = {-0.3302995f, +0.180141f, -0.085133f, +0.0208351f}; +XMGLOBALCONST XMVECTORF32 g_XMTanEstCoefficients = {2.484f, -1.954923183e-1f, 2.467401101f, XM_1DIVPI}; +XMGLOBALCONST XMVECTORF32 g_XMArcEstCoefficients = {+1.5707288f,-0.2121144f,+0.0742610f,-0.0187293f}; +XMGLOBALCONST XMVECTORF32 g_XMPiConstants0 = {XM_PI, XM_2PI, XM_1DIVPI, XM_1DIV2PI}; +XMGLOBALCONST XMVECTORF32 g_XMIdentityR0 = {1.0f, 0.0f, 0.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMIdentityR1 = {0.0f, 1.0f, 0.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMIdentityR2 = {0.0f, 0.0f, 1.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMIdentityR3 = {0.0f, 0.0f, 0.0f, 1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR0 = {-1.0f,0.0f, 0.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR1 = {0.0f,-1.0f, 0.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR2 = {0.0f, 0.0f,-1.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR3 = {0.0f, 0.0f, 0.0f,-1.0f}; +XMGLOBALCONST XMVECTORU32 g_XMNegativeZero = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; +XMGLOBALCONST XMVECTORU32 g_XMNegate3 = {0x80000000, 0x80000000, 0x80000000, 0x00000000}; +XMGLOBALCONST XMVECTORU32 g_XMMaskXY = {0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000}; +XMGLOBALCONST XMVECTORU32 g_XMMask3 = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000}; +XMGLOBALCONST XMVECTORU32 g_XMMaskX = {0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000}; +XMGLOBALCONST XMVECTORU32 g_XMMaskY = {0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000}; +XMGLOBALCONST XMVECTORU32 g_XMMaskZ = {0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000}; 
+XMGLOBALCONST XMVECTORU32 g_XMMaskW = {0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF}; +XMGLOBALCONST XMVECTORF32 g_XMOne = { 1.0f, 1.0f, 1.0f, 1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMOne3 = { 1.0f, 1.0f, 1.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMZero = { 0.0f, 0.0f, 0.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMTwo = { 2.f, 2.f, 2.f, 2.f }; +XMGLOBALCONST XMVECTORF32 g_XMFour = { 4.f, 4.f, 4.f, 4.f }; +XMGLOBALCONST XMVECTORF32 g_XMSix = { 6.f, 6.f, 6.f, 6.f }; +XMGLOBALCONST XMVECTORF32 g_XMNegativeOne = {-1.0f,-1.0f,-1.0f,-1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMOneHalf = { 0.5f, 0.5f, 0.5f, 0.5f}; +XMGLOBALCONST XMVECTORF32 g_XMNegativeOneHalf = {-0.5f,-0.5f,-0.5f,-0.5f}; +XMGLOBALCONST XMVECTORF32 g_XMNegativeTwoPi = {-XM_2PI, -XM_2PI, -XM_2PI, -XM_2PI}; +XMGLOBALCONST XMVECTORF32 g_XMNegativePi = {-XM_PI, -XM_PI, -XM_PI, -XM_PI}; +XMGLOBALCONST XMVECTORF32 g_XMHalfPi = {XM_PIDIV2, XM_PIDIV2, XM_PIDIV2, XM_PIDIV2}; +XMGLOBALCONST XMVECTORF32 g_XMPi = {XM_PI, XM_PI, XM_PI, XM_PI}; +XMGLOBALCONST XMVECTORF32 g_XMReciprocalPi = {XM_1DIVPI, XM_1DIVPI, XM_1DIVPI, XM_1DIVPI}; +XMGLOBALCONST XMVECTORF32 g_XMTwoPi = {XM_2PI, XM_2PI, XM_2PI, XM_2PI}; +XMGLOBALCONST XMVECTORF32 g_XMReciprocalTwoPi = {XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI}; +XMGLOBALCONST XMVECTORF32 g_XMEpsilon = {1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f}; +XMGLOBALCONST XMVECTORI32 g_XMInfinity = {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000}; +XMGLOBALCONST XMVECTORI32 g_XMQNaN = {0x7FC00000, 0x7FC00000, 0x7FC00000, 0x7FC00000}; +XMGLOBALCONST XMVECTORI32 g_XMQNaNTest = {0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF}; +XMGLOBALCONST XMVECTORI32 g_XMAbsMask = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; +XMGLOBALCONST XMVECTORI32 g_XMFltMin = {0x00800000, 0x00800000, 0x00800000, 0x00800000}; +XMGLOBALCONST XMVECTORI32 g_XMFltMax = {0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF}; +XMGLOBALCONST XMVECTORU32 g_XMNegOneMask = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}; +XMGLOBALCONST XMVECTORU32 g_XMMaskA8R8G8B8 = {0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000}; +XMGLOBALCONST XMVECTORU32 g_XMFlipA8R8G8B8 = {0x00000000, 0x00000000, 0x00000000, 0x80000000}; +XMGLOBALCONST XMVECTORF32 g_XMFixAA8R8G8B8 = {0.0f,0.0f,0.0f,(float)(0x80000000U)}; +XMGLOBALCONST XMVECTORF32 g_XMNormalizeA8R8G8B8 = {1.0f/(255.0f*(float)(0x10000)),1.0f/(255.0f*(float)(0x100)),1.0f/255.0f,1.0f/(255.0f*(float)(0x1000000))}; +XMGLOBALCONST XMVECTORU32 g_XMMaskA2B10G10R10 = {0x000003FF, 0x000FFC00, 0x3FF00000, 0xC0000000}; +XMGLOBALCONST XMVECTORU32 g_XMFlipA2B10G10R10 = {0x00000200, 0x00080000, 0x20000000, 0x80000000}; +XMGLOBALCONST XMVECTORF32 g_XMFixAA2B10G10R10 = {-512.0f,-512.0f*(float)(0x400),-512.0f*(float)(0x100000),(float)(0x80000000U)}; +XMGLOBALCONST XMVECTORF32 g_XMNormalizeA2B10G10R10 = {1.0f/511.0f,1.0f/(511.0f*(float)(0x400)),1.0f/(511.0f*(float)(0x100000)),1.0f/(3.0f*(float)(0x40000000))}; +XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16 = {0x0000FFFF, 0xFFFF0000, 0x00000000, 0x00000000}; +XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16 = {0x00008000, 0x00000000, 0x00000000, 0x00000000}; +XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16 = {-32768.0f,0.0f,0.0f,0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16 = {1.0f/32767.0f,1.0f/(32767.0f*65536.0f),0.0f,0.0f}; +XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16Z16W16 = {0x0000FFFF, 0x0000FFFF, 0xFFFF0000, 0xFFFF0000}; +XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16Z16W16 = {0x00008000, 0x00008000, 0x00000000, 0x00000000}; +XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16Z16W16 = 
{-32768.0f,-32768.0f,0.0f,0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16Z16W16 = {1.0f/32767.0f,1.0f/32767.0f,1.0f/(32767.0f*65536.0f),1.0f/(32767.0f*65536.0f)}; +XMGLOBALCONST XMVECTORF32 g_XMNoFraction = {8388608.0f,8388608.0f,8388608.0f,8388608.0f}; +XMGLOBALCONST XMVECTORI32 g_XMMaskByte = {0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF}; +XMGLOBALCONST XMVECTORF32 g_XMNegateX = {-1.0f, 1.0f, 1.0f, 1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegateY = { 1.0f,-1.0f, 1.0f, 1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegateZ = { 1.0f, 1.0f,-1.0f, 1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegateW = { 1.0f, 1.0f, 1.0f,-1.0f}; +XMGLOBALCONST XMVECTORU32 g_XMSelect0101 = {XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1}; +XMGLOBALCONST XMVECTORU32 g_XMSelect1010 = {XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0}; +XMGLOBALCONST XMVECTORI32 g_XMOneHalfMinusEpsilon = { 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD}; +XMGLOBALCONST XMVECTORU32 g_XMSelect1000 = {XM_SELECT_1, XM_SELECT_0, XM_SELECT_0, XM_SELECT_0}; +XMGLOBALCONST XMVECTORU32 g_XMSelect1100 = {XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0}; +XMGLOBALCONST XMVECTORU32 g_XMSelect1110 = {XM_SELECT_1, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0}; +XMGLOBALCONST XMVECTORU32 g_XMSelect1011 = { XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1 }; +XMGLOBALCONST XMVECTORF32 g_XMFixupY16 = {1.0f,1.0f/65536.0f,0.0f,0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMFixupY16W16 = {1.0f,1.0f,1.0f/65536.0f,1.0f/65536.0f}; +XMGLOBALCONST XMVECTORU32 g_XMFlipY = {0,0x80000000,0,0}; +XMGLOBALCONST XMVECTORU32 g_XMFlipZ = {0,0,0x80000000,0}; +XMGLOBALCONST XMVECTORU32 g_XMFlipW = {0,0,0,0x80000000}; +XMGLOBALCONST XMVECTORU32 g_XMFlipYZ = {0,0x80000000,0x80000000,0}; +XMGLOBALCONST XMVECTORU32 g_XMFlipZW = {0,0,0x80000000,0x80000000}; +XMGLOBALCONST XMVECTORU32 g_XMFlipYW = {0,0x80000000,0,0x80000000}; +XMGLOBALCONST XMVECTORI32 g_XMMaskDec4 = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30}; +XMGLOBALCONST XMVECTORI32 g_XMXorDec4 = {0x200,0x200<<10,0x200<<20,0}; +XMGLOBALCONST XMVECTORF32 g_XMAddUDec4 = {0,0,0,32768.0f*65536.0f}; +XMGLOBALCONST XMVECTORF32 g_XMAddDec4 = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,0}; +XMGLOBALCONST XMVECTORF32 g_XMMulDec4 = {1.0f,1.0f/1024.0f,1.0f/(1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)}; +XMGLOBALCONST XMVECTORU32 g_XMMaskByte4 = {0xFF,0xFF00,0xFF0000,0xFF000000}; +XMGLOBALCONST XMVECTORI32 g_XMXorByte4 = {0x80,0x8000,0x800000,0x00000000}; +XMGLOBALCONST XMVECTORF32 g_XMAddByte4 = {-128.0f,-128.0f*256.0f,-128.0f*65536.0f,0}; +XMGLOBALCONST XMVECTORF32 g_XMFixUnsigned = {32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f}; +XMGLOBALCONST XMVECTORF32 g_XMMaxInt = {65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f}; +XMGLOBALCONST XMVECTORF32 g_XMMaxUInt = {65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f}; +XMGLOBALCONST XMVECTORF32 g_XMUnsignedFix = {32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f}; +XMGLOBALCONST XMVECTORF32 g_XMsrgbScale = { 12.92f, 12.92f, 12.92f, 1.0f }; +XMGLOBALCONST XMVECTORF32 g_XMsrgbA = { 0.055f, 0.055f, 0.055f, 0.0f }; +XMGLOBALCONST XMVECTORF32 g_XMsrgbA1 = { 1.055f, 1.055f, 1.055f, 1.0f }; +XMGLOBALCONST XMVECTORI32 g_XMExponentBias = {127, 127, 127, 127}; +XMGLOBALCONST XMVECTORI32 g_XMSubnormalExponent = {-126, -126, -126, -126}; +XMGLOBALCONST XMVECTORI32 g_XMNumTrailing = {23, 23, 23, 23}; +XMGLOBALCONST XMVECTORI32 g_XMMinNormal = {0x00800000, 
0x00800000, 0x00800000, 0x00800000}; +XMGLOBALCONST XMVECTORU32 g_XMNegInfinity = {0xFF800000, 0xFF800000, 0xFF800000, 0xFF800000}; +XMGLOBALCONST XMVECTORU32 g_XMNegQNaN = {0xFFC00000, 0xFFC00000, 0xFFC00000, 0xFFC00000}; +XMGLOBALCONST XMVECTORI32 g_XMBin128 = {0x43000000, 0x43000000, 0x43000000, 0x43000000}; +XMGLOBALCONST XMVECTORU32 g_XMBinNeg150 = {0xC3160000, 0xC3160000, 0xC3160000, 0xC3160000}; +XMGLOBALCONST XMVECTORI32 g_XM253 = {253, 253, 253, 253}; +XMGLOBALCONST XMVECTORF32 g_XMExpEst1 = {-6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f}; +XMGLOBALCONST XMVECTORF32 g_XMExpEst2 = {+2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f}; +XMGLOBALCONST XMVECTORF32 g_XMExpEst3 = {-5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f}; +XMGLOBALCONST XMVECTORF32 g_XMExpEst4 = {+9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f}; +XMGLOBALCONST XMVECTORF32 g_XMExpEst5 = {-1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f}; +XMGLOBALCONST XMVECTORF32 g_XMExpEst6 = {+1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f}; +XMGLOBALCONST XMVECTORF32 g_XMExpEst7 = {-1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f}; +XMGLOBALCONST XMVECTORF32 g_XMLogEst0 = {+1.442693f, +1.442693f, +1.442693f, +1.442693f}; +XMGLOBALCONST XMVECTORF32 g_XMLogEst1 = {-0.721242f, -0.721242f, -0.721242f, -0.721242f}; +XMGLOBALCONST XMVECTORF32 g_XMLogEst2 = {+0.479384f, +0.479384f, +0.479384f, +0.479384f}; +XMGLOBALCONST XMVECTORF32 g_XMLogEst3 = {-0.350295f, -0.350295f, -0.350295f, -0.350295f}; +XMGLOBALCONST XMVECTORF32 g_XMLogEst4 = {+0.248590f, +0.248590f, +0.248590f, +0.248590f}; +XMGLOBALCONST XMVECTORF32 g_XMLogEst5 = {-0.145700f, -0.145700f, -0.145700f, -0.145700f}; +XMGLOBALCONST XMVECTORF32 g_XMLogEst6 = {+0.057148f, +0.057148f, +0.057148f, +0.057148f}; +XMGLOBALCONST XMVECTORF32 g_XMLogEst7 = {-0.010578f, -0.010578f, -0.010578f, -0.010578f}; +XMGLOBALCONST XMVECTORF32 g_XMLgE = {+1.442695f, +1.442695f, +1.442695f, +1.442695f}; +XMGLOBALCONST XMVECTORF32 g_XMInvLgE = {+6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f}; +XMGLOBALCONST XMVECTORF32 g_UByteMax = {255.0f, 255.0f, 255.0f, 255.0f}; +XMGLOBALCONST XMVECTORF32 g_ByteMin = {-127.0f, -127.0f, -127.0f, -127.0f}; +XMGLOBALCONST XMVECTORF32 g_ByteMax = {127.0f, 127.0f, 127.0f, 127.0f}; +XMGLOBALCONST XMVECTORF32 g_ShortMin = {-32767.0f, -32767.0f, -32767.0f, -32767.0f}; +XMGLOBALCONST XMVECTORF32 g_ShortMax = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; +XMGLOBALCONST XMVECTORF32 g_UShortMax = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; + +/**************************************************************************** + * + * Implementation + * + ****************************************************************************/ + +#pragma warning(push) +#pragma warning(disable:4068 4214 4204 4365 4616 4640 6001 6101) +// C4068/4616: ignore unknown pragmas +// C4214/4204: nonstandard extension used +// C4365/4640: Off by default noise +// C6001/6101: False positives + +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = (0-(C0&1)) & 0x3F800000; + vResult.u[1] = (0-(C1&1)) & 0x3F800000; + vResult.u[2] = (0-(C2&1)) & 0x3F800000; + vResult.u[3] = 
(0-(C3&1)) & 0x3F800000; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = (0-(C0&1)) & 0x3F800000; + vResult.u[1] = (0-(C1&1)) & 0x3F800000; + vResult.u[2] = (0-(C2&1)) & 0x3F800000; + vResult.u[3] = (0-(C3&1)) & 0x3F800000; + return vResult.v; +#else // XM_SSE_INTRINSICS_ + static const XMVECTORU32 g_vMask1 = {1,1,1,1}; + // Move the parms to a vector + __m128i vTemp = _mm_set_epi32(C3,C2,C1,C0); + // Mask off the low bits + vTemp = _mm_and_si128(vTemp,g_vMask1); + // 0xFFFFFFFF on true bits + vTemp = _mm_cmpeq_epi32(vTemp,g_vMask1); + // 0xFFFFFFFF -> 1.0f, 0x00000000 -> 0.0f + vTemp = _mm_and_si128(vTemp,g_XMOne); + return _mm_castsi128_ps(vTemp); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent) +{ + assert( IntConstant >= -16 && IntConstant <= 15 ); + assert( DivExponent < 32 ); +#if defined(_XM_NO_INTRINSICS_) + + using DirectX::XMConvertVectorIntToFloat; + + XMVECTORI32 V = { IntConstant, IntConstant, IntConstant, IntConstant }; + return XMConvertVectorIntToFloat( V.v, DivExponent); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Splat the int + int32x4_t vScale = vdupq_n_s32(IntConstant); + // Convert to a float + XMVECTOR vResult = vcvtq_f32_s32(vScale); + // Convert DivExponent into 1.0f/(1<<DivExponent) + uint32_t uScale = 0x3F800000U - (DivExponent << 23); + // Splat the scalar value (it's really a float) + vScale = vreinterpretq_s32_u32(vdupq_n_u32(uScale)); + // Multiply by the reciprocal (performs a right shift by DivExponent) + vResult = vmulq_f32(vResult,reinterpret_cast<const float32x4_t*>(&vScale)[0]); + return vResult; +#else // XM_SSE_INTRINSICS_ + // Splat the int + __m128i vScale = _mm_set1_epi32(IntConstant); + // Convert to a float + XMVECTOR vResult = _mm_cvtepi32_ps(vScale); + // Convert DivExponent into 1.0f/(1<<DivExponent) + uint32_t uScale = 0x3F800000U - (DivExponent << 23); + // Splat the scalar value (it's really a float) + vScale = _mm_set1_epi32(static_cast<int>(uScale)); + // Multiply by the reciprocal (performs a right shift by DivExponent) + vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant) +{ + assert( IntConstant >= -16 && IntConstant <= 15 ); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORI32 V = { IntConstant, IntConstant, IntConstant, IntConstant }; + return V.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t V = vdupq_n_s32( IntConstant ); + return reinterpret_cast<XMVECTOR*>(&V)[0]; +#else // XM_SSE_INTRINSICS_ + __m128i V = _mm_set1_epi32( IntConstant ); + return _mm_castsi128_ps(V); +#endif +}
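+ +// Usage sketch (illustrative, not part of the original header): XMVectorSplatConstant +// builds small fixed-point constants without a memory load. Subtracting +// (DivExponent << 23) from 0x3F800000 (the bit pattern of 1.0f) decrements the float +// exponent field, producing 1.0f/(1<<DivExponent); multiplying by it performs the divide: +// XMVectorSplatConstant(3, 8) -> (0.01171875, 0.01171875, 0.01171875, 0.01171875) // 3/256 +// XMVectorSplatConstantInt(-16) -> (-16, -16, -16, -16) // raw integer lanes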
+ +#include "DirectXMathConvert.inl" +#include "DirectXMathVector.inl" +#include "DirectXMathMatrix.inl" +#include "DirectXMathMisc.inl" + +#pragma prefast(pop) +#pragma warning(pop) + +}; // namespace DirectX + diff --git a/Inc/DirectXMathConvert.inl b/Inc/DirectXMathConvert.inl index 342397b..c7ab705 100644 --- a/Inc/DirectXMathConvert.inl +++ b/Inc/DirectXMathConvert.inl @@ -1,1899 +1,1899 @@ -//------------------------------------------------------------------------------------- -// DirectXMathConvert.inl -- SIMD C++ Math library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#pragma once - -/**************************************************************************** - * - * Data conversion - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -#pragma warning(push) -#pragma warning(disable:4701) -// C4701: false positives - -inline XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat -( - FXMVECTOR VInt, - uint32_t DivExponent -) -{ - assert(DivExponent<32); -#if defined(_XM_NO_INTRINSICS_) - float fScale = 1.0f / (float)(1U << DivExponent); - uint32_t ElementIndex = 0; - XMVECTOR Result; - do { - int32_t iTemp = (int32_t)VInt.vector4_u32[ElementIndex]; - Result.vector4_f32[ElementIndex] = ((float)iTemp) * fScale; - } while (++ElementIndex<4); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float fScale = 1.0f / (float)(1U << DivExponent); - float32x4_t vResult = vcvtq_f32_s32( VInt ); - return vmulq_n_f32( vResult, fScale ); -#else // _XM_SSE_INTRINSICS_ - // Convert to floats - XMVECTOR vResult = _mm_cvtepi32_ps(_mm_castps_si128(VInt)); - // Convert DivExponent into 1.0f/(1<<DivExponent) - uint32_t uScale = 0x3F800000U - (DivExponent << 23); - // Splat the scalar value - __m128i vScale = _mm_set1_epi32(static_cast<int>(uScale)); - vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale)); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt -( - FXMVECTOR VFloat, - uint32_t MulExponent -) -{ - assert(MulExponent<32); -#if defined(_XM_NO_INTRINSICS_) - // Get the scalar factor. - float fScale = (float)(1U << MulExponent); - uint32_t ElementIndex = 0; - XMVECTOR Result; - do { - int32_t iResult; - float fTemp = VFloat.vector4_f32[ElementIndex]*fScale; - if (fTemp <= -(65536.0f*32768.0f)) { - iResult = (-0x7FFFFFFF)-1; - } else if (fTemp > (65536.0f*32768.0f)-128.0f) { - iResult = 0x7FFFFFFF; - } else { - iResult = (int32_t)fTemp; - } - Result.vector4_u32[ElementIndex] = (uint32_t)iResult; - } while (++ElementIndex<4); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t vResult = vmulq_n_f32(VFloat, (float)(1U << MulExponent)); - // In case of positive overflow, detect it - uint32x4_t vOverflow = vcgtq_f32(vResult,g_XMMaxInt); - // Float to int conversion - int32x4_t vResulti = vcvtq_s32_f32(vResult); - // If there was positive overflow, set to 0x7FFFFFFF - vResult = vandq_u32(vOverflow,g_XMAbsMask); - vOverflow = vbicq_u32(vResulti,vOverflow); - vOverflow = vorrq_u32(vOverflow,vResult); - return vOverflow; -#else // _XM_SSE_INTRINSICS_ - XMVECTOR vResult = _mm_set_ps1((float)(1U << MulExponent)); - vResult = _mm_mul_ps(vResult,VFloat); - // In case of positive overflow, detect it - XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxInt); - // Float to int conversion - __m128i vResulti = _mm_cvttps_epi32(vResult); - // If there was positive overflow, set to 0x7FFFFFFF - vResult = _mm_and_ps(vOverflow,g_XMAbsMask); - vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); - vOverflow = _mm_or_ps(vOverflow,vResult); - return vOverflow; -#endif -} - -//------------------------------------------------------------------------------ -
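-// Behavior sketch (illustrative, not from the original source): these routines treat -// vectors as fixed point. XMConvertVectorIntToFloat on {512,512,512,512} with -// DivExponent = 8 yields 2.0f per lane (512/256); XMConvertVectorFloatToInt saturates, -// clamping inputs at or below -2^31 to INT32_MIN and inputs above 2^31-128 to -// 0x7FFFFFFF before truncating. -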
-inline XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat -( - FXMVECTOR VUInt, - uint32_t DivExponent -) -{ - assert(DivExponent<32); -#if defined(_XM_NO_INTRINSICS_) - float fScale = 1.0f / (float)(1U << DivExponent); - uint32_t ElementIndex = 0; - XMVECTOR Result; - do { - Result.vector4_f32[ElementIndex] = (float)VUInt.vector4_u32[ElementIndex] * fScale; - } while (++ElementIndex<4); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float fScale = 1.0f / (float)(1U << DivExponent); - float32x4_t vResult = vcvtq_f32_u32( VUInt ); - return vmulq_n_f32( vResult, fScale ); -#else // _XM_SSE_INTRINSICS_ - // For the values that are higher than 0x7FFFFFFF, a fixup is needed - // Determine which ones need the fix. - XMVECTOR vMask = _mm_and_ps(VUInt,g_XMNegativeZero); - // Force all values positive - XMVECTOR vResult = _mm_xor_ps(VUInt,vMask); - // Convert to floats - vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); - // Convert 0x80000000 -> 0xFFFFFFFF - __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); - // For only the ones that are too big, add the fixup - vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); - vResult = _mm_add_ps(vResult,vMask); - // Convert DivExponent into 1.0f/(1<<DivExponent) - uint32_t uScale = 0x3F800000U - (DivExponent << 23); - // Splat the scalar value - __m128i vScale = _mm_set1_epi32(static_cast<int>(uScale)); - vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale)); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt -( - FXMVECTOR VFloat, - uint32_t MulExponent -) -{ - assert(MulExponent<32); -#if defined(_XM_NO_INTRINSICS_) - // Get the scalar factor. - float fScale = (float)(1U << MulExponent); - uint32_t ElementIndex = 0; - XMVECTOR Result; - do { - uint32_t uResult; - float fTemp = VFloat.vector4_f32[ElementIndex]*fScale; - if (fTemp <= 0.0f) { - uResult = 0; - } else if (fTemp >= (65536.0f*65536.0f)) { - uResult = 0xFFFFFFFFU; - } else { - uResult = (uint32_t)fTemp; - } - Result.vector4_u32[ElementIndex] = uResult; - } while (++ElementIndex<4); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t vResult = vmulq_n_f32(VFloat,(float)(1U << MulExponent)); - // In case of overflow, detect it - uint32x4_t vOverflow = vcgtq_f32(vResult,g_XMMaxUInt); - // Float to int conversion - uint32x4_t vResulti = vcvtq_u32_f32(vResult); - // If there was overflow, set to 0xFFFFFFFFU - vResult = vbicq_u32(vResulti,vOverflow); - vOverflow = vorrq_u32(vOverflow,vResult); - return vOverflow; -#else // _XM_SSE_INTRINSICS_ - XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent)); - vResult = _mm_mul_ps(vResult,VFloat); - // Clamp to >=0 - vResult = _mm_max_ps(vResult,g_XMZero); - // Any numbers that are too big, set to 0xFFFFFFFFU - XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); - XMVECTOR vValue = g_XMUnsignedFix; - // Too large for a signed integer? - XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); - // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise - vValue = _mm_and_ps(vValue,vMask); - // Perform fixup only on numbers too large (keeps low bit precision) - vResult = _mm_sub_ps(vResult,vValue); - __m128i vResulti = _mm_cvttps_epi32(vResult); - // Convert from signed to unsigned only if greater than 0x80000000 - vMask = _mm_and_ps(vMask,g_XMNegativeZero); - vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); - // On those that are too large, set to 0xFFFFFFFF - vResult = _mm_or_ps(vResult,vOverflow); - return vResult; -#endif -} - -#pragma warning(pop) - -/**************************************************************************** - * - * Vector and matrix load operations - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadInt(const uint32_t* pSource) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_u32[0] = *pSource; - V.vector4_u32[1] = 0; - V.vector4_u32[2] = 0; - V.vector4_u32[3] = 0; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t zero = vdupq_n_u32(0); - return vld1q_lane_u32( pSource, zero, 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_load_ss( reinterpret_cast<const float*>(pSource) ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadFloat(const float* pSource) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_f32[0] = *pSource; - V.vector4_f32[1] = 0.f; - V.vector4_f32[2] = 0.f; - V.vector4_f32[3] = 0.f; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t zero = vdupq_n_f32(0); - return vld1q_lane_f32( pSource, zero, 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_load_ss( pSource ); -#endif -} -
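-// Behavior sketch (illustrative, not from the original source): the scalar loads fill -// only lane x and zero the rest, e.g. float f = 5.0f; XMLoadFloat(&f) yields -// (5,0,0,0), and XMLoadInt does the same with an untyped 32-bit value. -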
-//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadInt2 -( - const uint32_t* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_u32[0] = pSource[0]; - V.vector4_u32[1] = pSource[1]; - V.vector4_u32[2] = 0; - V.vector4_u32[3] = 0; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t x = vld1_u32( pSource ); - uint32x2_t zero = vdup_n_u32(0); - return vcombine_u32( x, zero ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) ); - __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) ); - return _mm_unpacklo_ps( x, y ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadInt2A -( - const uint32_t* pSource -) -{ - assert(pSource); - assert(((uintptr_t)pSource & 0xF) == 0); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_u32[0] = pSource[0]; - V.vector4_u32[1] = pSource[1]; - V.vector4_u32[2] = 0; - V.vector4_u32[3] = 0; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t x = vld1_u32_ex( pSource, 64 ); - uint32x2_t zero = vdup_n_u32(0); - return vcombine_u32( x, zero ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) ); - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadFloat2 -( - const XMFLOAT2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_f32[0] = pSource->x; - V.vector4_f32[1] = pSource->y; - V.vector4_f32[2] = 0.f; - V.vector4_f32[3] = 0.f; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t x = vld1_f32( reinterpret_cast<const float*>(pSource) ); - float32x2_t zero = vdup_n_f32(0); - return vcombine_f32( x, zero ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128 x = _mm_load_ss( &pSource->x ); - __m128 y = _mm_load_ss( &pSource->y ); - return _mm_unpacklo_ps( x, y ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadFloat2A -( - const XMFLOAT2A* pSource -) -{ - assert(pSource); - assert(((uintptr_t)pSource & 0xF) == 0); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_f32[0] = pSource->x; - V.vector4_f32[1] = pSource->y; - V.vector4_f32[2] = 0.f; - V.vector4_f32[3] = 0.f; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t x = vld1_f32_ex( reinterpret_cast<const float*>(pSource), 64 ); - float32x2_t zero = vdup_n_f32(0); - return vcombine_f32( x, zero ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) ); - return _mm_castsi128_ps(V); -#endif -} -
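-// Behavior sketch (illustrative, not from the original source): the A-suffixed loads -// require the 16-byte alignment asserted above, letting them use aligned forms: -// XMFLOAT2A v(1.f, 2.f); // XMFLOAT2A is declared with 16-byte alignment -// XMVECTOR r = XMLoadFloat2A(&v); // (1,2,0,0) via a single 64-bit load -// while XMLoadFloat2 accepts any address at the cost of two scalar loads on SSE. -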
-//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadSInt2 -( - const XMINT2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_f32[0] = (float)pSource->x; - V.vector4_f32[1] = (float)pSource->y; - V.vector4_f32[2] = 0.f; - V.vector4_f32[3] = 0.f; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - int32x2_t x = vld1_s32( reinterpret_cast<const int32_t*>(pSource) ); - float32x2_t v = vcvt_f32_s32( x ); - float32x2_t zero = vdup_n_f32(0); - return vcombine_f32( v, zero ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) ); - __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) ); - __m128 V = _mm_unpacklo_ps( x, y ); - return _mm_cvtepi32_ps(_mm_castps_si128(V)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadUInt2 -( - const XMUINT2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_f32[0] = (float)pSource->x; - V.vector4_f32[1] = (float)pSource->y; - V.vector4_f32[2] = 0.f; - V.vector4_f32[3] = 0.f; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) ); - float32x2_t v = vcvt_f32_u32( x ); - float32x2_t zero = vdup_n_f32(0); - return vcombine_f32( v, zero ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) ); - __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) ); - __m128 V = _mm_unpacklo_ps( x, y ); - // For the values that are higher than 0x7FFFFFFF, a fixup is needed - // Determine which ones need the fix. - XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero); - // Force all values positive - XMVECTOR vResult = _mm_xor_ps(V,vMask); - // Convert to floats - vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); - // Convert 0x80000000 -> 0xFFFFFFFF - __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); - // For only the ones that are too big, add the fixup - vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); - vResult = _mm_add_ps(vResult,vMask); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadInt3 -( - const uint32_t* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_u32[0] = pSource[0]; - V.vector4_u32[1] = pSource[1]; - V.vector4_u32[2] = pSource[2]; - V.vector4_u32[3] = 0; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t x = vld1_u32( pSource ); - uint32x2_t zero = vdup_n_u32(0); - uint32x2_t y = vld1_lane_u32( pSource+2, zero, 0 ); - return vcombine_u32( x, y ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) ); - __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) ); - __m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) ); - __m128 xy = _mm_unpacklo_ps( x, y ); - return _mm_movelh_ps( xy, z ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadInt3A -( - const uint32_t* pSource -) -{ - assert(pSource); - assert(((uintptr_t)pSource & 0xF) == 0); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_u32[0] = pSource[0]; - V.vector4_u32[1] = pSource[1]; - V.vector4_u32[2] = pSource[2]; - V.vector4_u32[3] = 0; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Reads an extra integer which is zero'd - uint32x4_t V = vld1q_u32_ex( pSource, 128 ); - return vsetq_lane_u32( 0, V, 3 ); -#elif defined(_XM_SSE_INTRINSICS_) - // Reads an extra integer which is zero'd - __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) ); - V = _mm_and_si128( V, g_XMMask3 ); - return _mm_castsi128_ps(V); -#endif -} -
-//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadFloat3 -( - const XMFLOAT3* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_f32[0] = pSource->x; - V.vector4_f32[1] = pSource->y; - V.vector4_f32[2] = pSource->z; - V.vector4_f32[3] = 0.f; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t x = vld1_f32( reinterpret_cast<const float*>(pSource) ); - float32x2_t zero = vdup_n_f32(0); - float32x2_t y = vld1_lane_f32( reinterpret_cast<const float*>(pSource)+2, zero, 0 ); - return vcombine_f32( x, y ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128 x = _mm_load_ss( &pSource->x ); - __m128 y = _mm_load_ss( &pSource->y ); - __m128 z = _mm_load_ss( &pSource->z ); - __m128 xy = _mm_unpacklo_ps( x, y ); - return _mm_movelh_ps( xy, z ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadFloat3A -( - const XMFLOAT3A* pSource -) -{ - assert(pSource); - assert(((uintptr_t)pSource & 0xF) == 0); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_f32[0] = pSource->x; - V.vector4_f32[1] = pSource->y; - V.vector4_f32[2] = pSource->z; - V.vector4_f32[3] = 0.f; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Reads an extra float which is zero'd - float32x4_t V = vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 ); - return vsetq_lane_f32( 0, V, 3 ); -#elif defined(_XM_SSE_INTRINSICS_) - // Reads an extra float which is zero'd - __m128 V = _mm_load_ps( &pSource->x ); - return _mm_and_ps( V, g_XMMask3 ); -#endif -} -
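-// Behavior sketch (illustrative, not from the original source): as the comments note, -// XMLoadFloat3A reads a full 16-byte row and then zeroes w, so it over-reads one float -// past z; the 16-byte alignment keeps that read inside the same 16-byte block, which is -// why the aligned variant can do this safely while XMLoadFloat3 composes three scalar -// loads instead. -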
-//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadSInt3 -( - const XMINT3* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR V; - V.vector4_f32[0] = (float)pSource->x; - V.vector4_f32[1] = (float)pSource->y; - V.vector4_f32[2] = (float)pSource->z; - V.vector4_f32[3] = 0.f; - return V; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - int32x2_t x = vld1_s32( reinterpret_cast<const int32_t*>(pSource) ); - int32x2_t zero = vdup_n_s32(0); - int32x2_t y = vld1_lane_s32( reinterpret_cast<const int32_t*>(pSource)+2, zero, 0 ); - int32x4_t v = vcombine_s32( x, y ); - return vcvtq_f32_s32( v ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) ); - __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) ); - __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) ); - __m128 xy = _mm_unpacklo_ps( x, y ); - __m128 V = _mm_movelh_ps( xy, z ); - return _mm_cvtepi32_ps(_mm_castps_si128(V)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadUInt3 -( - const XMUINT3* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_f32[0] = (float)pSource->x; - V.vector4_f32[1] = (float)pSource->y; - V.vector4_f32[2] = (float)pSource->z; - V.vector4_f32[3] = 0.f; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) ); - uint32x2_t zero = vdup_n_u32(0); - uint32x2_t y = vld1_lane_u32( reinterpret_cast<const uint32_t*>(pSource)+2, zero, 0 ); - uint32x4_t v = vcombine_u32( x, y ); - return vcvtq_f32_u32( v ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) ); - __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) ); - __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) ); - __m128 xy = _mm_unpacklo_ps( x, y ); - __m128 V = _mm_movelh_ps( xy, z ); - // For the values that are higher than 0x7FFFFFFF, a fixup is needed - // Determine which ones need the fix. - XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero); - // Force all values positive - XMVECTOR vResult = _mm_xor_ps(V,vMask); - // Convert to floats - vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); - // Convert 0x80000000 -> 0xFFFFFFFF - __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); - // For only the ones that are too big, add the fixup - vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); - vResult = _mm_add_ps(vResult,vMask); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadInt4 -( - const uint32_t* pSource -) -{ - assert(pSource); - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_u32[0] = pSource[0]; - V.vector4_u32[1] = pSource[1]; - V.vector4_u32[2] = pSource[2]; - V.vector4_u32[3] = pSource[3]; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_u32( pSource ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) ); - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadInt4A -( - const uint32_t* pSource -) -{ - assert(pSource); - assert(((uintptr_t)pSource & 0xF) == 0); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_u32[0] = pSource[0]; - V.vector4_u32[1] = pSource[1]; - V.vector4_u32[2] = pSource[2]; - V.vector4_u32[3] = pSource[3]; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_u32_ex( pSource, 128 ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) ); - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadFloat4 -( - const XMFLOAT4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_f32[0] = pSource->x; - V.vector4_f32[1] = pSource->y; - V.vector4_f32[2] = pSource->z; - V.vector4_f32[3] = pSource->w; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_f32( reinterpret_cast<const float*>(pSource) ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_loadu_ps( &pSource->x ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadFloat4A -( - const XMFLOAT4A* pSource -) -{ - assert(pSource); - assert(((uintptr_t)pSource & 0xF) == 0); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR V; - V.vector4_f32[0] = pSource->x; - V.vector4_f32[1] = pSource->y; - V.vector4_f32[2] = pSource->z; - V.vector4_f32[3] = pSource->w; - return V; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_load_ps( &pSource->x ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMLoadSInt4 -( - const XMINT4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR V; - V.vector4_f32[0] = (float)pSource->x; - V.vector4_f32[1] = (float)pSource->y; - V.vector4_f32[2] = (float)pSource->z; - V.vector4_f32[3] = (float)pSource->w; - return V; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - int32x4_t v = vld1q_s32( reinterpret_cast<const int32_t*>(pSource) ); - return vcvtq_f32_s32( v ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) ); - return _mm_cvtepi32_ps(V); -#endif -} -
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMMATRIX XM_CALLCONV XMLoadFloat3x3
-(
-    const XMFLOAT3X3* pSource
-)
-{
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMMATRIX M;
-    M.r[0].vector4_f32[0] = pSource->m[0][0];
-    M.r[0].vector4_f32[1] = pSource->m[0][1];
-    M.r[0].vector4_f32[2] = pSource->m[0][2];
-    M.r[0].vector4_f32[3] = 0.0f;
-
-    M.r[1].vector4_f32[0] = pSource->m[1][0];
-    M.r[1].vector4_f32[1] = pSource->m[1][1];
-    M.r[1].vector4_f32[2] = pSource->m[1][2];
-    M.r[1].vector4_f32[3] = 0.0f;
-
-    M.r[2].vector4_f32[0] = pSource->m[2][0];
-    M.r[2].vector4_f32[1] = pSource->m[2][1];
-    M.r[2].vector4_f32[2] = pSource->m[2][2];
-    M.r[2].vector4_f32[3] = 0.0f;
-    M.r[3].vector4_f32[0] = 0.0f;
-    M.r[3].vector4_f32[1] = 0.0f;
-    M.r[3].vector4_f32[2] = 0.0f;
-    M.r[3].vector4_f32[3] = 1.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t v0 = vld1q_f32( &pSource->m[0][0] );
-    float32x4_t v1 = vld1q_f32( &pSource->m[1][1] );
-    float32x2_t v2 = vcreate_f32( (uint64_t)*(const uint32_t*)&pSource->m[2][2] );
-    float32x4_t T = vextq_f32( v0, v1, 3 );
-
-    XMMATRIX M;
-    M.r[0] = vandq_u32( v0, g_XMMask3 );
-    M.r[1] = vandq_u32( T, g_XMMask3 );
-    M.r[2] = vcombine_f32( vget_high_f32(v1), v2 );
-    M.r[3] = g_XMIdentityR3;
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 Z = _mm_setzero_ps();
-
-    __m128 V1 = _mm_loadu_ps( &pSource->m[0][0] );
-    __m128 V2 = _mm_loadu_ps( &pSource->m[1][1] );
-    __m128 V3 = _mm_load_ss( &pSource->m[2][2] );
-
-    __m128 T1 = _mm_unpackhi_ps( V1, Z );
-    __m128 T2 = _mm_unpacklo_ps( V2, Z );
-    __m128 T3 = _mm_shuffle_ps( V3, T2, _MM_SHUFFLE( 0, 1, 0, 0 ) );
-    __m128 T4 = _mm_movehl_ps( T2, T3 );
-    __m128 T5 = _mm_movehl_ps( Z, T1 );
-
-    XMMATRIX M;
-    M.r[0] = _mm_movelh_ps( V1, T1 );
-    M.r[1] = _mm_add_ps( T4, T5 );
-    M.r[2] = _mm_shuffle_ps( V2, V3, _MM_SHUFFLE(1, 0, 3, 2) );
-    M.r[3] = g_XMIdentityR3;
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMMATRIX XM_CALLCONV XMLoadFloat4x3
-(
-    const XMFLOAT4X3* pSource
-)
-{
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMMATRIX M;
-    M.r[0].vector4_f32[0] = pSource->m[0][0];
-    M.r[0].vector4_f32[1] = pSource->m[0][1];
-    M.r[0].vector4_f32[2] = pSource->m[0][2];
-    M.r[0].vector4_f32[3] = 0.0f;
-
-    M.r[1].vector4_f32[0] = pSource->m[1][0];
-    M.r[1].vector4_f32[1] = pSource->m[1][1];
-    M.r[1].vector4_f32[2] = pSource->m[1][2];
-    M.r[1].vector4_f32[3] = 0.0f;
-
-    M.r[2].vector4_f32[0] = pSource->m[2][0];
-    M.r[2].vector4_f32[1] = pSource->m[2][1];
-    M.r[2].vector4_f32[2] = pSource->m[2][2];
-    M.r[2].vector4_f32[3] = 0.0f;
-
-    M.r[3].vector4_f32[0] = pSource->m[3][0];
-    M.r[3].vector4_f32[1] = pSource->m[3][1];
-    M.r[3].vector4_f32[2] = pSource->m[3][2];
-    M.r[3].vector4_f32[3] = 1.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t v0 = vld1q_f32( &pSource->m[0][0] );
-    float32x4_t v1 = vld1q_f32( &pSource->m[1][1] );
-    float32x4_t v2 = vld1q_f32( &pSource->m[2][2] );
-
-    float32x4_t T1 = vextq_f32( v0, v1, 3 );
-    float32x4_t T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) );
-    float32x4_t T3 = vextq_f32( v2, v2, 1 );
-
-    XMMATRIX M;
-    M.r[0] = vandq_u32( v0, g_XMMask3 );
-    M.r[1] = vandq_u32( T1, g_XMMask3 );
-    M.r[2] = vandq_u32( T2, g_XMMask3 );
-    M.r[3] = vsetq_lane_f32( 1.f, T3, 3 );
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Use unaligned load instructions to
-    // load the 12 floats
-    // vTemp1 = x1,y1,z1,x2
-    XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]);
-    // vTemp2 = y2,z2,x3,y3
-    XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]);
-    // vTemp4 = z3,x4,y4,z4
-    XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]);
-    // vTemp3 = x3,y3,z3,z3
-    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
-    // vTemp2 = y2,z2,x2,x2
-    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
-    // vTemp2 = x2,y2,z2,z2
-    vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2));
-    // vTemp1 = x1,y1,z1,0
-    vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
-    // vTemp2 = x2,y2,z2,0
-    vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
-    // vTemp3 = x3,y3,z3,0
-    vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
-    // vTemp4i = x4,y4,z4,0
-    __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8);
-    // vTemp4i = x4,y4,z4,1.0f
-    vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
-    XMMATRIX M(vTemp1,
-               vTemp2,
-               vTemp3,
-               _mm_castsi128_ps(vTemp4i));
-    return M;
-#endif
-}
-
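Editor's note: the shuffle sequence above is easier to follow once you see that XMFLOAT4X3 is 12 contiguous floats, so the three 4-wide loads straddle row boundaries. This sketch (assumed row-major layout; the helper name is invented) is a scalar reference for what the shuffles reassemble.

    // The three 4-float loads map onto the packed rows like this:
    //   vTemp1 = { m00, m01, m02, m10 }   // from &m[0][0]
    //   vTemp2 = { m11, m12, m20, m21 }   // from &m[1][1]
    //   vTemp4 = { m22, m30, m31, m32 }   // from &m[2][2]
    void LoadFloat4x3Rows(const float* m, float rows[4][4])
    {
        for (int r = 0; r < 4; ++r) {
            for (int c = 0; c < 3; ++c)
                rows[r][c] = m[r*3 + c];          // gather each 3-float row
            rows[r][3] = (r == 3) ? 1.0f : 0.0f;  // pad w; last row gets 1
        }
    }
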
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A
-(
-    const XMFLOAT4X3A* pSource
-)
-{
-    assert(pSource);
-    assert(((uintptr_t)pSource & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMMATRIX M;
-    M.r[0].vector4_f32[0] = pSource->m[0][0];
-    M.r[0].vector4_f32[1] = pSource->m[0][1];
-    M.r[0].vector4_f32[2] = pSource->m[0][2];
-    M.r[0].vector4_f32[3] = 0.0f;
-
-    M.r[1].vector4_f32[0] = pSource->m[1][0];
-    M.r[1].vector4_f32[1] = pSource->m[1][1];
-    M.r[1].vector4_f32[2] = pSource->m[1][2];
-    M.r[1].vector4_f32[3] = 0.0f;
-
-    M.r[2].vector4_f32[0] = pSource->m[2][0];
-    M.r[2].vector4_f32[1] = pSource->m[2][1];
-    M.r[2].vector4_f32[2] = pSource->m[2][2];
-    M.r[2].vector4_f32[3] = 0.0f;
-
-    M.r[3].vector4_f32[0] = pSource->m[3][0];
-    M.r[3].vector4_f32[1] = pSource->m[3][1];
-    M.r[3].vector4_f32[2] = pSource->m[3][2];
-    M.r[3].vector4_f32[3] = 1.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t v0 = vld1q_f32_ex( &pSource->m[0][0], 128 );
-    float32x4_t v1 = vld1q_f32_ex( &pSource->m[1][1], 128 );
-    float32x4_t v2 = vld1q_f32_ex( &pSource->m[2][2], 128 );
-
-    float32x4_t T1 = vextq_f32( v0, v1, 3 );
-    float32x4_t T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) );
-    float32x4_t T3 = vextq_f32( v2, v2, 1 );
-
-    XMMATRIX M;
-    M.r[0] = vandq_u32( v0, g_XMMask3 );
-    M.r[1] = vandq_u32( T1, g_XMMask3 );
-    M.r[2] = vandq_u32( T2, g_XMMask3 );
-    M.r[3] = vsetq_lane_f32( 1.f, T3, 3 );
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Use aligned load instructions to
-    // load the 12 floats
-    // vTemp1 = x1,y1,z1,x2
-    XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]);
-    // vTemp2 = y2,z2,x3,y3
-    XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]);
-    // vTemp4 = z3,x4,y4,z4
-    XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]);
-    // vTemp3 = x3,y3,z3,z3
-    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
-    // vTemp2 = y2,z2,x2,x2
-    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
-    // vTemp2 = x2,y2,z2,z2
-    vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2));
-    // vTemp1 = x1,y1,z1,0
-    vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
-    // vTemp2 = x2,y2,z2,0
-    vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
-    // vTemp3 = x3,y3,z3,0
-    vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
-    // vTemp4i = x4,y4,z4,0
-    __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8);
-    // vTemp4i = x4,y4,z4,1.0f
-    vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
-    XMMATRIX M(vTemp1,
-               vTemp2,
-               vTemp3,
-               _mm_castsi128_ps(vTemp4i));
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMMATRIX XM_CALLCONV XMLoadFloat4x4
-(
-    const XMFLOAT4X4* pSource
-)
-{
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMMATRIX M;
-    M.r[0].vector4_f32[0] = pSource->m[0][0];
-    M.r[0].vector4_f32[1] = pSource->m[0][1];
-    M.r[0].vector4_f32[2] = pSource->m[0][2];
-    M.r[0].vector4_f32[3] = pSource->m[0][3];
-
-    M.r[1].vector4_f32[0] = pSource->m[1][0];
-    M.r[1].vector4_f32[1] = pSource->m[1][1];
-    M.r[1].vector4_f32[2] = pSource->m[1][2];
-    M.r[1].vector4_f32[3] = pSource->m[1][3];
-
-    M.r[2].vector4_f32[0] = pSource->m[2][0];
-    M.r[2].vector4_f32[1] = pSource->m[2][1];
-    M.r[2].vector4_f32[2] = pSource->m[2][2];
-    M.r[2].vector4_f32[3] = pSource->m[2][3];
-
-    M.r[3].vector4_f32[0] = pSource->m[3][0];
-    M.r[3].vector4_f32[1] = pSource->m[3][1];
-    M.r[3].vector4_f32[2] = pSource->m[3][2];
-    M.r[3].vector4_f32[3] = pSource->m[3][3];
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMMATRIX M;
-    M.r[0] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_11) );
-    M.r[1] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_21) );
-    M.r[2] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_31) );
-    M.r[3] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_41) );
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMMATRIX M;
-    M.r[0] = _mm_loadu_ps( &pSource->_11 );
-    M.r[1] = _mm_loadu_ps( &pSource->_21 );
-    M.r[2] = _mm_loadu_ps( &pSource->_31 );
-    M.r[3] = _mm_loadu_ps( &pSource->_41 );
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMMATRIX XM_CALLCONV XMLoadFloat4x4A
-(
-    const XMFLOAT4X4A* pSource
-)
-{
-    assert(pSource);
-    assert(((uintptr_t)pSource & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMMATRIX M;
-    M.r[0].vector4_f32[0] = pSource->m[0][0];
-    M.r[0].vector4_f32[1] = pSource->m[0][1];
-    M.r[0].vector4_f32[2] = pSource->m[0][2];
-    M.r[0].vector4_f32[3] = pSource->m[0][3];
-
-    M.r[1].vector4_f32[0] = pSource->m[1][0];
-    M.r[1].vector4_f32[1] = pSource->m[1][1];
-    M.r[1].vector4_f32[2] = pSource->m[1][2];
-    M.r[1].vector4_f32[3] = pSource->m[1][3];
-
-    M.r[2].vector4_f32[0] = pSource->m[2][0];
-    M.r[2].vector4_f32[1] = pSource->m[2][1];
-    M.r[2].vector4_f32[2] = pSource->m[2][2];
-    M.r[2].vector4_f32[3] = pSource->m[2][3];
-
-    M.r[3].vector4_f32[0] = pSource->m[3][0];
-    M.r[3].vector4_f32[1] = pSource->m[3][1];
-    M.r[3].vector4_f32[2] = pSource->m[3][2];
-    M.r[3].vector4_f32[3] = pSource->m[3][3];
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMMATRIX M;
-    M.r[0] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_11), 128 );
-    M.r[1] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_21), 128 );
-    M.r[2] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_31), 128 );
-    M.r[3] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_41), 128 );
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMMATRIX M;
-    M.r[0] = _mm_load_ps( &pSource->_11 );
-    M.r[1] = _mm_load_ps( &pSource->_21 );
-    M.r[2] = _mm_load_ps( &pSource->_31 );
-    M.r[3] = _mm_load_ps( &pSource->_41 );
-    return M;
-#endif
-}
-
-/****************************************************************************
- *
- * Vector and matrix store operations
- *
- ****************************************************************************/
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreInt
-(
-    uint32_t* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    *pDestination = XMVectorGetIntX( V );
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_u32( pDestination, *reinterpret_cast<const uint32x4_t*>(&V), 0 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_ss( reinterpret_cast<float*>(pDestination), V );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat
-(
-    float* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    *pDestination = XMVectorGetX( V );
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_f32( pDestination, V, 0 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_ss( pDestination, V );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreInt2
-(
-    uint32_t* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination[0] = V.vector4_u32[0];
-    pDestination[1] = V.vector4_u32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t VL = vget_low_u32(V);
-    vst1_u32( pDestination, VL );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination[0]), V );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreInt2A
-(
-    uint32_t* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-    assert(((uintptr_t)pDestination & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination[0] = V.vector4_u32[0];
-    pDestination[1] = V.vector4_u32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t VL = vget_low_u32(V);
-    vst1_u32_ex( pDestination, VL, 64 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
-#endif
-}
-
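Editor's note: the A-suffixed variants assert the same 16-byte contract throughout (`((uintptr_t)p & 0xF) == 0`). A minimal sketch of how a caller can satisfy it with standard C++11 alignment, independent of the library's own aligned types; the struct name is invented for illustration.

    #include <cassert>
    #include <cstdint>

    struct alignas(16) AlignedQuad { uint32_t v[4]; };

    void Demo()
    {
        AlignedQuad data = {};
        // Same check the *A load/store functions assert before the aligned op.
        assert((reinterpret_cast<uintptr_t>(data.v) & 0xF) == 0);
    }
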
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat2
-(
-    XMFLOAT2* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = V.vector4_f32[0];
-    pDestination->y = V.vector4_f32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    vst1_f32( reinterpret_cast<float*>(pDestination), VL );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
-    _mm_store_ss( &pDestination->x, V );
-    _mm_store_ss( &pDestination->y, T );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat2A
-(
-    XMFLOAT2A* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-    assert(((uintptr_t)pDestination & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = V.vector4_f32[0];
-    pDestination->y = V.vector4_f32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreSInt2
-(
-    XMINT2* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = (int32_t)V.vector4_f32[0];
-    pDestination->y = (int32_t)V.vector4_f32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    int32x2_t v = vget_low_s32(V);
-    v = vcvt_s32_f32( v );
-    vst1_s32( reinterpret_cast<int32_t*>(pDestination), v );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // In case of positive overflow, detect it
-    XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
-    // Float to int conversion
-    __m128i vResulti = _mm_cvttps_epi32(V);
-    // If there was positive overflow, set to 0x7FFFFFFF
-    XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
-    vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
-    vOverflow = _mm_or_ps(vOverflow,vResult);
-    // Write two ints
-    XMVECTOR T = XM_PERMUTE_PS( vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreUInt2
-(
-    XMUINT2* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = (uint32_t)V.vector4_f32[0];
-    pDestination->y = (uint32_t)V.vector4_f32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t v = vget_low_f32(V);
-    uint32x2_t iv = vcvt_u32_f32( v );
-    vst1_u32( reinterpret_cast<uint32_t*>(pDestination), iv );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Clamp to >=0
-    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
-    // Any numbers that are too big, set to 0xFFFFFFFFU
-    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
-    XMVECTOR vValue = g_XMUnsignedFix;
-    // Too large for a signed integer?
-    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
-    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
-    vValue = _mm_and_ps(vValue,vMask);
-    // Perform fixup only on numbers too large (Keeps low bit precision)
-    vResult = _mm_sub_ps(vResult,vValue);
-    __m128i vResulti = _mm_cvttps_epi32(vResult);
-    // Convert from signed to unsigned only if greater than 0x80000000
-    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
-    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
-    // On those that are too large, set to 0xFFFFFFFF
-    vResult = _mm_or_ps(vResult,vOverflow);
-    // Write two uints
-    XMVECTOR T = XM_PERMUTE_PS( vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
-#endif
-}
-
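Editor's note: the float-to-unsigned store above works around x86 having only a signed truncating conversion (cvttps). A minimal scalar sketch of the per-lane behavior, ignoring NaN edge cases; the function name is invented and g_XMUnsignedFix is 2^31.

    #include <cstdint>

    uint32_t FloatToUIntSaturate(float f)
    {
        if (f <= 0.0f)             return 0u;           // clamp negatives to 0
        if (f >= 4294967296.0f)    return 0xFFFFFFFFu;  // too big: saturate
        if (f >= 2147483648.0f)                         // above signed range:
            return (uint32_t)(int32_t)(f - 2147483648.0f) ^ 0x80000000u; // fixup
        return (uint32_t)(int32_t)f;                    // fits in signed range
    }

The subtract keeps the value inside cvttps's signed range, and the final xor adds 2^31 back as a bit operation, which is what the vMask/g_XMNegativeZero dance does vector-wide.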
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreInt3
-(
-    uint32_t* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination[0] = V.vector4_u32[0];
-    pDestination[1] = V.vector4_u32[1];
-    pDestination[2] = V.vector4_u32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t VL = vget_low_u32(V);
-    vst1_u32( pDestination, VL );
-    vst1q_lane_u32( pDestination+2, *reinterpret_cast<const uint32x4_t*>(&V), 2 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-    _mm_store_ss( reinterpret_cast<float*>(pDestination), V );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T1 );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T2 );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreInt3A
-(
-    uint32_t* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-    assert(((uintptr_t)pDestination & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination[0] = V.vector4_u32[0];
-    pDestination[1] = V.vector4_u32[1];
-    pDestination[2] = V.vector4_u32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t VL = vget_low_u32(V);
-    vst1_u32_ex( pDestination, VL, 64 );
-    vst1q_lane_u32( pDestination+2, *reinterpret_cast<const uint32x4_t*>(&V), 2 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat3
-(
-    XMFLOAT3* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = V.vector4_f32[0];
-    pDestination->y = V.vector4_f32[1];
-    pDestination->z = V.vector4_f32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    vst1_f32( reinterpret_cast<float*>(pDestination), VL );
-    vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-    _mm_store_ss( &pDestination->x, V );
-    _mm_store_ss( &pDestination->y, T1 );
-    _mm_store_ss( &pDestination->z, T2 );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat3A
-(
-    XMFLOAT3A* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-    assert(((uintptr_t)pDestination & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = V.vector4_f32[0];
-    pDestination->y = V.vector4_f32[1];
-    pDestination->z = V.vector4_f32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
-    vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
-    _mm_store_ss( &pDestination->z, T );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreSInt3
-(
-    XMINT3* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = (int32_t)V.vector4_f32[0];
-    pDestination->y = (int32_t)V.vector4_f32[1];
-    pDestination->z = (int32_t)V.vector4_f32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    int32x4_t v = vcvtq_s32_f32(V);
-    int32x2_t vL = vget_low_s32(v);
-    vst1_s32( reinterpret_cast<int32_t*>(pDestination), vL );
-    vst1q_lane_s32( reinterpret_cast<int32_t*>(pDestination)+2, v, 2 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // In case of positive overflow, detect it
-    XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
-    // Float to int conversion
-    __m128i vResulti = _mm_cvttps_epi32(V);
-    // If there was positive overflow, set to 0x7FFFFFFF
-    XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
-    vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
-    vOverflow = _mm_or_ps(vOverflow,vResult);
-    // Write 3 ints
-    XMVECTOR T1 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR T2 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(2,2,2,2));
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreUInt3
-(
-    XMUINT3* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = (uint32_t)V.vector4_f32[0];
-    pDestination->y = (uint32_t)V.vector4_f32[1];
-    pDestination->z = (uint32_t)V.vector4_f32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t v = vcvtq_u32_f32(V);
-    uint32x2_t vL = vget_low_u32(v);
-    vst1_u32( reinterpret_cast<uint32_t*>(pDestination), vL );
-    vst1q_lane_u32( reinterpret_cast<uint32_t*>(pDestination)+2, v, 2 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Clamp to >=0
-    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
-    // Any numbers that are too big, set to 0xFFFFFFFFU
-    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
-    XMVECTOR vValue = g_XMUnsignedFix;
-    // Too large for a signed integer?
-    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
-    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
-    vValue = _mm_and_ps(vValue,vMask);
-    // Perform fixup only on numbers too large (Keeps low bit precision)
-    vResult = _mm_sub_ps(vResult,vValue);
-    __m128i vResulti = _mm_cvttps_epi32(vResult);
-    // Convert from signed to unsigned only if greater than 0x80000000
-    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
-    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
-    // On those that are too large, set to 0xFFFFFFFF
-    vResult = _mm_or_ps(vResult,vOverflow);
-    // Write 3 uints
-    XMVECTOR T1 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR T2 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(2,2,2,2));
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
-    _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreInt4
-(
-    uint32_t* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination[0] = V.vector4_u32[0];
-    pDestination[1] = V.vector4_u32[1];
-    pDestination[2] = V.vector4_u32[2];
-    pDestination[3] = V.vector4_u32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_u32( pDestination, V );
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreInt4A
-(
-    uint32_t* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-    assert(((uintptr_t)pDestination & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination[0] = V.vector4_u32[0];
-    pDestination[1] = V.vector4_u32[1];
-    pDestination[2] = V.vector4_u32[2];
-    pDestination[3] = V.vector4_u32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_u32_ex( pDestination, V, 128 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat4
-(
-    XMFLOAT4* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = V.vector4_f32[0];
-    pDestination->y = V.vector4_f32[1];
-    pDestination->z = V.vector4_f32[2];
-    pDestination->w = V.vector4_f32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_f32( reinterpret_cast<float*>(pDestination), V );
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_storeu_ps( &pDestination->x, V );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat4A
-(
-    XMFLOAT4A* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-    assert(((uintptr_t)pDestination & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = V.vector4_f32[0];
-    pDestination->y = V.vector4_f32[1];
-    pDestination->z = V.vector4_f32[2];
-    pDestination->w = V.vector4_f32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_f32_ex( reinterpret_cast<float*>(pDestination), V, 128 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_ps( &pDestination->x, V );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreSInt4
-(
-    XMINT4* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = (int32_t)V.vector4_f32[0];
-    pDestination->y = (int32_t)V.vector4_f32[1];
-    pDestination->z = (int32_t)V.vector4_f32[2];
-    pDestination->w = (int32_t)V.vector4_f32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    int32x4_t v = vcvtq_s32_f32(V);
-    vst1q_s32( reinterpret_cast<int32_t*>(pDestination), v );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // In case of positive overflow, detect it
-    XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
-    // Float to int conversion
-    __m128i vResulti = _mm_cvttps_epi32(V);
-    // If there was positive overflow, set to 0x7FFFFFFF
-    XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
-    vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
-    vOverflow = _mm_or_ps(vOverflow,vResult);
-    _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vOverflow) );
-#endif
-}
-
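Editor's note: the signed stores only patch *positive* overflow because _mm_cvttps_epi32 returns the "integer indefinite" value 0x80000000 for any out-of-range input, and that bit pattern already equals INT32_MIN, which is the correct negative saturation. A scalar sketch of the resulting per-lane behavior (the function name is invented; g_XMMaxInt is 65536.0f*32768.0f-128.0f):

    #include <cstdint>

    int32_t FloatToIntSaturate(float f)
    {
        if (f > 2147483520.0f)        // above g_XMMaxInt: positive overflow
            return 0x7FFFFFFF;        // patched in with the cmpgt mask
        if (f < -2147483648.0f)
            return (-0x7FFFFFFF) - 1; // cvttps already yields 0x80000000 here
        return (int32_t)f;            // truncate toward zero, like cvttps
    }
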
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreUInt4
-(
-    XMUINT4* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = (uint32_t)V.vector4_f32[0];
-    pDestination->y = (uint32_t)V.vector4_f32[1];
-    pDestination->z = (uint32_t)V.vector4_f32[2];
-    pDestination->w = (uint32_t)V.vector4_f32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t v = vcvtq_u32_f32(V);
-    vst1q_u32( reinterpret_cast<uint32_t*>(pDestination), v );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Clamp to >=0
-    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
-    // Any numbers that are too big, set to 0xFFFFFFFFU
-    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
-    XMVECTOR vValue = g_XMUnsignedFix;
-    // Too large for a signed integer?
-    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
-    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
-    vValue = _mm_and_ps(vValue,vMask);
-    // Perform fixup only on numbers too large (Keeps low bit precision)
-    vResult = _mm_sub_ps(vResult,vValue);
-    __m128i vResulti = _mm_cvttps_epi32(vResult);
-    // Convert from signed to unsigned only if greater than 0x80000000
-    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
-    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
-    // On those that are too large, set to 0xFFFFFFFF
-    vResult = _mm_or_ps(vResult,vOverflow);
-    _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vResult) );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat3x3
-(
-    XMFLOAT3X3* pDestination,
-    FXMMATRIX M
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    pDestination->m[0][0] = M.r[0].vector4_f32[0];
-    pDestination->m[0][1] = M.r[0].vector4_f32[1];
-    pDestination->m[0][2] = M.r[0].vector4_f32[2];
-
-    pDestination->m[1][0] = M.r[1].vector4_f32[0];
-    pDestination->m[1][1] = M.r[1].vector4_f32[1];
-    pDestination->m[1][2] = M.r[1].vector4_f32[2];
-
-    pDestination->m[2][0] = M.r[2].vector4_f32[0];
-    pDestination->m[2][1] = M.r[2].vector4_f32[1];
-    pDestination->m[2][2] = M.r[2].vector4_f32[2];
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t T1 = vextq_f32( M.r[0], M.r[1], 1 );
-    float32x4_t T2 = vbslq_f32( g_XMMask3, M.r[0], T1 );
-    vst1q_f32( &pDestination->m[0][0], T2 );
-
-    T1 = vextq_f32( M.r[1], M.r[1], 1 );
-    T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) );
-    vst1q_f32( &pDestination->m[1][1], T2 );
-
-    vst1q_lane_f32( &pDestination->m[2][2], M.r[2], 2 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp1 = M.r[0];
-    XMVECTOR vTemp2 = M.r[1];
-    XMVECTOR vTemp3 = M.r[2];
-    XMVECTOR vWork = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,0,2,2));
-    vTemp1 = _mm_shuffle_ps(vTemp1,vWork,_MM_SHUFFLE(2,0,1,0));
-    _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
-    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
-    _mm_storeu_ps(&pDestination->m[1][1],vTemp2);
-    vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(2,2,2,2));
-    _mm_store_ss(&pDestination->m[2][2],vTemp3);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat4x3
-(
-    XMFLOAT4X3* pDestination,
-    FXMMATRIX M
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    pDestination->m[0][0] = M.r[0].vector4_f32[0];
-    pDestination->m[0][1] = M.r[0].vector4_f32[1];
-    pDestination->m[0][2] = M.r[0].vector4_f32[2];
-
-    pDestination->m[1][0] = M.r[1].vector4_f32[0];
-    pDestination->m[1][1] = M.r[1].vector4_f32[1];
-    pDestination->m[1][2] = M.r[1].vector4_f32[2];
-
-    pDestination->m[2][0] = M.r[2].vector4_f32[0];
-    pDestination->m[2][1] = M.r[2].vector4_f32[1];
-    pDestination->m[2][2] = M.r[2].vector4_f32[2];
-
-    pDestination->m[3][0] = M.r[3].vector4_f32[0];
-    pDestination->m[3][1] = M.r[3].vector4_f32[1];
-    pDestination->m[3][2] = M.r[3].vector4_f32[2];
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t T1 = vextq_f32( M.r[0], M.r[1], 1 );
-    float32x4_t T2 = vbslq_f32( g_XMMask3, M.r[0], T1 );
-    vst1q_f32( &pDestination->m[0][0], T2 );
-
-    T1 = vextq_f32( M.r[1], M.r[1], 1 );
-    T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) );
-    vst1q_f32( &pDestination->m[1][1], T2 );
-
-    T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 );
-    T2 = vextq_f32( T1, M.r[3], 3 );
-    vst1q_f32( &pDestination->m[2][2], T2 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp1 = M.r[0];
-    XMVECTOR vTemp2 = M.r[1];
-    XMVECTOR vTemp3 = M.r[2];
-    XMVECTOR vTemp4 = M.r[3];
-    XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
-    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(2,2,0,0));
-    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,2,1,0));
-    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
-    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
-    _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
-    _mm_storeu_ps(&pDestination->m[1][1],vTemp2x);
-    _mm_storeu_ps(&pDestination->m[2][2],vTemp3);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat4x3A
-(
-    XMFLOAT4X3A* pDestination,
-    FXMMATRIX M
-)
-{
-    assert(pDestination);
-    assert(((uintptr_t)pDestination & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-
-    pDestination->m[0][0] = M.r[0].vector4_f32[0];
-    pDestination->m[0][1] = M.r[0].vector4_f32[1];
-    pDestination->m[0][2] = M.r[0].vector4_f32[2];
-
-    pDestination->m[1][0] = M.r[1].vector4_f32[0];
-    pDestination->m[1][1] = M.r[1].vector4_f32[1];
-    pDestination->m[1][2] = M.r[1].vector4_f32[2];
-
-    pDestination->m[2][0] = M.r[2].vector4_f32[0];
-    pDestination->m[2][1] = M.r[2].vector4_f32[1];
-    pDestination->m[2][2] = M.r[2].vector4_f32[2];
-
-    pDestination->m[3][0] = M.r[3].vector4_f32[0];
-    pDestination->m[3][1] = M.r[3].vector4_f32[1];
-    pDestination->m[3][2] = M.r[3].vector4_f32[2];
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t T1 = vextq_f32( M.r[0], M.r[1], 1 );
-    float32x4_t T2 = vbslq_f32( g_XMMask3, M.r[0], T1 );
-    vst1q_f32_ex( &pDestination->m[0][0], T2, 128 );
-
-    T1 = vextq_f32( M.r[1], M.r[1], 1 );
-    T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) );
-    vst1q_f32_ex( &pDestination->m[1][1], T2, 128 );
-
-    T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 );
-    T2 = vextq_f32( T1, M.r[3], 3 );
-    vst1q_f32_ex( &pDestination->m[2][2], T2, 128 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // x1,y1,z1,w1
-    XMVECTOR vTemp1 = M.r[0];
-    // x2,y2,z2,w2
-    XMVECTOR vTemp2 = M.r[1];
-    // x3,y3,z3,w3
-    XMVECTOR vTemp3 = M.r[2];
-    // x4,y4,z4,w4
-    XMVECTOR vTemp4 = M.r[3];
-    // z1,z1,x2,y2
-    XMVECTOR vTemp = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(1,0,2,2));
-    // y2,z2,x3,y3 (Final)
-    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
-    // x1,y1,z1,x2 (Final)
-    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp,_MM_SHUFFLE(2,0,1,0));
-    // z3,z3,x4,x4
-    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
-    // z3,x4,y4,z4 (Final)
-    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
-    // Store in 3 operations
-    _mm_store_ps(&pDestination->m[0][0],vTemp1);
-    _mm_store_ps(&pDestination->m[1][1],vTemp2);
-    _mm_store_ps(&pDestination->m[2][2],vTemp3);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat4x4
-(
-    XMFLOAT4X4* pDestination,
-    FXMMATRIX M
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    pDestination->m[0][0] = M.r[0].vector4_f32[0];
-    pDestination->m[0][1] = M.r[0].vector4_f32[1];
-    pDestination->m[0][2] = M.r[0].vector4_f32[2];
-    pDestination->m[0][3] = M.r[0].vector4_f32[3];
-
-    pDestination->m[1][0] = M.r[1].vector4_f32[0];
-    pDestination->m[1][1] = M.r[1].vector4_f32[1];
-    pDestination->m[1][2] = M.r[1].vector4_f32[2];
-    pDestination->m[1][3] = M.r[1].vector4_f32[3];
-
-    pDestination->m[2][0] = M.r[2].vector4_f32[0];
-    pDestination->m[2][1] = M.r[2].vector4_f32[1];
-    pDestination->m[2][2] = M.r[2].vector4_f32[2];
-    pDestination->m[2][3] = M.r[2].vector4_f32[3];
-
-    pDestination->m[3][0] = M.r[3].vector4_f32[0];
-    pDestination->m[3][1] = M.r[3].vector4_f32[1];
-    pDestination->m[3][2] = M.r[3].vector4_f32[2];
-    pDestination->m[3][3] = M.r[3].vector4_f32[3];
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_f32( reinterpret_cast<float*>(&pDestination->_11), M.r[0] );
-    vst1q_f32( reinterpret_cast<float*>(&pDestination->_21), M.r[1] );
-    vst1q_f32( reinterpret_cast<float*>(&pDestination->_31), M.r[2] );
-    vst1q_f32( reinterpret_cast<float*>(&pDestination->_41), M.r[3] );
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_storeu_ps( &pDestination->_11, M.r[0] );
-    _mm_storeu_ps( &pDestination->_21, M.r[1] );
-    _mm_storeu_ps( &pDestination->_31, M.r[2] );
-    _mm_storeu_ps( &pDestination->_41, M.r[3] );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV XMStoreFloat4x4A
-(
-    XMFLOAT4X4A* pDestination,
-    FXMMATRIX M
-)
-{
-    assert(pDestination);
-    assert(((uintptr_t)pDestination & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-
-    pDestination->m[0][0] = M.r[0].vector4_f32[0];
-    pDestination->m[0][1] = M.r[0].vector4_f32[1];
-    pDestination->m[0][2] = M.r[0].vector4_f32[2];
-    pDestination->m[0][3] = M.r[0].vector4_f32[3];
-
-    pDestination->m[1][0] = M.r[1].vector4_f32[0];
-    pDestination->m[1][1] = M.r[1].vector4_f32[1];
-    pDestination->m[1][2] = M.r[1].vector4_f32[2];
-    pDestination->m[1][3] = M.r[1].vector4_f32[3];
-
-    pDestination->m[2][0] = M.r[2].vector4_f32[0];
-    pDestination->m[2][1] = M.r[2].vector4_f32[1];
-    pDestination->m[2][2] = M.r[2].vector4_f32[2];
-    pDestination->m[2][3] = M.r[2].vector4_f32[3];
-
-    pDestination->m[3][0] = M.r[3].vector4_f32[0];
-    pDestination->m[3][1] = M.r[3].vector4_f32[1];
-    pDestination->m[3][2] = M.r[3].vector4_f32[2];
-    pDestination->m[3][3] = M.r[3].vector4_f32[3];
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_11), M.r[0], 128 );
-    vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_21), M.r[1], 128 );
-    vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_31), M.r[2], 128 );
-    vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_41), M.r[3], 128 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_ps( &pDestination->_11, M.r[0] );
-    _mm_store_ps( &pDestination->_21, M.r[1] );
-    _mm_store_ps( &pDestination->_31, M.r[2] );
-    _mm_store_ps( &pDestination->_41, M.r[3] );
-#endif
-}
-
+//-------------------------------------------------------------------------------------
+// DirectXMathConvert.inl -- SIMD C++ Math library
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+/****************************************************************************
+ *
+ * Data conversion
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+#pragma warning(push)
+#pragma warning(disable:4701)
+// C4701: false positives
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat
+(
+    FXMVECTOR VInt,
+    uint32_t  DivExponent
+)
+{
+    assert(DivExponent<32);
+#if defined(_XM_NO_INTRINSICS_)
+    float fScale = 1.0f / (float)(1U << DivExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        int32_t iTemp = (int32_t)VInt.vector4_u32[ElementIndex];
+        Result.vector4_f32[ElementIndex] = ((float)iTemp) * fScale;
+    } while (++ElementIndex<4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float fScale = 1.0f / (float)(1U << DivExponent);
+    float32x4_t vResult = vcvtq_f32_s32( VInt );
+    return vmulq_n_f32( vResult, fScale );
+#else // _XM_SSE_INTRINSICS_
+    // Convert to floats
+    XMVECTOR vResult = _mm_cvtepi32_ps(_mm_castps_si128(VInt));
+    // Convert DivExponent into 1.0f/(1<<DivExponent)
+    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+    // Splat the scalar value
+    __m128i vScale = _mm_set1_epi32(static_cast<int>(uScale));
+    vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt
+(
+    FXMVECTOR VFloat,
+    uint32_t  MulExponent
+)
+{
+    assert(MulExponent<32);
+#if defined(_XM_NO_INTRINSICS_)
+    // Get the scalar factor.
+    float fScale = (float)(1U << MulExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        int32_t iResult;
+        float fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
+        if (fTemp <= -(65536.0f*32768.0f)) {
+            iResult = (-0x7FFFFFFF)-1;
+        } else if (fTemp > (65536.0f*32768.0f)-128.0f) {
+            iResult = 0x7FFFFFFF;
+        } else {
+            iResult = (int32_t)fTemp;
+        }
+        Result.vector4_u32[ElementIndex] = (uint32_t)iResult;
+    } while (++ElementIndex<4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmulq_n_f32(VFloat, (float)(1U << MulExponent));
+    // In case of positive overflow, detect it
+    uint32x4_t vOverflow = vcgtq_f32(vResult,g_XMMaxInt);
+    // Float to int conversion
+    int32x4_t vResulti = vcvtq_s32_f32(vResult);
+    // If there was positive overflow, set to 0x7FFFFFFF
+    vResult = vandq_u32(vOverflow,g_XMAbsMask);
+    vOverflow = vbicq_u32(vResulti,vOverflow);
+    vOverflow = vorrq_u32(vOverflow,vResult);
+    return vOverflow;
+#else // _XM_SSE_INTRINSICS_
+    XMVECTOR vResult = _mm_set_ps1((float)(1U << MulExponent));
+    vResult = _mm_mul_ps(vResult,VFloat);
+    // In case of positive overflow, detect it
+    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxInt);
+    // Float to int conversion
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // If there was positive overflow, set to 0x7FFFFFFF
+    vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
+    vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
+    vOverflow = _mm_or_ps(vOverflow,vResult);
+    return vOverflow;
+#endif
+}
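Editor's note: the `0x3F800000U - (DivExponent << 23)` line above builds the scale 1.0f/(1<<n) directly in the float's bit pattern: 0x3F800000 is 1.0f, and subtracting n from the biased exponent field (bits 23..30) halves the value n times. A minimal standalone sketch verifying the trick (the function name is invented):

    #include <cstdint>
    #include <cstring>

    float ReciprocalPow2(uint32_t n)   // valid for n < 32, matching the assert
    {
        uint32_t bits = 0x3F800000U - (n << 23); // lower the biased exponent by n
        float f;
        std::memcpy(&f, &bits, sizeof(f));       // reinterpret without aliasing UB
        return f;                                // equals 1.0f / (float)(1U << n)
    }

This avoids a divide and keeps the scale exact, since a power of two only touches the exponent field.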
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat
+(
+    FXMVECTOR VUInt,
+    uint32_t  DivExponent
+)
+{
+    assert(DivExponent<32);
+#if defined(_XM_NO_INTRINSICS_)
+    float fScale = 1.0f / (float)(1U << DivExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        Result.vector4_f32[ElementIndex] = (float)VUInt.vector4_u32[ElementIndex] * fScale;
+    } while (++ElementIndex<4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float fScale = 1.0f / (float)(1U << DivExponent);
+    float32x4_t vResult = vcvtq_f32_u32( VUInt );
+    return vmulq_n_f32( vResult, fScale );
+#else // _XM_SSE_INTRINSICS_
+    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
+    // Determine which ones need the fix.
+    XMVECTOR vMask = _mm_and_ps(VUInt,g_XMNegativeZero);
+    // Force all values positive
+    XMVECTOR vResult = _mm_xor_ps(VUInt,vMask);
+    // Convert to floats
+    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
+    // Convert 0x80000000 -> 0xFFFFFFFF
+    __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
+    // For only the ones that are too big, add the fixup
+    vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
+    vResult = _mm_add_ps(vResult,vMask);
+    // Convert DivExponent into 1.0f/(1<<DivExponent)
+    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+    // Splat
+    __m128i vScale = _mm_set1_epi32(static_cast<int>(uScale));
+    vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt
+(
+    FXMVECTOR VFloat,
+    uint32_t  MulExponent
+)
+{
+    assert(MulExponent<32);
+#if defined(_XM_NO_INTRINSICS_)
+    // Get the scalar factor.
+    float fScale = (float)(1U << MulExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        uint32_t uResult;
+        float fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
+        if (fTemp <= 0.0f) {
+            uResult = 0;
+        } else if (fTemp >= (65536.0f*65536.0f)) {
+            uResult = 0xFFFFFFFFU;
+        } else {
+            uResult = (uint32_t)fTemp;
+        }
+        Result.vector4_u32[ElementIndex] = uResult;
+    } while (++ElementIndex<4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmulq_n_f32(VFloat,(float)(1U << MulExponent));
+    // In case of overflow, detect it
+    uint32x4_t vOverflow = vcgtq_f32(vResult,g_XMMaxUInt);
+    // Float to int conversion
+    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
+    // If there was overflow, set to 0xFFFFFFFFU
+    vResult = vbicq_u32(vResulti,vOverflow);
+    vOverflow = vorrq_u32(vOverflow,vResult);
+    return vOverflow;
+#else // _XM_SSE_INTRINSICS_
+    XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
+    vResult = _mm_mul_ps(vResult,VFloat);
+    // Clamp to >=0
+    vResult = _mm_max_ps(vResult,g_XMZero);
+    // Any numbers that are too big, set to 0xFFFFFFFFU
+    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
+    XMVECTOR vValue = g_XMUnsignedFix;
+    // Too large for a signed integer?
+    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
+    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
+    vValue = _mm_and_ps(vValue,vMask);
+    // Perform fixup only on numbers too large (Keeps low bit precision)
+    vResult = _mm_sub_ps(vResult,vValue);
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Convert from signed to unsigned only if greater than 0x80000000
+    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
+    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
+    // On those that are too large, set to 0xFFFFFFFF
+    vResult = _mm_or_ps(vResult,vOverflow);
+    return vResult;
+#endif
+}
+
+#pragma warning(pop)
+
+/****************************************************************************
+ *
+ * Vector and matrix load operations
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt(const uint32_t* pSource)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = *pSource;
+    V.vector4_u32[1] = 0;
+    V.vector4_u32[2] = 0;
+    V.vector4_u32[3] = 0;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t zero = vdupq_n_u32(0);
+    return vld1q_lane_u32( pSource, zero, 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_load_ss( reinterpret_cast<const float*>(pSource) );
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat(const float* pSource)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = *pSource;
+    V.vector4_f32[1] = 0.f;
+    V.vector4_f32[2] = 0.f;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t zero = vdupq_n_f32(0);
+    return vld1q_lane_f32( pSource, zero, 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_load_ss( pSource );
+#endif
+}
+
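Editor's note: these scalar loads guarantee the unused lanes are zero, which callers can rely on. A short usage sketch (variable names are local to this example):

    #include <DirectXMath.h>
    using namespace DirectX;

    XMVECTOR SplatTimeStep()
    {
        float timeStep = 0.016f;
        XMVECTOR v = XMLoadFloat(&timeStep);  // { 0.016f, 0, 0, 0 }
        return XMVectorSplatX(v);             // broadcast x to all four lanes
    }
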
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt2
+(
+    const uint32_t* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = pSource[0];
+    V.vector4_u32[1] = pSource[1];
+    V.vector4_u32[2] = 0;
+    V.vector4_u32[3] = 0;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t x = vld1_u32( pSource );
+    uint32x2_t zero = vdup_n_u32(0);
+    return vcombine_u32( x, zero );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
+    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) );
+    return _mm_unpacklo_ps( x, y );
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt2A
+(
+    const uint32_t* pSource
+)
+{
+    assert(pSource);
+    assert(((uintptr_t)pSource & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = pSource[0];
+    V.vector4_u32[1] = pSource[1];
+    V.vector4_u32[2] = 0;
+    V.vector4_u32[3] = 0;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t x = vld1_u32_ex( pSource, 64 );
+    uint32x2_t zero = vdup_n_u32(0);
+    return vcombine_u32( x, zero );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat2
+(
+    const XMFLOAT2* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = pSource->x;
+    V.vector4_f32[1] = pSource->y;
+    V.vector4_f32[2] = 0.f;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t x = vld1_f32( reinterpret_cast<const float*>(pSource) );
+    float32x2_t zero = vdup_n_f32(0);
+    return vcombine_f32( x, zero );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 x = _mm_load_ss( &pSource->x );
+    __m128 y = _mm_load_ss( &pSource->y );
+    return _mm_unpacklo_ps( x, y );
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat2A
+(
+    const XMFLOAT2A* pSource
+)
+{
+    assert(pSource);
+    assert(((uintptr_t)pSource & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = pSource->x;
+    V.vector4_f32[1] = pSource->y;
+    V.vector4_f32[2] = 0.f;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t x = vld1_f32_ex( reinterpret_cast<const float*>(pSource), 64 );
+    float32x2_t zero = vdup_n_f32(0);
+    return vcombine_f32( x, zero );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadSInt2
+(
+    const XMINT2* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = (float)pSource->x;
+    V.vector4_f32[1] = (float)pSource->y;
+    V.vector4_f32[2] = 0.f;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    int32x2_t x = vld1_s32( reinterpret_cast<const int32_t*>(pSource) );
+    float32x2_t v = vcvt_f32_s32( x );
+    float32x2_t zero = vdup_n_f32(0);
+    return vcombine_f32( v, zero );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
+    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
+    __m128 V = _mm_unpacklo_ps( x, y );
+    return _mm_cvtepi32_ps(_mm_castps_si128(V));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadUInt2
+(
+    const XMUINT2* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = (float)pSource->x;
+    V.vector4_f32[1] = (float)pSource->y;
+    V.vector4_f32[2] = 0.f;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) );
+    float32x2_t v = vcvt_f32_u32( x );
+    float32x2_t zero = vdup_n_f32(0);
+    return vcombine_f32( v, zero );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
+    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
+    __m128 V = _mm_unpacklo_ps( x, y );
+    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
+    // Determine which ones need the fix.
+    XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
+    // Force all values positive
+    XMVECTOR vResult = _mm_xor_ps(V,vMask);
+    // Convert to floats
+    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
+    // Convert 0x80000000 -> 0xFFFFFFFF
+    __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
+    // For only the ones that are too big, add the fixup
+    vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
+    vResult = _mm_add_ps(vResult,vMask);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt3
+(
+    const uint32_t* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = pSource[0];
+    V.vector4_u32[1] = pSource[1];
+    V.vector4_u32[2] = pSource[2];
+    V.vector4_u32[3] = 0;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t x = vld1_u32( pSource );
+    uint32x2_t zero = vdup_n_u32(0);
+    uint32x2_t y = vld1_lane_u32( pSource+2, zero, 0 );
+    return vcombine_u32( x, y );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
+    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) );
+    __m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) );
+    __m128 xy = _mm_unpacklo_ps( x, y );
+    return _mm_movelh_ps( xy, z );
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt3A
+(
+    const uint32_t* pSource
+)
+{
+    assert(pSource);
+    assert(((uintptr_t)pSource & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = pSource[0];
+    V.vector4_u32[1] = pSource[1];
+    V.vector4_u32[2] = pSource[2];
+    V.vector4_u32[3] = 0;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Reads an extra integer which is zero'd
+    uint32x4_t V = vld1q_u32_ex( pSource, 128 );
+    return vsetq_lane_u32( 0, V, 3 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Reads an extra integer which is zero'd
+    __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) );
+    V = _mm_and_si128( V, g_XMMask3 );
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat3
+(
+    const XMFLOAT3* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = pSource->x;
+    V.vector4_f32[1] = pSource->y;
+    V.vector4_f32[2] = pSource->z;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t x = vld1_f32( reinterpret_cast<const float*>(pSource) );
+    float32x2_t zero = vdup_n_f32(0);
+    float32x2_t y = vld1_lane_f32( reinterpret_cast<const float*>(pSource)+2, zero, 0 );
+    return vcombine_f32( x, y );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 x = _mm_load_ss( &pSource->x );
+    __m128 y = _mm_load_ss( &pSource->y );
+    __m128 z = _mm_load_ss( &pSource->z );
+    __m128 xy = _mm_unpacklo_ps( x, y );
+    return _mm_movelh_ps( xy, z );
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat3A
+(
+    const XMFLOAT3A* pSource
+)
+{
+    assert(pSource);
+    assert(((uintptr_t)pSource & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = pSource->x;
+    V.vector4_f32[1] = pSource->y;
+    V.vector4_f32[2] = pSource->z;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Reads an extra float which is zero'd
+    float32x4_t V = vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 );
+    return vsetq_lane_f32( 0, V, 3 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Reads an extra float which is zero'd
+    __m128 V = _mm_load_ps( &pSource->x );
+    return _mm_and_ps( V, g_XMMask3 );
+#endif
+}
+
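Editor's note: XMLoadFloat3A deliberately reads 16 bytes and masks off the fourth lane, while the unaligned XMLoadFloat3 composes three scalar loads instead. The over-read is safe because a 16-byte read from a 16-byte-aligned address cannot cross a page boundary; it still requires the aligned type's contract, as this sketch notes:

    #include <DirectXMath.h>
    using namespace DirectX;

    XMVECTOR LoadPosition(const XMFLOAT3A& p)
    {
        // Safe: the 16-byte alignment of XMFLOAT3A keeps the extra lane's
        // read inside the same 16-byte line; a bare 12-byte float[3] passed
        // through a cast would not carry that guarantee.
        return XMLoadFloat3A(&p);
    }
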
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadSInt3
+(
+    const XMINT3* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR V;
+    V.vector4_f32[0] = (float)pSource->x;
+    V.vector4_f32[1] = (float)pSource->y;
+    V.vector4_f32[2] = (float)pSource->z;
+    V.vector4_f32[3] = 0.f;
+    return V;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    int32x2_t x = vld1_s32( reinterpret_cast<const int32_t*>(pSource) );
+    int32x2_t zero = vdup_n_s32(0);
+    int32x2_t y = vld1_lane_s32( reinterpret_cast<const int32_t*>(pSource)+2, zero, 0 );
+    int32x4_t v = vcombine_s32( x, y );
+    return vcvtq_f32_s32( v );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
+    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
+    __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) );
+    __m128 xy = _mm_unpacklo_ps( x, y );
+    __m128 V = _mm_movelh_ps( xy, z );
+    return _mm_cvtepi32_ps(_mm_castps_si128(V));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadUInt3
+(
+    const XMUINT3* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = (float)pSource->x;
+    V.vector4_f32[1] = (float)pSource->y;
+    V.vector4_f32[2] = (float)pSource->z;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) );
+    uint32x2_t zero = vdup_n_u32(0);
+    uint32x2_t y = vld1_lane_u32( reinterpret_cast<const uint32_t*>(pSource)+2, zero, 0 );
+    uint32x4_t v = vcombine_u32( x, y );
+    return vcvtq_f32_u32( v );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
+    __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
+    __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) );
+    __m128 xy = _mm_unpacklo_ps( x, y );
+    __m128 V = _mm_movelh_ps( xy, z );
+    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
+    // Determine which ones need the fix.
+    XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
+    // Force all values positive
+    XMVECTOR vResult = _mm_xor_ps(V,vMask);
+    // Convert to floats
+    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
+    // Convert 0x80000000 -> 0xFFFFFFFF
+    __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
+    // For only the ones that are too big, add the fixup
+    vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
+    vResult = _mm_add_ps(vResult,vMask);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt4
+(
+    const uint32_t* pSource
+)
+{
+    assert(pSource);
+
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = pSource[0];
+    V.vector4_u32[1] = pSource[1];
+    V.vector4_u32[2] = pSource[2];
+    V.vector4_u32[3] = pSource[3];
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_u32( pSource );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) );
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadInt4A
+(
+    const uint32_t* pSource
+)
+{
+    assert(pSource);
+    assert(((uintptr_t)pSource & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = pSource[0];
+    V.vector4_u32[1] = pSource[1];
+    V.vector4_u32[2] = pSource[2];
+    V.vector4_u32[3] = pSource[3];
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_u32_ex( pSource, 128 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) );
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat4
+(
+    const XMFLOAT4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = pSource->x;
+    V.vector4_f32[1] = pSource->y;
+    V.vector4_f32[2] = pSource->z;
+    V.vector4_f32[3] = pSource->w;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_f32( reinterpret_cast<const float*>(pSource) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_loadu_ps( &pSource->x );
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMLoadFloat4A
+(
+    const XMFLOAT4A* pSource
+)
+{
+    assert(pSource);
+    assert(((uintptr_t)pSource & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = pSource->x;
+    V.vector4_f32[1] = pSource->y;
+    V.vector4_f32[2] = pSource->z;
+    V.vector4_f32[3] = pSource->w;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_load_ps( &pSource->x );
+#endif
+}
+
reinterpret_cast<const __m128i*>(pSource) ); + return _mm_cvtepi32_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMLoadUInt4 +( + const XMUINT4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = (float)pSource->x; + V.vector4_f32[1] = (float)pSource->y; + V.vector4_f32[2] = (float)pSource->z; + V.vector4_f32[3] = (float)pSource->w; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t v = vld1q_u32( reinterpret_cast<const uint32_t*>(pSource) ); + return vcvtq_f32_u32( v ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) ); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. + XMVECTOR vMask = _mm_and_ps(_mm_castsi128_ps(V),g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(_mm_castsi128_ps(V),vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); + vResult = _mm_add_ps(vResult,vMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat3x3 +( + const XMFLOAT3X3* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + M.r[3].vector4_f32[0] = 0.0f; + M.r[3].vector4_f32[1] = 0.0f; + M.r[3].vector4_f32[2] = 0.0f; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t v0 = vld1q_f32( &pSource->m[0][0] ); + float32x4_t v1 = vld1q_f32( &pSource->m[1][1] ); + float32x2_t v2 = vcreate_f32( (uint64_t)*(const uint32_t*)&pSource->m[2][2] ); + float32x4_t T = vextq_f32( v0, v1, 3 ); + + XMMATRIX M; + M.r[0] = vandq_u32( v0, g_XMMask3 ); + M.r[1] = vandq_u32( T, g_XMMask3 ); + M.r[2] = vcombine_f32( vget_high_f32(v1), v2 ); + M.r[3] = g_XMIdentityR3; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 Z = _mm_setzero_ps(); + + __m128 V1 = _mm_loadu_ps( &pSource->m[0][0] ); + __m128 V2 = _mm_loadu_ps( &pSource->m[1][1] ); + __m128 V3 = _mm_load_ss( &pSource->m[2][2] ); + + __m128 T1 = _mm_unpackhi_ps( V1, Z ); + __m128 T2 = _mm_unpacklo_ps( V2, Z ); + __m128 T3 = _mm_shuffle_ps( V3, T2, _MM_SHUFFLE( 0, 1, 0, 0 ) ); + __m128 T4 = _mm_movehl_ps( T2, T3 ); + __m128 T5 = _mm_movehl_ps( Z, T1 ); + + XMMATRIX M; + M.r[0] = _mm_movelh_ps( V1, T1 ); + M.r[1] = _mm_add_ps( T4, T5 ); + M.r[2] = _mm_shuffle_ps( V2, V3, _MM_SHUFFLE(1, 0, 3, 2) ); + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x3 +( + const XMFLOAT4X3* pSource +) +{ + assert(pSource); +#if 
defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t v0 = vld1q_f32( &pSource->m[0][0] ); + float32x4_t v1 = vld1q_f32( &pSource->m[1][1] ); + float32x4_t v2 = vld1q_f32( &pSource->m[2][2] ); + + float32x4_t T1 = vextq_f32( v0, v1, 3 ); + float32x4_t T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) ); + float32x4_t T3 = vextq_f32( v2, v2, 1 ); + + XMMATRIX M; + M.r[0] = vandq_u32( v0, g_XMMask3 ); + M.r[1] = vandq_u32( T1, g_XMMask3 ); + M.r[2] = vandq_u32( T2, g_XMMask3 ); + M.r[3] = vsetq_lane_f32( 1.f, T3, 3 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + // Use unaligned load instructions to + // load the 12 floats + // vTemp1 = x1,y1,z1,x2 + XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]); + // vTemp2 = y2,z2,x3,y3 + XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]); + // vTemp4 = z3,x4,y4,z4 + XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]); + // vTemp3 = x3,y3,z3,z3 + XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2)); + // vTemp2 = y2,z2,x2,x2 + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0)); + // vTemp2 = x2,y2,z2,z2 + vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2)); + // vTemp1 = x1,y1,z1,0 + vTemp1 = _mm_and_ps(vTemp1,g_XMMask3); + // vTemp2 = x2,y2,z2,0 + vTemp2 = _mm_and_ps(vTemp2,g_XMMask3); + // vTemp3 = x3,y3,z3,0 + vTemp3 = _mm_and_ps(vTemp3,g_XMMask3); + // vTemp4i = x4,y4,z4,0 + __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8); + // vTemp4i = x4,y4,z4,1.0f + vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3); + XMMATRIX M(vTemp1, + vTemp2, + vTemp3, + _mm_castsi128_ps(vTemp4i)); + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A +( + const XMFLOAT4X3A* pSource +) +{ + assert(pSource); + assert(((uintptr_t)pSource & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t v0 = vld1q_f32_ex( &pSource->m[0][0], 128 ); + float32x4_t v1 = vld1q_f32_ex( &pSource->m[1][1], 128 ); + float32x4_t v2 = vld1q_f32_ex( &pSource->m[2][2], 128 ); + + float32x4_t T1 = 
vextq_f32( v0, v1, 3 ); + float32x4_t T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) ); + float32x4_t T3 = vextq_f32( v2, v2, 1 ); + + XMMATRIX M; + M.r[0] = vandq_u32( v0, g_XMMask3 ); + M.r[1] = vandq_u32( T1, g_XMMask3 ); + M.r[2] = vandq_u32( T2, g_XMMask3 ); + M.r[3] = vsetq_lane_f32( 1.f, T3, 3 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + // Use aligned load instructions to + // load the 12 floats + // vTemp1 = x1,y1,z1,x2 + XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]); + // vTemp2 = y2,z2,x3,y3 + XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]); + // vTemp4 = z3,x4,y4,z4 + XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]); + // vTemp3 = x3,y3,z3,z3 + XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2)); + // vTemp2 = y2,z2,x2,x2 + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0)); + // vTemp2 = x2,y2,z2,z2 + vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2)); + // vTemp1 = x1,y1,z1,0 + vTemp1 = _mm_and_ps(vTemp1,g_XMMask3); + // vTemp2 = x2,y2,z2,0 + vTemp2 = _mm_and_ps(vTemp2,g_XMMask3); + // vTemp3 = x3,y3,z3,0 + vTemp3 = _mm_and_ps(vTemp3,g_XMMask3); + // vTemp4i = x4,y4,z4,0 + __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8); + // vTemp4i = x4,y4,z4,1.0f + vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3); + XMMATRIX M(vTemp1, + vTemp2, + vTemp3, + _mm_castsi128_ps(vTemp4i)); + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x4 +( + const XMFLOAT4X4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = pSource->m[0][3]; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = pSource->m[1][3]; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = pSource->m[2][3]; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = pSource->m[3][3]; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_11) ); + M.r[1] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_21) ); + M.r[2] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_31) ); + M.r[3] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_41) ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_loadu_ps( &pSource->_11 ); + M.r[1] = _mm_loadu_ps( &pSource->_21 ); + M.r[2] = _mm_loadu_ps( &pSource->_31 ); + M.r[3] = _mm_loadu_ps( &pSource->_41 ); + return M; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMLoadFloat4x4A +( + const XMFLOAT4X4A* pSource +) +{ + assert(pSource); + assert(((uintptr_t)pSource & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = pSource->m[0][3]; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 
pSource->m[1][3]; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = pSource->m[2][3]; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = pSource->m[3][3]; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_11), 128 ); + M.r[1] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_21), 128 ); + M.r[2] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_31), 128 ); + M.r[3] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_41), 128 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_load_ps( &pSource->_11 ); + M.r[1] = _mm_load_ps( &pSource->_21 ); + M.r[2] = _mm_load_ps( &pSource->_31 ); + M.r[3] = _mm_load_ps( &pSource->_41 ); + return M; +#endif +} + +/**************************************************************************** + * + * Vector and matrix store operations + * + ****************************************************************************/ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + *pDestination = XMVectorGetIntX( V ); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_u32( pDestination, *reinterpret_cast<const uint32x4_t*>(&V), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ss( reinterpret_cast<float*>(pDestination), V ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat +( + float* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + *pDestination = XMVectorGetX( V ); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_f32( pDestination, V, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ss( pDestination, V ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt2 +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(V); + vst1_u32( pDestination, VL ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination[0]), V ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt2A +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(V); + vst1_u32_ex( pDestination, VL, 64 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat2 +( + XMFLOAT2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + 
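// Only the x and y lanes of V are written; z and w are ignored. In the SSE + // path below, _mm_store_ss always stores lane 0, so lane 1 is first splatted + // into a temporary with XM_PERMUTE_PS before the second scalar store. + 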
pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + vst1_f32( reinterpret_cast<float*>(pDestination), VL ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( &pDestination->x, V ); + _mm_store_ss( &pDestination->y, T ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat2A +( + XMFLOAT2A* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreSInt2 +( + XMINT2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (int32_t)V.vector4_f32[0]; + pDestination->y = (int32_t)V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x2_t v = vget_low_s32(V); + v = vcvt_s32_f32( v ); + vst1_s32( reinterpret_cast<int32_t*>(pDestination), v ); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow,vResult); + // Write two ints + XMVECTOR T = XM_PERMUTE_PS( vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUInt2 +( + XMUINT2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (uint32_t)V.vector4_f32[0]; + pDestination->y = (uint32_t)V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t v = vget_low_f32(V); + uint32x2_t iv = vcvt_u32_f32( v ); + vst1_u32( reinterpret_cast<uint32_t*>(pDestination), iv ); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
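+ // In outline: _mm_cvttps_epi32 performs only a *signed* conversion and yields + // the indefinite value 0x80000000 for out-of-range inputs, so values in the + // range [2^31, 2^32) are first reduced by 2^31 (g_XMUnsignedFix) and the high + // bit is folded back in after the conversion, as follows.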
+ XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); + // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue,vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult,vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned only if greater than 0x80000000 + vMask = _mm_and_ps(vMask,g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult,vOverflow); + // Write two uints + XMVECTOR T = XM_PERMUTE_PS( vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt3 +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(V); + vst1_u32( pDestination, VL ); + vst1q_lane_u32( pDestination+2, *reinterpret_cast<const uint32x4_t*>(&V), 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( reinterpret_cast<float*>(pDestination), V ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T1 ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T2 ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt3A +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t VL = vget_low_u32(V); + vst1_u32_ex( pDestination, VL, 64 ); + vst1q_lane_u32( pDestination+2, *reinterpret_cast<const uint32x4_t*>(&V), 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3 +( + XMFLOAT3* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + vst1_f32( reinterpret_cast<float*>(pDestination), VL ); + vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( &pDestination->x, V ); + _mm_store_ss( &pDestination->y, T1 ); + _mm_store_ss( &pDestination->z, T2 ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3A +( + XMFLOAT3A* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + 
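// XMFLOAT3A is declared with 16-byte alignment; the assert below validates + // that before the alignment-hinted NEON store (vst1_f32_ex) and the 64-bit + // SSE store (_mm_storel_epi64) are issued. + 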
assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 ); + vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); + _mm_store_ss( &pDestination->z, T ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreSInt3 +( + XMINT3* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (int32_t)V.vector4_f32[0]; + pDestination->y = (int32_t)V.vector4_f32[1]; + pDestination->z = (int32_t)V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t v = vcvtq_s32_f32(V); + int32x2_t vL = vget_low_s32(v); + vst1_s32( reinterpret_cast<int32_t*>(pDestination), vL ); + vst1q_lane_s32( reinterpret_cast<int32_t*>(pDestination)+2, v, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow,vResult); + // Write 3 ints + XMVECTOR T1 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUInt3 +( + XMUINT3* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (uint32_t)V.vector4_f32[0]; + pDestination->y = (uint32_t)V.vector4_f32[1]; + pDestination->z = (uint32_t)V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t v = vcvtq_u32_f32(V); + uint32x2_t vL = vget_low_u32(v); + vst1_u32( reinterpret_cast<uint32_t*>(pDestination), vL ); + vst1q_lane_u32( reinterpret_cast<uint32_t*>(pDestination)+2, v, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
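+ // Worked example for one lane, assuming an input of 3221225472.0f (0xC0000000): + // it is >= 2^31, so 2^31 is subtracted giving 1073741824.0f; _mm_cvttps_epi32 + // then yields 0x40000000, and XOR-ing the sign bit back in restores 0xC0000000.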
+ XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); + // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue,vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult,vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned only if greater than 0x80000000 + vMask = _mm_and_ps(vMask,g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult,vOverflow); + // Write 3 uints + XMVECTOR T1 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt4 +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; + pDestination[3] = V.vector4_u32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_u32( pDestination, V ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreInt4A +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; + pDestination[3] = V.vector4_u32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_u32_ex( pDestination, V, 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4 +( + XMFLOAT4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; + pDestination->w = V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32( reinterpret_cast<float*>(pDestination), V ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storeu_ps( &pDestination->x, V ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4A +( + XMFLOAT4A* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; + pDestination->w = V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32_ex( reinterpret_cast<float*>(pDestination), V, 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ps( &pDestination->x, V ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV 
XMStoreSInt4 +( + XMINT4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (int32_t)V.vector4_f32[0]; + pDestination->y = (int32_t)V.vector4_f32[1]; + pDestination->z = (int32_t)V.vector4_f32[2]; + pDestination->w = (int32_t)V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t v = vcvtq_s32_f32(V); + vst1q_s32( reinterpret_cast<int32_t*>(pDestination), v ); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow,vResult); + _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vOverflow) ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreUInt4 +( + XMUINT4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (uint32_t)V.vector4_f32[0]; + pDestination->y = (uint32_t)V.vector4_f32[1]; + pDestination->z = (uint32_t)V.vector4_f32[2]; + pDestination->w = (uint32_t)V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t v = vcvtq_u32_f32(V); + vst1q_u32( reinterpret_cast<uint32_t*>(pDestination), v ); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
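+ // NEON has a native float-to-uint32 conversion (vcvtq_u32_f32, used above); + // plain SSE does not (an unsigned convert only arrives with AVX-512's + // _mm_cvttps_epu32), hence the same clamp-and-fixup sequence once more.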
+ XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); + // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue,vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult,vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned only if greater than 0x80000000 + vMask = _mm_and_ps(vMask,g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult,vOverflow); + _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vResult) ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat3x3 +( + XMFLOAT3X3* pDestination, + FXMMATRIX M +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t T1 = vextq_f32( M.r[0], M.r[1], 1 ); + float32x4_t T2 = vbslq_f32( g_XMMask3, M.r[0], T1 ); + vst1q_f32( &pDestination->m[0][0], T2 ); + + T1 = vextq_f32( M.r[1], M.r[1], 1 ); + T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) ); + vst1q_f32( &pDestination->m[1][1], T2 ); + + vst1q_lane_f32( &pDestination->m[2][2], M.r[2], 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = M.r[0]; + XMVECTOR vTemp2 = M.r[1]; + XMVECTOR vTemp3 = M.r[2]; + XMVECTOR vWork = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,0,2,2)); + vTemp1 = _mm_shuffle_ps(vTemp1,vWork,_MM_SHUFFLE(2,0,1,0)); + _mm_storeu_ps(&pDestination->m[0][0],vTemp1); + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1)); + _mm_storeu_ps(&pDestination->m[1][1],vTemp2); + vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss(&pDestination->m[2][2],vTemp3); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x3 +( + XMFLOAT4X3* pDestination, + FXMMATRIX M +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t T1 = vextq_f32( M.r[0], M.r[1], 1 ); + float32x4_t T2 = vbslq_f32( g_XMMask3, M.r[0], T1 ); + vst1q_f32( &pDestination->m[0][0], T2 ); + + T1 = vextq_f32( M.r[1], M.r[1], 1 ); + T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) ); + vst1q_f32( &pDestination->m[1][1], T2 ); + + T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 
0 ); + T2 = vextq_f32( T1, M.r[3], 3 ); + vst1q_f32( &pDestination->m[2][2], T2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = M.r[0]; + XMVECTOR vTemp2 = M.r[1]; + XMVECTOR vTemp3 = M.r[2]; + XMVECTOR vTemp4 = M.r[3]; + XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1)); + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(2,2,0,0)); + vTemp1 = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,2,1,0)); + vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2)); + vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0)); + _mm_storeu_ps(&pDestination->m[0][0],vTemp1); + _mm_storeu_ps(&pDestination->m[1][1],vTemp2x); + _mm_storeu_ps(&pDestination->m[2][2],vTemp3); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x3A +( + XMFLOAT4X3A* pDestination, + FXMMATRIX M +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t T1 = vextq_f32( M.r[0], M.r[1], 1 ); + float32x4_t T2 = vbslq_f32( g_XMMask3, M.r[0], T1 ); + vst1q_f32_ex( &pDestination->m[0][0], T2, 128 ); + + T1 = vextq_f32( M.r[1], M.r[1], 1 ); + T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) ); + vst1q_f32_ex( &pDestination->m[1][1], T2, 128 ); + + T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 ); + T2 = vextq_f32( T1, M.r[3], 3 ); + vst1q_f32_ex( &pDestination->m[2][2], T2, 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + // x1,y1,z1,w1 + XMVECTOR vTemp1 = M.r[0]; + // x2,y2,z2,w2 + XMVECTOR vTemp2 = M.r[1]; + // x3,y3,z3,w3 + XMVECTOR vTemp3 = M.r[2]; + // x4,y4,z4,w4 + XMVECTOR vTemp4 = M.r[3]; + // z1,z1,x2,y2 + XMVECTOR vTemp = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(1,0,2,2)); + // y2,z2,x3,y3 (Final) + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1)); + // x1,y1,z1,x2 (Final) + vTemp1 = _mm_shuffle_ps(vTemp1,vTemp,_MM_SHUFFLE(2,0,1,0)); + // z3,z3,x4,x4 + vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2)); + // z3,x4,y4,z4 (Final) + vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0)); + // Store in 3 operations + _mm_store_ps(&pDestination->m[0][0],vTemp1); + _mm_store_ps(&pDestination->m[1][1],vTemp2); + _mm_store_ps(&pDestination->m[2][2],vTemp3); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x4 +( + XMFLOAT4X4* pDestination, + FXMMATRIX M +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + pDestination->m[0][3] = M.r[0].vector4_f32[3]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + 
pDestination->m[1][3] = M.r[1].vector4_f32[3]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[2].vector4_f32[3]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + pDestination->m[3][3] = M.r[3].vector4_f32[3]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32( reinterpret_cast<float*>(&pDestination->_11), M.r[0] ); + vst1q_f32( reinterpret_cast<float*>(&pDestination->_21), M.r[1] ); + vst1q_f32( reinterpret_cast<float*>(&pDestination->_31), M.r[2] ); + vst1q_f32( reinterpret_cast<float*>(&pDestination->_41), M.r[3] ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storeu_ps( &pDestination->_11, M.r[0] ); + _mm_storeu_ps( &pDestination->_21, M.r[1] ); + _mm_storeu_ps( &pDestination->_31, M.r[2] ); + _mm_storeu_ps( &pDestination->_41, M.r[3] ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMStoreFloat4x4A +( + XMFLOAT4X4A* pDestination, + FXMMATRIX M +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + pDestination->m[0][3] = M.r[0].vector4_f32[3]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + pDestination->m[1][3] = M.r[1].vector4_f32[3]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[2].vector4_f32[3]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + pDestination->m[3][3] = M.r[3].vector4_f32[3]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_11), M.r[0], 128 ); + vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_21), M.r[1], 128 ); + vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_31), M.r[2], 128 ); + vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_41), M.r[3], 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ps( &pDestination->_11, M.r[0] ); + _mm_store_ps( &pDestination->_21, M.r[1] ); + _mm_store_ps( &pDestination->_31, M.r[2] ); + _mm_store_ps( &pDestination->_41, M.r[3] ); +#endif +} + diff --git a/Inc/DirectXMathMatrix.inl b/Inc/DirectXMathMatrix.inl index 79157f3..5257938 100644 --- a/Inc/DirectXMathMatrix.inl +++ b/Inc/DirectXMathMatrix.inl @@ -1,3306 +1,3306 @@ -//------------------------------------------------------------------------------------- -// DirectXMathMatrix.inl -- SIMD C++ Math library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. 
-// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#pragma once - -/**************************************************************************** - * - * Matrix - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ -// Comparison operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -// Return true if any entry in the matrix is NaN -inline bool XM_CALLCONV XMMatrixIsNaN -( - FXMMATRIX M -) -{ -#if defined(_XM_NO_INTRINSICS_) - size_t i = 16; - const uint32_t *pWork = (const uint32_t *)(&M.m[0][0]); - do { - // Fetch value into integer unit - uint32_t uTest = pWork[0]; - // Remove sign - uTest &= 0x7FFFFFFFU; - // NaN is 0x7F800001 through 0x7FFFFFFF inclusive - uTest -= 0x7F800001U; - if (uTest<0x007FFFFFU) { - break; // NaN found - } - ++pWork; // Next entry - } while (--i); - return (i!=0); // i == 0 if nothing matched -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Load in registers - XMVECTOR vX = M.r[0]; - XMVECTOR vY = M.r[1]; - XMVECTOR vZ = M.r[2]; - XMVECTOR vW = M.r[3]; - // Test themselves to check for NaN - vX = vmvnq_u32(vceqq_f32(vX, vX)); - vY = vmvnq_u32(vceqq_f32(vY, vY)); - vZ = vmvnq_u32(vceqq_f32(vZ, vZ)); - vW = vmvnq_u32(vceqq_f32(vW, vW)); - // Or all the results - vX = vorrq_u32(vX,vZ); - vY = vorrq_u32(vY,vW); - vX = vorrq_u32(vX,vY); - // If any tested true, return true - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vX), vget_high_u8(vX)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - return (r != 0); -#elif defined(_XM_SSE_INTRINSICS_) - // Load in registers - XMVECTOR vX = M.r[0]; - XMVECTOR vY = M.r[1]; - XMVECTOR vZ = M.r[2]; - XMVECTOR vW = M.r[3]; - // Test themselves to check for NaN - vX = _mm_cmpneq_ps(vX,vX); - vY = _mm_cmpneq_ps(vY,vY); - vZ = _mm_cmpneq_ps(vZ,vZ); - vW = _mm_cmpneq_ps(vW,vW); - // Or all the results - vX = _mm_or_ps(vX,vZ); - vY = _mm_or_ps(vY,vW); - vX = _mm_or_ps(vX,vY); - // If any tested true, return true - return (_mm_movemask_ps(vX)!=0); -#else -#endif -} - -//------------------------------------------------------------------------------ - -// Return true if any entry in the matrix is +/-INF -inline bool XM_CALLCONV XMMatrixIsInfinite -( - FXMMATRIX M -) -{ -#if defined(_XM_NO_INTRINSICS_) - size_t i = 16; - const uint32_t *pWork = (const uint32_t *)(&M.m[0][0]); - do { - // Fetch value into integer unit - uint32_t uTest = pWork[0]; - // Remove sign - uTest &= 0x7FFFFFFFU; - // INF is 0x7F800000 - if (uTest==0x7F800000U) { - break; // INF found - } - ++pWork; // Next entry - } while (--i); - return (i!=0); // i == 0 if nothing matched -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Mask off the sign bits - XMVECTOR vTemp1 = vandq_u32(M.r[0],g_XMAbsMask); - XMVECTOR vTemp2 = vandq_u32(M.r[1],g_XMAbsMask); - XMVECTOR vTemp3 = vandq_u32(M.r[2],g_XMAbsMask); - XMVECTOR vTemp4 = vandq_u32(M.r[3],g_XMAbsMask); - // Compare to infinity - vTemp1 = vceqq_f32(vTemp1,g_XMInfinity); - vTemp2 = vceqq_f32(vTemp2,g_XMInfinity); - vTemp3 = vceqq_f32(vTemp3,g_XMInfinity); - vTemp4 = vceqq_f32(vTemp4,g_XMInfinity); - // Or the answers together - vTemp1 = vorrq_u32(vTemp1,vTemp2); - vTemp3 = vorrq_u32(vTemp3,vTemp4); - vTemp1 = vorrq_u32(vTemp1,vTemp3); - // If any are infinity, 
the signs are true. - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - return (r != 0); -#elif defined(_XM_SSE_INTRINSICS_) - // Mask off the sign bits - XMVECTOR vTemp1 = _mm_and_ps(M.r[0],g_XMAbsMask); - XMVECTOR vTemp2 = _mm_and_ps(M.r[1],g_XMAbsMask); - XMVECTOR vTemp3 = _mm_and_ps(M.r[2],g_XMAbsMask); - XMVECTOR vTemp4 = _mm_and_ps(M.r[3],g_XMAbsMask); - // Compare to infinity - vTemp1 = _mm_cmpeq_ps(vTemp1,g_XMInfinity); - vTemp2 = _mm_cmpeq_ps(vTemp2,g_XMInfinity); - vTemp3 = _mm_cmpeq_ps(vTemp3,g_XMInfinity); - vTemp4 = _mm_cmpeq_ps(vTemp4,g_XMInfinity); - // Or the answers together - vTemp1 = _mm_or_ps(vTemp1,vTemp2); - vTemp3 = _mm_or_ps(vTemp3,vTemp4); - vTemp1 = _mm_or_ps(vTemp1,vTemp3); - // If any are infinity, the signs are true. - return (_mm_movemask_ps(vTemp1)!=0); -#endif -} - -//------------------------------------------------------------------------------ - -// Return true if the XMMatrix is equal to identity -inline bool XM_CALLCONV XMMatrixIsIdentity -( - FXMMATRIX M -) -{ -#if defined(_XM_NO_INTRINSICS_) - // Use the integer pipeline to reduce branching to a minimum - const uint32_t *pWork = (const uint32_t*)(&M.m[0][0]); - // Convert 1.0f to zero and or them together - uint32_t uOne = pWork[0]^0x3F800000U; - // Or all the 0.0f entries together - uint32_t uZero = pWork[1]; - uZero |= pWork[2]; - uZero |= pWork[3]; - // 2nd row - uZero |= pWork[4]; - uOne |= pWork[5]^0x3F800000U; - uZero |= pWork[6]; - uZero |= pWork[7]; - // 3rd row - uZero |= pWork[8]; - uZero |= pWork[9]; - uOne |= pWork[10]^0x3F800000U; - uZero |= pWork[11]; - // 4th row - uZero |= pWork[12]; - uZero |= pWork[13]; - uZero |= pWork[14]; - uOne |= pWork[15]^0x3F800000U; - // If all zero entries are zero, the uZero==0 - uZero &= 0x7FFFFFFF; // Allow -0.0f - // If all 1.0f entries are 1.0f, then uOne==0 - uOne |= uZero; - return (uOne==0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR vTemp1 = vceqq_f32(M.r[0],g_XMIdentityR0); - XMVECTOR vTemp2 = vceqq_f32(M.r[1],g_XMIdentityR1); - XMVECTOR vTemp3 = vceqq_f32(M.r[2],g_XMIdentityR2); - XMVECTOR vTemp4 = vceqq_f32(M.r[3],g_XMIdentityR3); - vTemp1 = vandq_u32(vTemp1,vTemp2); - vTemp3 = vandq_u32(vTemp3,vTemp4); - vTemp1 = vandq_u32(vTemp1,vTemp3); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - return ( r == 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp1 = _mm_cmpeq_ps(M.r[0],g_XMIdentityR0); - XMVECTOR vTemp2 = _mm_cmpeq_ps(M.r[1],g_XMIdentityR1); - XMVECTOR vTemp3 = _mm_cmpeq_ps(M.r[2],g_XMIdentityR2); - XMVECTOR vTemp4 = _mm_cmpeq_ps(M.r[3],g_XMIdentityR3); - vTemp1 = _mm_and_ps(vTemp1,vTemp2); - vTemp3 = _mm_and_ps(vTemp3,vTemp4); - vTemp1 = _mm_and_ps(vTemp1,vTemp3); - return (_mm_movemask_ps(vTemp1)==0x0f); -#endif -} - -//------------------------------------------------------------------------------ -// Computation operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ -// Perform a 4x4 matrix multiply by a 4x4 matrix -inline XMMATRIX XM_CALLCONV XMMatrixMultiply -( - FXMMATRIX M1, - CXMMATRIX M2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMMATRIX mResult; - // Cache the invariants in registers - float x = M1.m[0][0]; - float y = M1.m[0][1]; - float z = M1.m[0][2]; 
- float w = M1.m[0][3]; - // Perform the operation on the first row - mResult.m[0][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); - mResult.m[0][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); - mResult.m[0][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); - mResult.m[0][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); - // Repeat for all the other rows - x = M1.m[1][0]; - y = M1.m[1][1]; - z = M1.m[1][2]; - w = M1.m[1][3]; - mResult.m[1][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); - mResult.m[1][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); - mResult.m[1][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); - mResult.m[1][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); - x = M1.m[2][0]; - y = M1.m[2][1]; - z = M1.m[2][2]; - w = M1.m[2][3]; - mResult.m[2][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); - mResult.m[2][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); - mResult.m[2][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); - mResult.m[2][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); - x = M1.m[3][0]; - y = M1.m[3][1]; - z = M1.m[3][2]; - w = M1.m[3][3]; - mResult.m[3][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); - mResult.m[3][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); - mResult.m[3][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); - mResult.m[3][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); - return mResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMMATRIX mResult; - float32x2_t VL = vget_low_f32( M1.r[0] ); - float32x2_t VH = vget_high_f32( M1.r[0] ); - // Perform the operation on the first row - XMVECTOR vX = vmulq_lane_f32(M2.r[0], VL, 0); - XMVECTOR vY = vmulq_lane_f32(M2.r[1], VL, 1); - XMVECTOR vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); - XMVECTOR vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); - mResult.r[0] = vaddq_f32( vZ, vW ); - // Repeat for the other 3 rows - VL = vget_low_f32( M1.r[1] ); - VH = vget_high_f32( M1.r[1] ); - vX = vmulq_lane_f32(M2.r[0], VL, 0); - vY = vmulq_lane_f32(M2.r[1], VL, 1); - vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); - vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); - mResult.r[1] = vaddq_f32( vZ, vW ); - VL = vget_low_f32( M1.r[2] ); - VH = vget_high_f32( M1.r[2] ); - vX = vmulq_lane_f32(M2.r[0], VL, 0); - vY = vmulq_lane_f32(M2.r[1], VL, 1); - vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); - vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); - mResult.r[2] = vaddq_f32( vZ, vW ); - VL = vget_low_f32( M1.r[3] ); - VH = vget_high_f32( M1.r[3] ); - vX = vmulq_lane_f32(M2.r[0], VL, 0); - vY = vmulq_lane_f32(M2.r[1], VL, 1); - vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); - vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); - mResult.r[3] = vaddq_f32( vZ, vW ); - return mResult; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX mResult; - // Use vW to hold the original row - XMVECTOR vW = M1.r[0]; - // Splat the component X,Y,Z then W - XMVECTOR vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); - XMVECTOR vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); - XMVECTOR vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); - vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); - // Perform the operation on the first row - vX = _mm_mul_ps(vX,M2.r[0]); - vY = _mm_mul_ps(vY,M2.r[1]); - vZ = _mm_mul_ps(vZ,M2.r[2]); - vW = _mm_mul_ps(vW,M2.r[3]); - // Perform a binary add to reduce cumulative errors - vX = _mm_add_ps(vX,vZ); - vY = 
_mm_add_ps(vY,vW); - vX = _mm_add_ps(vX,vY); - mResult.r[0] = vX; - // Repeat for the other 3 rows - vW = M1.r[1]; - vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); - vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); - vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vY = _mm_mul_ps(vY,M2.r[1]); - vZ = _mm_mul_ps(vZ,M2.r[2]); - vW = _mm_mul_ps(vW,M2.r[3]); - vX = _mm_add_ps(vX,vZ); - vY = _mm_add_ps(vY,vW); - vX = _mm_add_ps(vX,vY); - mResult.r[1] = vX; - vW = M1.r[2]; - vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); - vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); - vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vY = _mm_mul_ps(vY,M2.r[1]); - vZ = _mm_mul_ps(vZ,M2.r[2]); - vW = _mm_mul_ps(vW,M2.r[3]); - vX = _mm_add_ps(vX,vZ); - vY = _mm_add_ps(vY,vW); - vX = _mm_add_ps(vX,vY); - mResult.r[2] = vX; - vW = M1.r[3]; - vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); - vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); - vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vY = _mm_mul_ps(vY,M2.r[1]); - vZ = _mm_mul_ps(vZ,M2.r[2]); - vW = _mm_mul_ps(vW,M2.r[3]); - vX = _mm_add_ps(vX,vZ); - vY = _mm_add_ps(vY,vW); - vX = _mm_add_ps(vX,vY); - mResult.r[3] = vX; - return mResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose -( - FXMMATRIX M1, - CXMMATRIX M2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMMATRIX mResult; - // Cache the invariants in registers - float x = M2.m[0][0]; - float y = M2.m[1][0]; - float z = M2.m[2][0]; - float w = M2.m[3][0]; - // Perform the operation on the first row - mResult.m[0][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); - mResult.m[0][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); - mResult.m[0][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); - mResult.m[0][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); - // Repeat for all the other rows - x = M2.m[0][1]; - y = M2.m[1][1]; - z = M2.m[2][1]; - w = M2.m[3][1]; - mResult.m[1][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); - mResult.m[1][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); - mResult.m[1][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); - mResult.m[1][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); - x = M2.m[0][2]; - y = M2.m[1][2]; - z = M2.m[2][2]; - w = M2.m[3][2]; - mResult.m[2][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); - mResult.m[2][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); - mResult.m[2][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); - mResult.m[2][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); - x = M2.m[0][3]; - y = M2.m[1][3]; - z = M2.m[2][3]; - w = M2.m[3][3]; - mResult.m[3][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); - mResult.m[3][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); - mResult.m[3][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); - mResult.m[3][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); - return mResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t VL = vget_low_f32( M1.r[0] ); - float32x2_t VH = vget_high_f32( M1.r[0] ); - // Perform the operation on the 
first row - XMVECTOR vX = vmulq_lane_f32(M2.r[0], VL, 0); - XMVECTOR vY = vmulq_lane_f32(M2.r[1], VL, 1); - XMVECTOR vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); - XMVECTOR vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); - float32x4_t r0 = vaddq_f32( vZ, vW ); - // Repeat for the other 3 rows - VL = vget_low_f32( M1.r[1] ); - VH = vget_high_f32( M1.r[1] ); - vX = vmulq_lane_f32(M2.r[0], VL, 0); - vY = vmulq_lane_f32(M2.r[1], VL, 1); - vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); - vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); - float32x4_t r1 = vaddq_f32( vZ, vW ); - VL = vget_low_f32( M1.r[2] ); - VH = vget_high_f32( M1.r[2] ); - vX = vmulq_lane_f32(M2.r[0], VL, 0); - vY = vmulq_lane_f32(M2.r[1], VL, 1); - vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); - vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); - float32x4_t r2 = vaddq_f32( vZ, vW ); - VL = vget_low_f32( M1.r[3] ); - VH = vget_high_f32( M1.r[3] ); - vX = vmulq_lane_f32(M2.r[0], VL, 0); - vY = vmulq_lane_f32(M2.r[1], VL, 1); - vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); - vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); - float32x4_t r3 = vaddq_f32( vZ, vW ); - - // Transpose result - float32x4x2_t P0 = vzipq_f32( r0, r2 ); - float32x4x2_t P1 = vzipq_f32( r1, r3 ); - - float32x4x2_t T0 = vzipq_f32( P0.val[0], P1.val[0] ); - float32x4x2_t T1 = vzipq_f32( P0.val[1], P1.val[1] ); - - XMMATRIX mResult; - mResult.r[0] = T0.val[0]; - mResult.r[1] = T0.val[1]; - mResult.r[2] = T1.val[0]; - mResult.r[3] = T1.val[1]; - return mResult; -#elif defined(_XM_SSE_INTRINSICS_) - // Use vW to hold the original row - XMVECTOR vW = M1.r[0]; - // Splat the component X,Y,Z then W - XMVECTOR vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); - XMVECTOR vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); - XMVECTOR vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); - vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); - // Perform the operation on the first row - vX = _mm_mul_ps(vX,M2.r[0]); - vY = _mm_mul_ps(vY,M2.r[1]); - vZ = _mm_mul_ps(vZ,M2.r[2]); - vW = _mm_mul_ps(vW,M2.r[3]); - // Perform a binary add to reduce cumulative errors - vX = _mm_add_ps(vX,vZ); - vY = _mm_add_ps(vY,vW); - vX = _mm_add_ps(vX,vY); - __m128 r0 = vX; - // Repeat for the other 3 rows - vW = M1.r[1]; - vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); - vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); - vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vY = _mm_mul_ps(vY,M2.r[1]); - vZ = _mm_mul_ps(vZ,M2.r[2]); - vW = _mm_mul_ps(vW,M2.r[3]); - vX = _mm_add_ps(vX,vZ); - vY = _mm_add_ps(vY,vW); - vX = _mm_add_ps(vX,vY); - __m128 r1 = vX; - vW = M1.r[2]; - vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); - vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); - vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vY = _mm_mul_ps(vY,M2.r[1]); - vZ = _mm_mul_ps(vZ,M2.r[2]); - vW = _mm_mul_ps(vW,M2.r[3]); - vX = _mm_add_ps(vX,vZ); - vY = _mm_add_ps(vY,vW); - vX = _mm_add_ps(vX,vY); - __m128 r2 = vX; - vW = M1.r[3]; - vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); - vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); - vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); - vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); - vX = _mm_mul_ps(vX,M2.r[0]); - vY = _mm_mul_ps(vY,M2.r[1]); - vZ = _mm_mul_ps(vZ,M2.r[2]); - vW = _mm_mul_ps(vW,M2.r[3]); - vX = _mm_add_ps(vX,vZ); - vY = _mm_add_ps(vY,vW); - vX = _mm_add_ps(vX,vY); - __m128 r3 = vX; - - // x.x,x.y,y.x,y.y - XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0)); - // x.z,x.w,y.z,y.w - 
XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2)); - // z.x,z.y,w.x,w.y - XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0)); - // z.z,z.w,w.z,w.w - XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2)); - - XMMATRIX mResult; - // x.x,y.x,z.x,w.x - mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); - // x.y,y.y,z.y,w.y - mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); - // x.z,y.z,z.z,w.z - mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); - // x.w,y.w,z.w,w.w - mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); - return mResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixTranspose -( - FXMMATRIX M -) -{ -#if defined(_XM_NO_INTRINSICS_) - - // Original matrix: - // - // m00m01m02m03 - // m10m11m12m13 - // m20m21m22m23 - // m30m31m32m33 - - XMMATRIX P; - P.r[0] = XMVectorMergeXY(M.r[0], M.r[2]); // m00m20m01m21 - P.r[1] = XMVectorMergeXY(M.r[1], M.r[3]); // m10m30m11m31 - P.r[2] = XMVectorMergeZW(M.r[0], M.r[2]); // m02m22m03m23 - P.r[3] = XMVectorMergeZW(M.r[1], M.r[3]); // m12m32m13m33 - - XMMATRIX MT; - MT.r[0] = XMVectorMergeXY(P.r[0], P.r[1]); // m00m10m20m30 - MT.r[1] = XMVectorMergeZW(P.r[0], P.r[1]); // m01m11m21m31 - MT.r[2] = XMVectorMergeXY(P.r[2], P.r[3]); // m02m12m22m32 - MT.r[3] = XMVectorMergeZW(P.r[2], P.r[3]); // m03m13m23m33 - return MT; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4x2_t P0 = vzipq_f32( M.r[0], M.r[2] ); - float32x4x2_t P1 = vzipq_f32( M.r[1], M.r[3] ); - - float32x4x2_t T0 = vzipq_f32( P0.val[0], P1.val[0] ); - float32x4x2_t T1 = vzipq_f32( P0.val[1], P1.val[1] ); - - XMMATRIX mResult; - mResult.r[0] = T0.val[0]; - mResult.r[1] = T0.val[1]; - mResult.r[2] = T1.val[0]; - mResult.r[3] = T1.val[1]; - return mResult; -#elif defined(_XM_SSE_INTRINSICS_) - // x.x,x.y,y.x,y.y - XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0],M.r[1],_MM_SHUFFLE(1,0,1,0)); - // x.z,x.w,y.z,y.w - XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0],M.r[1],_MM_SHUFFLE(3,2,3,2)); - // z.x,z.y,w.x,w.y - XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2],M.r[3],_MM_SHUFFLE(1,0,1,0)); - // z.z,z.w,w.z,w.w - XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2],M.r[3],_MM_SHUFFLE(3,2,3,2)); - XMMATRIX mResult; - - // x.x,y.x,z.x,w.x - mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); - // x.y,y.y,z.y,w.y - mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); - // x.z,y.z,z.z,w.z - mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); - // x.w,y.w,z.w,w.w - mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); - return mResult; -#endif -} - -//------------------------------------------------------------------------------ -// Return the inverse and the determinant of a 4x4 matrix -_Use_decl_annotations_ -inline XMMATRIX XM_CALLCONV XMMatrixInverse -( - XMVECTOR* pDeterminant, - FXMMATRIX M -) -{ -#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) - - XMMATRIX MT = XMMatrixTranspose(M); - - XMVECTOR V0[4], V1[4]; - V0[0] = XMVectorSwizzle(MT.r[2]); - V1[0] = XMVectorSwizzle(MT.r[3]); - V0[1] = XMVectorSwizzle(MT.r[0]); - V1[1] = XMVectorSwizzle(MT.r[1]); - V0[2] = XMVectorPermute(MT.r[2], MT.r[0]); - V1[2] = XMVectorPermute(MT.r[3], MT.r[1]); - - XMVECTOR D0 = XMVectorMultiply(V0[0], V1[0]); - XMVECTOR D1 = XMVectorMultiply(V0[1], V1[1]); - XMVECTOR D2 = XMVectorMultiply(V0[2], V1[2]); - - V0[0] = XMVectorSwizzle(MT.r[2]); - V1[0] = XMVectorSwizzle(MT.r[3]); - 
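// The mirrored products formed below are subtracted from D0..D2 (via XMVectorNegativeMultiplySubtract), leaving the signed 2x2 sub-determinants that the Cramer cofactor expansion reuses for each row of the inverse. -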
V0[1] = XMVectorSwizzle(MT.r[0]); - V1[1] = XMVectorSwizzle(MT.r[1]); - V0[2] = XMVectorPermute(MT.r[2], MT.r[0]); - V1[2] = XMVectorPermute(MT.r[3], MT.r[1]); - - D0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], D0); - D1 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], D1); - D2 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], D2); - - V0[0] = XMVectorSwizzle(MT.r[1]); - V1[0] = XMVectorPermute(D0, D2); - V0[1] = XMVectorSwizzle(MT.r[0]); - V1[1] = XMVectorPermute(D0, D2); - V0[2] = XMVectorSwizzle(MT.r[3]); - V1[2] = XMVectorPermute(D1, D2); - V0[3] = XMVectorSwizzle(MT.r[2]); - V1[3] = XMVectorPermute(D1, D2); - - XMVECTOR C0 = XMVectorMultiply(V0[0], V1[0]); - XMVECTOR C2 = XMVectorMultiply(V0[1], V1[1]); - XMVECTOR C4 = XMVectorMultiply(V0[2], V1[2]); - XMVECTOR C6 = XMVectorMultiply(V0[3], V1[3]); - - V0[0] = XMVectorSwizzle(MT.r[1]); - V1[0] = XMVectorPermute(D0, D2); - V0[1] = XMVectorSwizzle(MT.r[0]); - V1[1] = XMVectorPermute(D0, D2); - V0[2] = XMVectorSwizzle(MT.r[3]); - V1[2] = XMVectorPermute(D1, D2); - V0[3] = XMVectorSwizzle(MT.r[2]); - V1[3] = XMVectorPermute(D1, D2); - - C0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0); - C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2); - C4 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4); - C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6); - - V0[0] = XMVectorSwizzle(MT.r[1]); - V1[0] = XMVectorPermute(D0, D2); - V0[1] = XMVectorSwizzle(MT.r[0]); - V1[1] = XMVectorPermute(D0, D2); - V0[2] = XMVectorSwizzle(MT.r[3]); - V1[2] = XMVectorPermute(D1, D2); - V0[3] = XMVectorSwizzle(MT.r[2]); - V1[3] = XMVectorPermute(D1, D2); - - XMVECTOR C1 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0); - C0 = XMVectorMultiplyAdd(V0[0], V1[0], C0); - XMVECTOR C3 = XMVectorMultiplyAdd(V0[1], V1[1], C2); - C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2); - XMVECTOR C5 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4); - C4 = XMVectorMultiplyAdd(V0[2], V1[2], C4); - XMVECTOR C7 = XMVectorMultiplyAdd(V0[3], V1[3], C6); - C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6); - - XMMATRIX R; - R.r[0] = XMVectorSelect(C0, C1, g_XMSelect0101.v); - R.r[1] = XMVectorSelect(C2, C3, g_XMSelect0101.v); - R.r[2] = XMVectorSelect(C4, C5, g_XMSelect0101.v); - R.r[3] = XMVectorSelect(C6, C7, g_XMSelect0101.v); - - XMVECTOR Determinant = XMVector4Dot(R.r[0], MT.r[0]); - - if (pDeterminant != nullptr) - *pDeterminant = Determinant; - - XMVECTOR Reciprocal = XMVectorReciprocal(Determinant); - - XMMATRIX Result; - Result.r[0] = XMVectorMultiply(R.r[0], Reciprocal); - Result.r[1] = XMVectorMultiply(R.r[1], Reciprocal); - Result.r[2] = XMVectorMultiply(R.r[2], Reciprocal); - Result.r[3] = XMVectorMultiply(R.r[3], Reciprocal); - return Result; - -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX MT = XMMatrixTranspose(M); - XMVECTOR V00 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(1,1,0,0)); - XMVECTOR V10 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(3,2,3,2)); - XMVECTOR V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(1,1,0,0)); - XMVECTOR V11 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(3,2,3,2)); - XMVECTOR V02 = _mm_shuffle_ps(MT.r[2], MT.r[0],_MM_SHUFFLE(2,0,2,0)); - XMVECTOR V12 = _mm_shuffle_ps(MT.r[3], MT.r[1],_MM_SHUFFLE(3,1,3,1)); - - XMVECTOR D0 = _mm_mul_ps(V00,V10); - XMVECTOR D1 = _mm_mul_ps(V01,V11); - XMVECTOR D2 = _mm_mul_ps(V02,V12); - - V00 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(3,2,3,2)); - V10 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(1,1,0,0)); - V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(3,2,3,2)); - V11 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(1,1,0,0)); - V02 = 
_mm_shuffle_ps(MT.r[2],MT.r[0],_MM_SHUFFLE(3,1,3,1)); - V12 = _mm_shuffle_ps(MT.r[3],MT.r[1],_MM_SHUFFLE(2,0,2,0)); - - V00 = _mm_mul_ps(V00,V10); - V01 = _mm_mul_ps(V01,V11); - V02 = _mm_mul_ps(V02,V12); - D0 = _mm_sub_ps(D0,V00); - D1 = _mm_sub_ps(D1,V01); - D2 = _mm_sub_ps(D2,V02); - // V11 = D0Y,D0W,D2Y,D2Y - V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,1,3,1)); - V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(1,0,2,1)); - V10 = _mm_shuffle_ps(V11,D0,_MM_SHUFFLE(0,3,0,2)); - V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(0,1,0,2)); - V11 = _mm_shuffle_ps(V11,D0,_MM_SHUFFLE(2,1,2,1)); - // V13 = D1Y,D1W,D2W,D2W - XMVECTOR V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,3,3,1)); - V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(1,0,2,1)); - V12 = _mm_shuffle_ps(V13,D1,_MM_SHUFFLE(0,3,0,2)); - XMVECTOR V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(0,1,0,2)); - V13 = _mm_shuffle_ps(V13,D1,_MM_SHUFFLE(2,1,2,1)); - - XMVECTOR C0 = _mm_mul_ps(V00,V10); - XMVECTOR C2 = _mm_mul_ps(V01,V11); - XMVECTOR C4 = _mm_mul_ps(V02,V12); - XMVECTOR C6 = _mm_mul_ps(V03,V13); - - // V11 = D0X,D0Y,D2X,D2X - V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(0,0,1,0)); - V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(2,1,3,2)); - V10 = _mm_shuffle_ps(D0,V11,_MM_SHUFFLE(2,1,0,3)); - V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(1,3,2,3)); - V11 = _mm_shuffle_ps(D0,V11,_MM_SHUFFLE(0,2,1,2)); - // V13 = D1X,D1Y,D2Z,D2Z - V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(2,2,1,0)); - V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(2,1,3,2)); - V12 = _mm_shuffle_ps(D1,V13,_MM_SHUFFLE(2,1,0,3)); - V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(1,3,2,3)); - V13 = _mm_shuffle_ps(D1,V13,_MM_SHUFFLE(0,2,1,2)); - - V00 = _mm_mul_ps(V00,V10); - V01 = _mm_mul_ps(V01,V11); - V02 = _mm_mul_ps(V02,V12); - V03 = _mm_mul_ps(V03,V13); - C0 = _mm_sub_ps(C0,V00); - C2 = _mm_sub_ps(C2,V01); - C4 = _mm_sub_ps(C4,V02); - C6 = _mm_sub_ps(C6,V03); - - V00 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(0,3,0,3)); - // V10 = D0Z,D0Z,D2X,D2Y - V10 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,0,2,2)); - V10 = XM_PERMUTE_PS(V10,_MM_SHUFFLE(0,2,3,0)); - V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(2,0,3,1)); - // V11 = D0X,D0W,D2X,D2Y - V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,0,3,0)); - V11 = XM_PERMUTE_PS(V11,_MM_SHUFFLE(2,1,0,3)); - V02 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(0,3,0,3)); - // V12 = D1Z,D1Z,D2Z,D2W - V12 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,2,2,2)); - V12 = XM_PERMUTE_PS(V12,_MM_SHUFFLE(0,2,3,0)); - V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(2,0,3,1)); - // V13 = D1X,D1W,D2Z,D2W - V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,2,3,0)); - V13 = XM_PERMUTE_PS(V13,_MM_SHUFFLE(2,1,0,3)); - - V00 = _mm_mul_ps(V00,V10); - V01 = _mm_mul_ps(V01,V11); - V02 = _mm_mul_ps(V02,V12); - V03 = _mm_mul_ps(V03,V13); - XMVECTOR C1 = _mm_sub_ps(C0,V00); - C0 = _mm_add_ps(C0,V00); - XMVECTOR C3 = _mm_add_ps(C2,V01); - C2 = _mm_sub_ps(C2,V01); - XMVECTOR C5 = _mm_sub_ps(C4,V02); - C4 = _mm_add_ps(C4,V02); - XMVECTOR C7 = _mm_add_ps(C6,V03); - C6 = _mm_sub_ps(C6,V03); - - C0 = _mm_shuffle_ps(C0,C1,_MM_SHUFFLE(3,1,2,0)); - C2 = _mm_shuffle_ps(C2,C3,_MM_SHUFFLE(3,1,2,0)); - C4 = _mm_shuffle_ps(C4,C5,_MM_SHUFFLE(3,1,2,0)); - C6 = _mm_shuffle_ps(C6,C7,_MM_SHUFFLE(3,1,2,0)); - C0 = XM_PERMUTE_PS(C0,_MM_SHUFFLE(3,1,2,0)); - C2 = XM_PERMUTE_PS(C2,_MM_SHUFFLE(3,1,2,0)); - C4 = XM_PERMUTE_PS(C4,_MM_SHUFFLE(3,1,2,0)); - C6 = XM_PERMUTE_PS(C6,_MM_SHUFFLE(3,1,2,0)); - // Get the determinant - XMVECTOR vTemp = XMVector4Dot(C0,MT.r[0]); - if (pDeterminant != nullptr) - *pDeterminant = vTemp; - vTemp = _mm_div_ps(g_XMOne,vTemp); - XMMATRIX mResult; - mResult.r[0] = 
_mm_mul_ps(C0,vTemp); - mResult.r[1] = _mm_mul_ps(C2,vTemp); - mResult.r[2] = _mm_mul_ps(C4,vTemp); - mResult.r[3] = _mm_mul_ps(C6,vTemp); - return mResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMMatrixDeterminant -( - FXMMATRIX M -) -{ - static const XMVECTORF32 Sign = {1.0f, -1.0f, 1.0f, -1.0f}; - - XMVECTOR V0 = XMVectorSwizzle(M.r[2]); - XMVECTOR V1 = XMVectorSwizzle(M.r[3]); - XMVECTOR V2 = XMVectorSwizzle(M.r[2]); - XMVECTOR V3 = XMVectorSwizzle(M.r[3]); - XMVECTOR V4 = XMVectorSwizzle(M.r[2]); - XMVECTOR V5 = XMVectorSwizzle(M.r[3]); - - XMVECTOR P0 = XMVectorMultiply(V0, V1); - XMVECTOR P1 = XMVectorMultiply(V2, V3); - XMVECTOR P2 = XMVectorMultiply(V4, V5); - - V0 = XMVectorSwizzle(M.r[2]); - V1 = XMVectorSwizzle(M.r[3]); - V2 = XMVectorSwizzle(M.r[2]); - V3 = XMVectorSwizzle(M.r[3]); - V4 = XMVectorSwizzle(M.r[2]); - V5 = XMVectorSwizzle(M.r[3]); - - P0 = XMVectorNegativeMultiplySubtract(V0, V1, P0); - P1 = XMVectorNegativeMultiplySubtract(V2, V3, P1); - P2 = XMVectorNegativeMultiplySubtract(V4, V5, P2); - - V0 = XMVectorSwizzle(M.r[1]); - V1 = XMVectorSwizzle(M.r[1]); - V2 = XMVectorSwizzle(M.r[1]); - - XMVECTOR S = XMVectorMultiply(M.r[0], Sign.v); - XMVECTOR R = XMVectorMultiply(V0, P0); - R = XMVectorNegativeMultiplySubtract(V1, P1, R); - R = XMVectorMultiplyAdd(V2, P2, R); - - return XMVector4Dot(S, R); -} - -#define XM3RANKDECOMPOSE(a, b, c, x, y, z) \ - if((x) < (y)) \ - { \ - if((y) < (z)) \ - { \ - (a) = 2; \ - (b) = 1; \ - (c) = 0; \ - } \ - else \ - { \ - (a) = 1; \ - \ - if((x) < (z)) \ - { \ - (b) = 2; \ - (c) = 0; \ - } \ - else \ - { \ - (b) = 0; \ - (c) = 2; \ - } \ - } \ - } \ - else \ - { \ - if((x) < (z)) \ - { \ - (a) = 2; \ - (b) = 0; \ - (c) = 1; \ - } \ - else \ - { \ - (a) = 0; \ - \ - if((y) < (z)) \ - { \ - (b) = 2; \ - (c) = 1; \ - } \ - else \ - { \ - (b) = 1; \ - (c) = 2; \ - } \ - } \ - } - -#define XM3_DECOMP_EPSILON 0.0001f - -_Use_decl_annotations_ -inline bool XM_CALLCONV XMMatrixDecompose -( - XMVECTOR *outScale, - XMVECTOR *outRotQuat, - XMVECTOR *outTrans, - FXMMATRIX M -) -{ - static const XMVECTOR *pvCanonicalBasis[3] = { - &g_XMIdentityR0.v, - &g_XMIdentityR1.v, - &g_XMIdentityR2.v - }; - - assert( outScale != nullptr ); - assert( outRotQuat != nullptr ); - assert( outTrans != nullptr ); - - // Get the translation - outTrans[0] = M.r[3]; - - XMVECTOR *ppvBasis[3]; - XMMATRIX matTemp; - ppvBasis[0] = &matTemp.r[0]; - ppvBasis[1] = &matTemp.r[1]; - ppvBasis[2] = &matTemp.r[2]; - - matTemp.r[0] = M.r[0]; - matTemp.r[1] = M.r[1]; - matTemp.r[2] = M.r[2]; - matTemp.r[3] = g_XMIdentityR3.v; - - float *pfScales = (float *)outScale; - - size_t a, b, c; - XMVectorGetXPtr(&pfScales[0],XMVector3Length(ppvBasis[0][0])); - XMVectorGetXPtr(&pfScales[1],XMVector3Length(ppvBasis[1][0])); - XMVectorGetXPtr(&pfScales[2],XMVector3Length(ppvBasis[2][0])); - pfScales[3] = 0.f; - - XM3RANKDECOMPOSE(a, b, c, pfScales[0], pfScales[1], pfScales[2]) - - if(pfScales[a] < XM3_DECOMP_EPSILON) - { - ppvBasis[a][0] = pvCanonicalBasis[a][0]; - } - ppvBasis[a][0] = XMVector3Normalize(ppvBasis[a][0]); - - if(pfScales[b] < XM3_DECOMP_EPSILON) - { - size_t aa, bb, cc; - float fAbsX, fAbsY, fAbsZ; - - fAbsX = fabsf(XMVectorGetX(ppvBasis[a][0])); - fAbsY = fabsf(XMVectorGetY(ppvBasis[a][0])); - fAbsZ = fabsf(XMVectorGetZ(ppvBasis[a][0])); - - XM3RANKDECOMPOSE(aa, bb, cc, fAbsX, fAbsY, fAbsZ) - - ppvBasis[b][0] = XMVector3Cross(ppvBasis[a][0],pvCanonicalBasis[cc][0]); - } - - ppvBasis[b][0] 
= XMVector3Normalize(ppvBasis[b][0]); - - if(pfScales[c] < XM3_DECOMP_EPSILON) - { - ppvBasis[c][0] = XMVector3Cross(ppvBasis[a][0],ppvBasis[b][0]); - } - - ppvBasis[c][0] = XMVector3Normalize(ppvBasis[c][0]); - - float fDet = XMVectorGetX(XMMatrixDeterminant(matTemp)); - - // use Cramer's rule to check for handedness of coordinate system - if(fDet < 0.0f) - { - // switch coordinate system by negating the scale and inverting the basis vector on the x-axis - pfScales[a] = -pfScales[a]; - ppvBasis[a][0] = XMVectorNegate(ppvBasis[a][0]); - - fDet = -fDet; - } - - fDet -= 1.0f; - fDet *= fDet; - - if(XM3_DECOMP_EPSILON < fDet) - { - // Non-SRT matrix encountered - return false; - } - - // generate the quaternion from the matrix - outRotQuat[0] = XMQuaternionRotationMatrix(matTemp); - return true; -} - -#undef XM3_DECOMP_EPSILON -#undef XM3RANKDECOMPOSE - -//------------------------------------------------------------------------------ -// Transformation operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixIdentity() -{ - XMMATRIX M; - M.r[0] = g_XMIdentityR0.v; - M.r[1] = g_XMIdentityR1.v; - M.r[2] = g_XMIdentityR2.v; - M.r[3] = g_XMIdentityR3.v; - return M; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixSet -( - float m00, float m01, float m02, float m03, - float m10, float m11, float m12, float m13, - float m20, float m21, float m22, float m23, - float m30, float m31, float m32, float m33 -) -{ - XMMATRIX M; -#if defined(_XM_NO_INTRINSICS_) - M.m[0][0] = m00; M.m[0][1] = m01; M.m[0][2] = m02; M.m[0][3] = m03; - M.m[1][0] = m10; M.m[1][1] = m11; M.m[1][2] = m12; M.m[1][3] = m13; - M.m[2][0] = m20; M.m[2][1] = m21; M.m[2][2] = m22; M.m[2][3] = m23; - M.m[3][0] = m30; M.m[3][1] = m31; M.m[3][2] = m32; M.m[3][3] = m33; -#else - M.r[0] = XMVectorSet(m00, m01, m02, m03); - M.r[1] = XMVectorSet(m10, m11, m12, m13); - M.r[2] = XMVectorSet(m20, m21, m22, m23); - M.r[3] = XMVectorSet(m30, m31, m32, m33); -#endif - return M; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixTranslation -( - float OffsetX, - float OffsetY, - float OffsetZ -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMMATRIX M; - M.m[0][0] = 1.0f; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = 1.0f; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = 1.0f; - M.m[2][3] = 0.0f; - - M.m[3][0] = OffsetX; - M.m[3][1] = OffsetY; - M.m[3][2] = OffsetZ; - M.m[3][3] = 1.0f; - return M; - -#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) - XMMATRIX M; - M.r[0] = g_XMIdentityR0.v; - M.r[1] = g_XMIdentityR1.v; - M.r[2] = g_XMIdentityR2.v; - M.r[3] = XMVectorSet(OffsetX, OffsetY, OffsetZ, 1.f ); - return M; -#endif -} - - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixTranslationFromVector -( - FXMVECTOR Offset -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMMATRIX M; - M.m[0][0] = 1.0f; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = 1.0f; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = 1.0f; - M.m[2][3] = 0.0f; - - M.m[3][0] = Offset.vector4_f32[0]; - 
M.m[3][1] = Offset.vector4_f32[1]; - M.m[3][2] = Offset.vector4_f32[2]; - M.m[3][3] = 1.0f; - return M; - -#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) - XMMATRIX M; - M.r[0] = g_XMIdentityR0.v; - M.r[1] = g_XMIdentityR1.v; - M.r[2] = g_XMIdentityR2.v; - M.r[3] = XMVectorSelect( g_XMIdentityR3.v, Offset, g_XMSelect1110.v ); - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixScaling -( - float ScaleX, - float ScaleY, - float ScaleZ -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMMATRIX M; - M.m[0][0] = ScaleX; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = ScaleY; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = ScaleZ; - M.m[2][3] = 0.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = 0.0f; - M.m[3][3] = 1.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - const XMVECTOR Zero = vdupq_n_f32(0); - XMMATRIX M; - M.r[0] = vsetq_lane_f32( ScaleX, Zero, 0 ); - M.r[1] = vsetq_lane_f32( ScaleY, Zero, 1 ); - M.r[2] = vsetq_lane_f32( ScaleZ, Zero, 2 ); - M.r[3] = g_XMIdentityR3.v; - return M; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX M; - M.r[0] = _mm_set_ps( 0, 0, 0, ScaleX ); - M.r[1] = _mm_set_ps( 0, 0, ScaleY, 0 ); - M.r[2] = _mm_set_ps( 0, ScaleZ, 0, 0 ); - M.r[3] = g_XMIdentityR3.v; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixScalingFromVector -( - FXMVECTOR Scale -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMMATRIX M; - M.m[0][0] = Scale.vector4_f32[0]; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = Scale.vector4_f32[1]; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = Scale.vector4_f32[2]; - M.m[2][3] = 0.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = 0.0f; - M.m[3][3] = 1.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMMATRIX M; - M.r[0] = vandq_u32(Scale,g_XMMaskX); - M.r[1] = vandq_u32(Scale,g_XMMaskY); - M.r[2] = vandq_u32(Scale,g_XMMaskZ); - M.r[3] = g_XMIdentityR3.v; - return M; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX M; - M.r[0] = _mm_and_ps(Scale,g_XMMaskX); - M.r[1] = _mm_and_ps(Scale,g_XMMaskY); - M.r[2] = _mm_and_ps(Scale,g_XMMaskZ); - M.r[3] = g_XMIdentityR3.v; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixRotationX -( - float Angle -) -{ -#if defined(_XM_NO_INTRINSICS_) - - float fSinAngle; - float fCosAngle; - XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); - - XMMATRIX M; - M.m[0][0] = 1.0f; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = fCosAngle; - M.m[1][2] = fSinAngle; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = -fSinAngle; - M.m[2][2] = fCosAngle; - M.m[2][3] = 0.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = 0.0f; - M.m[3][3] = 1.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float fSinAngle; - float fCosAngle; - XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); - - const XMVECTOR Zero = vdupq_n_f32(0); - - XMVECTOR T1 = vsetq_lane_f32( fCosAngle, Zero, 1 ); - T1 = vsetq_lane_f32( fSinAngle, T1, 2 ); - - XMVECTOR T2 = vsetq_lane_f32( -fSinAngle, Zero, 1 ); - T2 = vsetq_lane_f32( fCosAngle, T2, 2 ); - - XMMATRIX M; - M.r[0] = 
g_XMIdentityR0.v; - M.r[1] = T1; - M.r[2] = T2; - M.r[3] = g_XMIdentityR3.v; - return M; -#elif defined(_XM_SSE_INTRINSICS_) - float SinAngle; - float CosAngle; - XMScalarSinCos(&SinAngle, &CosAngle, Angle); - - XMVECTOR vSin = _mm_set_ss(SinAngle); - XMVECTOR vCos = _mm_set_ss(CosAngle); - // x = 0,y = cos,z = sin, w = 0 - vCos = _mm_shuffle_ps(vCos,vSin,_MM_SHUFFLE(3,0,0,3)); - XMMATRIX M; - M.r[0] = g_XMIdentityR0; - M.r[1] = vCos; - // x = 0,y = sin,z = cos, w = 0 - vCos = XM_PERMUTE_PS(vCos,_MM_SHUFFLE(3,1,2,0)); - // x = 0,y = -sin,z = cos, w = 0 - vCos = _mm_mul_ps(vCos,g_XMNegateY); - M.r[2] = vCos; - M.r[3] = g_XMIdentityR3; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixRotationY -( - float Angle -) -{ -#if defined(_XM_NO_INTRINSICS_) - - float fSinAngle; - float fCosAngle; - XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); - - XMMATRIX M; - M.m[0][0] = fCosAngle; - M.m[0][1] = 0.0f; - M.m[0][2] = -fSinAngle; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = 1.0f; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = fSinAngle; - M.m[2][1] = 0.0f; - M.m[2][2] = fCosAngle; - M.m[2][3] = 0.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = 0.0f; - M.m[3][3] = 1.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float fSinAngle; - float fCosAngle; - XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); - - const XMVECTOR Zero = vdupq_n_f32(0); - - XMVECTOR T0 = vsetq_lane_f32( fCosAngle, Zero, 0 ); - T0 = vsetq_lane_f32( -fSinAngle, T0, 2 ); - - XMVECTOR T2 = vsetq_lane_f32( fSinAngle, Zero, 0 ); - T2 = vsetq_lane_f32( fCosAngle, T2, 2 ); - - XMMATRIX M; - M.r[0] = T0; - M.r[1] = g_XMIdentityR1.v; - M.r[2] = T2; - M.r[3] = g_XMIdentityR3.v; - return M; -#elif defined(_XM_SSE_INTRINSICS_) - float SinAngle; - float CosAngle; - XMScalarSinCos(&SinAngle, &CosAngle, Angle); - - XMVECTOR vSin = _mm_set_ss(SinAngle); - XMVECTOR vCos = _mm_set_ss(CosAngle); - // x = sin,y = 0,z = cos, w = 0 - vSin = _mm_shuffle_ps(vSin,vCos,_MM_SHUFFLE(3,0,3,0)); - XMMATRIX M; - M.r[2] = vSin; - M.r[1] = g_XMIdentityR1; - // x = cos,y = 0,z = sin, w = 0 - vSin = XM_PERMUTE_PS(vSin,_MM_SHUFFLE(3,0,1,2)); - // x = cos,y = 0,z = -sin, w = 0 - vSin = _mm_mul_ps(vSin,g_XMNegateZ); - M.r[0] = vSin; - M.r[3] = g_XMIdentityR3; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixRotationZ -( - float Angle -) -{ -#if defined(_XM_NO_INTRINSICS_) - - float fSinAngle; - float fCosAngle; - XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); - - XMMATRIX M; - M.m[0][0] = fCosAngle; - M.m[0][1] = fSinAngle; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = -fSinAngle; - M.m[1][1] = fCosAngle; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = 1.0f; - M.m[2][3] = 0.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = 0.0f; - M.m[3][3] = 1.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float fSinAngle; - float fCosAngle; - XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); - - const XMVECTOR Zero = vdupq_n_f32(0); - - XMVECTOR T0 = vsetq_lane_f32( fCosAngle, Zero, 0 ); - T0 = vsetq_lane_f32( fSinAngle, T0, 1 ); - - XMVECTOR T1 = vsetq_lane_f32( -fSinAngle, Zero, 0 ); - T1 = vsetq_lane_f32( fCosAngle, T1, 1 ); - - XMMATRIX M; - M.r[0] = T0; - M.r[1] = T1; - M.r[2] = g_XMIdentityR2.v; - M.r[3] = g_XMIdentityR3.v; - return M; -#elif 
defined(_XM_SSE_INTRINSICS_) - float SinAngle; - float CosAngle; - XMScalarSinCos(&SinAngle, &CosAngle, Angle); - - XMVECTOR vSin = _mm_set_ss(SinAngle); - XMVECTOR vCos = _mm_set_ss(CosAngle); - // x = cos,y = sin,z = 0, w = 0 - vCos = _mm_unpacklo_ps(vCos,vSin); - XMMATRIX M; - M.r[0] = vCos; - // x = sin,y = cos,z = 0, w = 0 - vCos = XM_PERMUTE_PS(vCos,_MM_SHUFFLE(3,2,0,1)); - // x = cos,y = -sin,z = 0, w = 0 - vCos = _mm_mul_ps(vCos,g_XMNegateX); - M.r[1] = vCos; - M.r[2] = g_XMIdentityR2; - M.r[3] = g_XMIdentityR3; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw -( - float Pitch, - float Yaw, - float Roll -) -{ - XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f); - return XMMatrixRotationRollPitchYawFromVector(Angles); -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYawFromVector -( - FXMVECTOR Angles // -) -{ - XMVECTOR Q = XMQuaternionRotationRollPitchYawFromVector(Angles); - return XMMatrixRotationQuaternion(Q); -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixRotationNormal -( - FXMVECTOR NormalAxis, - float Angle -) -{ -#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) - - float fSinAngle; - float fCosAngle; - XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); - - XMVECTOR A = XMVectorSet(fSinAngle, fCosAngle, 1.0f - fCosAngle, 0.0f); - - XMVECTOR C2 = XMVectorSplatZ(A); - XMVECTOR C1 = XMVectorSplatY(A); - XMVECTOR C0 = XMVectorSplatX(A); - - XMVECTOR N0 = XMVectorSwizzle(NormalAxis); - XMVECTOR N1 = XMVectorSwizzle(NormalAxis); - - XMVECTOR V0 = XMVectorMultiply(C2, N0); - V0 = XMVectorMultiply(V0, N1); - - XMVECTOR R0 = XMVectorMultiply(C2, NormalAxis); - R0 = XMVectorMultiplyAdd(R0, NormalAxis, C1); - - XMVECTOR R1 = XMVectorMultiplyAdd(C0, NormalAxis, V0); - XMVECTOR R2 = XMVectorNegativeMultiplySubtract(C0, NormalAxis, V0); - - V0 = XMVectorSelect(A, R0, g_XMSelect1110.v); - XMVECTOR V1 = XMVectorPermute(R1, R2); - XMVECTOR V2 = XMVectorPermute(R1, R2); - - XMMATRIX M; - M.r[0] = XMVectorPermute(V0, V1); - M.r[1] = XMVectorPermute(V0, V1); - M.r[2] = XMVectorPermute(V0, V2); - M.r[3] = g_XMIdentityR3.v; - return M; - -#elif defined(_XM_SSE_INTRINSICS_) - float fSinAngle; - float fCosAngle; - XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); - - XMVECTOR C2 = _mm_set_ps1(1.0f - fCosAngle); - XMVECTOR C1 = _mm_set_ps1(fCosAngle); - XMVECTOR C0 = _mm_set_ps1(fSinAngle); - - XMVECTOR N0 = XM_PERMUTE_PS(NormalAxis,_MM_SHUFFLE(3,0,2,1)); - XMVECTOR N1 = XM_PERMUTE_PS(NormalAxis,_MM_SHUFFLE(3,1,0,2)); - - XMVECTOR V0 = _mm_mul_ps(C2, N0); - V0 = _mm_mul_ps(V0, N1); - - XMVECTOR R0 = _mm_mul_ps(C2, NormalAxis); - R0 = _mm_mul_ps(R0, NormalAxis); - R0 = _mm_add_ps(R0, C1); - - XMVECTOR R1 = _mm_mul_ps(C0, NormalAxis); - R1 = _mm_add_ps(R1, V0); - XMVECTOR R2 = _mm_mul_ps(C0, NormalAxis); - R2 = _mm_sub_ps(V0,R2); - - V0 = _mm_and_ps(R0,g_XMMask3); - XMVECTOR V1 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(2,1,2,0)); - V1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(0,3,2,1)); - XMVECTOR V2 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(0,0,1,1)); - V2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,0,2,0)); - - R2 = _mm_shuffle_ps(V0,V1,_MM_SHUFFLE(1,0,3,0)); - R2 = XM_PERMUTE_PS(R2,_MM_SHUFFLE(1,3,2,0)); - - XMMATRIX M; - M.r[0] = R2; - - R2 = _mm_shuffle_ps(V0,V1,_MM_SHUFFLE(3,2,3,1)); - R2 = 
XM_PERMUTE_PS(R2,_MM_SHUFFLE(1,3,0,2)); - M.r[1] = R2; - - V2 = _mm_shuffle_ps(V2,V0,_MM_SHUFFLE(3,2,1,0)); - M.r[2] = V2; - M.r[3] = g_XMIdentityR3.v; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixRotationAxis -( - FXMVECTOR Axis, - float Angle -) -{ - assert(!XMVector3Equal(Axis, XMVectorZero())); - assert(!XMVector3IsInfinite(Axis)); - - XMVECTOR Normal = XMVector3Normalize(Axis); - return XMMatrixRotationNormal(Normal, Angle); -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixRotationQuaternion -( - FXMVECTOR Quaternion -) -{ -#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) - - static const XMVECTORF32 Constant1110 = {1.0f, 1.0f, 1.0f, 0.0f}; - - XMVECTOR Q0 = XMVectorAdd(Quaternion, Quaternion); - XMVECTOR Q1 = XMVectorMultiply(Quaternion, Q0); - - XMVECTOR V0 = XMVectorPermute(Q1, Constant1110.v); - XMVECTOR V1 = XMVectorPermute(Q1, Constant1110.v); - XMVECTOR R0 = XMVectorSubtract(Constant1110, V0); - R0 = XMVectorSubtract(R0, V1); - - V0 = XMVectorSwizzle(Quaternion); - V1 = XMVectorSwizzle(Q0); - V0 = XMVectorMultiply(V0, V1); - - V1 = XMVectorSplatW(Quaternion); - XMVECTOR V2 = XMVectorSwizzle(Q0); - V1 = XMVectorMultiply(V1, V2); - - XMVECTOR R1 = XMVectorAdd(V0, V1); - XMVECTOR R2 = XMVectorSubtract(V0, V1); - - V0 = XMVectorPermute(R1, R2); - V1 = XMVectorPermute(R1, R2); - - XMMATRIX M; - M.r[0] = XMVectorPermute(R0, V0); - M.r[1] = XMVectorPermute(R0, V0); - M.r[2] = XMVectorPermute(R0, V1); - M.r[3] = g_XMIdentityR3.v; - return M; - -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Constant1110 = {1.0f, 1.0f, 1.0f, 0.0f}; - - XMVECTOR Q0 = _mm_add_ps(Quaternion,Quaternion); - XMVECTOR Q1 = _mm_mul_ps(Quaternion,Q0); - - XMVECTOR V0 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(3,0,0,1)); - V0 = _mm_and_ps(V0,g_XMMask3); - XMVECTOR V1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(3,1,2,2)); - V1 = _mm_and_ps(V1,g_XMMask3); - XMVECTOR R0 = _mm_sub_ps(Constant1110,V0); - R0 = _mm_sub_ps(R0, V1); - - V0 = XM_PERMUTE_PS(Quaternion,_MM_SHUFFLE(3,1,0,0)); - V1 = XM_PERMUTE_PS(Q0,_MM_SHUFFLE(3,2,1,2)); - V0 = _mm_mul_ps(V0, V1); - - V1 = XM_PERMUTE_PS(Quaternion,_MM_SHUFFLE(3,3,3,3)); - XMVECTOR V2 = XM_PERMUTE_PS(Q0,_MM_SHUFFLE(3,0,2,1)); - V1 = _mm_mul_ps(V1, V2); - - XMVECTOR R1 = _mm_add_ps(V0, V1); - XMVECTOR R2 = _mm_sub_ps(V0, V1); - - V0 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(1,0,2,1)); - V0 = XM_PERMUTE_PS(V0,_MM_SHUFFLE(1,3,2,0)); - V1 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(2,2,0,0)); - V1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(2,0,2,0)); - - Q1 = _mm_shuffle_ps(R0,V0,_MM_SHUFFLE(1,0,3,0)); - Q1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(1,3,2,0)); - - XMMATRIX M; - M.r[0] = Q1; - - Q1 = _mm_shuffle_ps(R0,V0,_MM_SHUFFLE(3,2,3,1)); - Q1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(1,3,0,2)); - M.r[1] = Q1; - - Q1 = _mm_shuffle_ps(V1,R0,_MM_SHUFFLE(3,2,1,0)); - M.r[2] = Q1; - M.r[3] = g_XMIdentityR3; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixTransformation2D -( - FXMVECTOR ScalingOrigin, - float ScalingOrientation, - FXMVECTOR Scaling, - FXMVECTOR RotationOrigin, - float Rotation, - GXMVECTOR Translation -) -{ - // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation * - // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; - - XMVECTOR VScalingOrigin = 
XMVectorSelect(g_XMSelect1100.v, ScalingOrigin, g_XMSelect1100.v); - XMVECTOR NegScalingOrigin = XMVectorNegate(VScalingOrigin); - - XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); - XMMATRIX MScalingOrientation = XMMatrixRotationZ(ScalingOrientation); - XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); - XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v); - XMMATRIX MScaling = XMMatrixScalingFromVector(VScaling); - XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v); - XMMATRIX MRotation = XMMatrixRotationZ(Rotation); - XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation,g_XMSelect1100.v); - - XMMATRIX M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT); - M = XMMatrixMultiply(M, MScaling); - M = XMMatrixMultiply(M, MScalingOrientation); - M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); - M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); - M = XMMatrixMultiply(M, MRotation); - M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); - M.r[3] = XMVectorAdd(M.r[3], VTranslation); - - return M; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixTransformation -( - FXMVECTOR ScalingOrigin, - FXMVECTOR ScalingOrientationQuaternion, - FXMVECTOR Scaling, - GXMVECTOR RotationOrigin, - HXMVECTOR RotationQuaternion, - HXMVECTOR Translation -) -{ - // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation * - // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; - - XMVECTOR VScalingOrigin = XMVectorSelect(g_XMSelect1110.v, ScalingOrigin, g_XMSelect1110.v); - XMVECTOR NegScalingOrigin = XMVectorNegate(ScalingOrigin); - - XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); - XMMATRIX MScalingOrientation = XMMatrixRotationQuaternion(ScalingOrientationQuaternion); - XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); - XMMATRIX MScaling = XMMatrixScalingFromVector(Scaling); - XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin, g_XMSelect1110.v); - XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion); - XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation, g_XMSelect1110.v); - - XMMATRIX M; - M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT); - M = XMMatrixMultiply(M, MScaling); - M = XMMatrixMultiply(M, MScalingOrientation); - M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); - M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); - M = XMMatrixMultiply(M, MRotation); - M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); - M.r[3] = XMVectorAdd(M.r[3], VTranslation); - return M; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixAffineTransformation2D -( - FXMVECTOR Scaling, - FXMVECTOR RotationOrigin, - float Rotation, - FXMVECTOR Translation -) -{ - // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; - - XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v); - XMMATRIX MScaling = XMMatrixScalingFromVector(VScaling); - XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v); - XMMATRIX MRotation = XMMatrixRotationZ(Rotation); - XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation,g_XMSelect1100.v); - - XMMATRIX M; - M = MScaling; - M.r[3] = 
XMVectorSubtract(M.r[3], VRotationOrigin); - M = XMMatrixMultiply(M, MRotation); - M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); - M.r[3] = XMVectorAdd(M.r[3], VTranslation); - return M; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixAffineTransformation -( - FXMVECTOR Scaling, - FXMVECTOR RotationOrigin, - FXMVECTOR RotationQuaternion, - GXMVECTOR Translation -) -{ - // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; - - XMMATRIX MScaling = XMMatrixScalingFromVector(Scaling); - XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin,g_XMSelect1110.v); - XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion); - XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation,g_XMSelect1110.v); - - XMMATRIX M; - M = MScaling; - M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); - M = XMMatrixMultiply(M, MRotation); - M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); - M.r[3] = XMVectorAdd(M.r[3], VTranslation); - return M; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixReflect -( - FXMVECTOR ReflectionPlane -) -{ - assert(!XMVector3Equal(ReflectionPlane, XMVectorZero())); - assert(!XMPlaneIsInfinite(ReflectionPlane)); - - static const XMVECTORF32 NegativeTwo = {-2.0f, -2.0f, -2.0f, 0.0f}; - - XMVECTOR P = XMPlaneNormalize(ReflectionPlane); - XMVECTOR S = XMVectorMultiply(P, NegativeTwo); - - XMVECTOR A = XMVectorSplatX(P); - XMVECTOR B = XMVectorSplatY(P); - XMVECTOR C = XMVectorSplatZ(P); - XMVECTOR D = XMVectorSplatW(P); - - XMMATRIX M; - M.r[0] = XMVectorMultiplyAdd(A, S, g_XMIdentityR0.v); - M.r[1] = XMVectorMultiplyAdd(B, S, g_XMIdentityR1.v); - M.r[2] = XMVectorMultiplyAdd(C, S, g_XMIdentityR2.v); - M.r[3] = XMVectorMultiplyAdd(D, S, g_XMIdentityR3.v); - return M; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixShadow -( - FXMVECTOR ShadowPlane, - FXMVECTOR LightPosition -) -{ - static const XMVECTORU32 Select0001 = {XM_SELECT_0, XM_SELECT_0, XM_SELECT_0, XM_SELECT_1}; - - assert(!XMVector3Equal(ShadowPlane, XMVectorZero())); - assert(!XMPlaneIsInfinite(ShadowPlane)); - - XMVECTOR P = XMPlaneNormalize(ShadowPlane); - XMVECTOR Dot = XMPlaneDot(P, LightPosition); - P = XMVectorNegate(P); - XMVECTOR D = XMVectorSplatW(P); - XMVECTOR C = XMVectorSplatZ(P); - XMVECTOR B = XMVectorSplatY(P); - XMVECTOR A = XMVectorSplatX(P); - Dot = XMVectorSelect(Select0001.v, Dot, Select0001.v); - - XMMATRIX M; - M.r[3] = XMVectorMultiplyAdd(D, LightPosition, Dot); - Dot = XMVectorRotateLeft(Dot, 1); - M.r[2] = XMVectorMultiplyAdd(C, LightPosition, Dot); - Dot = XMVectorRotateLeft(Dot, 1); - M.r[1] = XMVectorMultiplyAdd(B, LightPosition, Dot); - Dot = XMVectorRotateLeft(Dot, 1); - M.r[0] = XMVectorMultiplyAdd(A, LightPosition, Dot); - return M; -} - -//------------------------------------------------------------------------------ -// View and projection initialization operations -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixLookAtLH -( - FXMVECTOR EyePosition, - FXMVECTOR FocusPosition, - FXMVECTOR UpDirection -) -{ - XMVECTOR EyeDirection = XMVectorSubtract(FocusPosition, EyePosition); - return XMMatrixLookToLH(EyePosition, EyeDirection, UpDirection); -} - 
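// Minimal usage sketch (illustrative only; the eye/focus/up values below are hypothetical and not part of the library): - // XMVECTOR Eye = XMVectorSet( 0.0f, 2.0f, -5.0f, 1.0f ); - // XMVECTOR Focus = XMVectorSet( 0.0f, 0.0f, 0.0f, 1.0f ); - // XMVECTOR Up = XMVectorSet( 0.0f, 1.0f, 0.0f, 0.0f ); - // XMMATRIX View = XMMatrixLookAtLH( Eye, Focus, Up ); -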
-//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixLookAtRH -( - FXMVECTOR EyePosition, - FXMVECTOR FocusPosition, - FXMVECTOR UpDirection -) -{ - XMVECTOR NegEyeDirection = XMVectorSubtract(EyePosition, FocusPosition); - return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection); -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixLookToLH -( - FXMVECTOR EyePosition, - FXMVECTOR EyeDirection, - FXMVECTOR UpDirection -) -{ - assert(!XMVector3Equal(EyeDirection, XMVectorZero())); - assert(!XMVector3IsInfinite(EyeDirection)); - assert(!XMVector3Equal(UpDirection, XMVectorZero())); - assert(!XMVector3IsInfinite(UpDirection)); - - XMVECTOR R2 = XMVector3Normalize(EyeDirection); - - XMVECTOR R0 = XMVector3Cross(UpDirection, R2); - R0 = XMVector3Normalize(R0); - - XMVECTOR R1 = XMVector3Cross(R2, R0); - - XMVECTOR NegEyePosition = XMVectorNegate(EyePosition); - - XMVECTOR D0 = XMVector3Dot(R0, NegEyePosition); - XMVECTOR D1 = XMVector3Dot(R1, NegEyePosition); - XMVECTOR D2 = XMVector3Dot(R2, NegEyePosition); - - XMMATRIX M; - M.r[0] = XMVectorSelect(D0, R0, g_XMSelect1110.v); - M.r[1] = XMVectorSelect(D1, R1, g_XMSelect1110.v); - M.r[2] = XMVectorSelect(D2, R2, g_XMSelect1110.v); - M.r[3] = g_XMIdentityR3.v; - - M = XMMatrixTranspose(M); - - return M; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixLookToRH -( - FXMVECTOR EyePosition, - FXMVECTOR EyeDirection, - FXMVECTOR UpDirection -) -{ - XMVECTOR NegEyeDirection = XMVectorNegate(EyeDirection); - return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection); -} - -//------------------------------------------------------------------------------ - -#pragma prefast(push) -#pragma prefast(disable:28931, "PREfast noise: Esp:1266") - -inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH -( - float ViewWidth, - float ViewHeight, - float NearZ, - float FarZ -) -{ - assert(NearZ > 0.f && FarZ > 0.f); - assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); - assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); - assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); - -#if defined(_XM_NO_INTRINSICS_) - - float TwoNearZ = NearZ + NearZ; - float fRange = FarZ / (FarZ - NearZ); - - XMMATRIX M; - M.m[0][0] = TwoNearZ / ViewWidth; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = TwoNearZ / ViewHeight; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = fRange; - M.m[2][3] = 1.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = -fRange * NearZ; - M.m[3][3] = 0.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float TwoNearZ = NearZ + NearZ; - float fRange = FarZ / (FarZ - NearZ); - const XMVECTOR Zero = vdupq_n_f32(0); - XMMATRIX M; - M.r[0] = vsetq_lane_f32( TwoNearZ / ViewWidth, Zero, 0 ); - M.r[1] = vsetq_lane_f32( TwoNearZ / ViewHeight, Zero, 1 ); - M.r[2] = vsetq_lane_f32( fRange, g_XMIdentityR3.v, 2 ); - M.r[3] = vsetq_lane_f32( -fRange * NearZ, Zero, 2 ); - return M; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX M; - float TwoNearZ = NearZ + NearZ; - float fRange = FarZ / (FarZ - NearZ); - // Note: This is recorded on the stack - XMVECTOR rMem = { - TwoNearZ / ViewWidth, - TwoNearZ / ViewHeight, - fRange, - -fRange * NearZ - }; - // Copy from memory to SSE register - XMVECTOR vValues = rMem; - XMVECTOR vTemp = 
_mm_setzero_ps(); - // Copy x only - vTemp = _mm_move_ss(vTemp,vValues); - // TwoNearZ / ViewWidth,0,0,0 - M.r[0] = vTemp; - // 0,TwoNearZ / ViewHeight,0,0 - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskY); - M.r[1] = vTemp; - // x=fRange,y=-fRange * NearZ,0,1.0f - vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); - // 0,0,fRange,1.0f - vTemp = _mm_setzero_ps(); - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); - M.r[2] = vTemp; - // 0,0,-fRange * NearZ,0 - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); - M.r[3] = vTemp; - - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH -( - float ViewWidth, - float ViewHeight, - float NearZ, - float FarZ -) -{ - assert(NearZ > 0.f && FarZ > 0.f); - assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); - assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); - assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); - -#if defined(_XM_NO_INTRINSICS_) - - float TwoNearZ = NearZ + NearZ; - float fRange = FarZ / (NearZ - FarZ); - - XMMATRIX M; - M.m[0][0] = TwoNearZ / ViewWidth; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = TwoNearZ / ViewHeight; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = fRange; - M.m[2][3] = -1.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = fRange * NearZ; - M.m[3][3] = 0.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float TwoNearZ = NearZ + NearZ; - float fRange = FarZ / (NearZ - FarZ); - const XMVECTOR Zero = vdupq_n_f32(0); - - XMMATRIX M; - M.r[0] = vsetq_lane_f32( TwoNearZ / ViewWidth, Zero, 0 ); - M.r[1] = vsetq_lane_f32( TwoNearZ / ViewHeight, Zero, 1 ); - M.r[2] = vsetq_lane_f32( fRange, g_XMNegIdentityR3.v, 2 ); - M.r[3] = vsetq_lane_f32( fRange * NearZ, Zero, 2 ); - return M; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX M; - float TwoNearZ = NearZ + NearZ; - float fRange = FarZ / (NearZ-FarZ); - // Note: This is recorded on the stack - XMVECTOR rMem = { - TwoNearZ / ViewWidth, - TwoNearZ / ViewHeight, - fRange, - fRange * NearZ - }; - // Copy from memory to SSE register - XMVECTOR vValues = rMem; - XMVECTOR vTemp = _mm_setzero_ps(); - // Copy x only - vTemp = _mm_move_ss(vTemp,vValues); - // TwoNearZ / ViewWidth,0,0,0 - M.r[0] = vTemp; - // 0,TwoNearZ / ViewHeight,0,0 - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskY); - M.r[1] = vTemp; - // x=fRange,y=fRange * NearZ,0,-1.0f - vValues = _mm_shuffle_ps(vValues,g_XMNegIdentityR3,_MM_SHUFFLE(3,2,3,2)); - // 0,0,fRange,-1.0f - vTemp = _mm_setzero_ps(); - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); - M.r[2] = vTemp; - // 0,0,fRange * NearZ,0 - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); - M.r[3] = vTemp; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH -( - float FovAngleY, - float AspectRatio, - float NearZ, - float FarZ -) -{ - assert(NearZ > 0.f && FarZ > 0.f); - assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); - assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); - assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); - -#if defined(_XM_NO_INTRINSICS_) - - float SinFov; - float CosFov; - XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); - - float Height = CosFov / SinFov; - float Width = Height / AspectRatio; - float fRange = 
FarZ / (FarZ-NearZ); - - XMMATRIX M; - M.m[0][0] = Width; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = Height; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = fRange; - M.m[2][3] = 1.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = -fRange * NearZ; - M.m[3][3] = 0.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float SinFov; - float CosFov; - XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); - - float fRange = FarZ / (FarZ-NearZ); - float Height = CosFov / SinFov; - float Width = Height / AspectRatio; - const XMVECTOR Zero = vdupq_n_f32(0); - - XMMATRIX M; - M.r[0] = vsetq_lane_f32( Width, Zero, 0 ); - M.r[1] = vsetq_lane_f32( Height, Zero, 1 ); - M.r[2] = vsetq_lane_f32( fRange, g_XMIdentityR3.v, 2 ); - M.r[3] = vsetq_lane_f32( -fRange * NearZ, Zero, 2 ); - return M; -#elif defined(_XM_SSE_INTRINSICS_) - float SinFov; - float CosFov; - XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); - - float fRange = FarZ / (FarZ-NearZ); - // Note: This is recorded on the stack - float Height = CosFov / SinFov; - XMVECTOR rMem = { - Height / AspectRatio, - Height, - fRange, - -fRange * NearZ - }; - // Copy from memory to SSE register - XMVECTOR vValues = rMem; - XMVECTOR vTemp = _mm_setzero_ps(); - // Copy x only - vTemp = _mm_move_ss(vTemp,vValues); - // Height / AspectRatio,0,0,0 - XMMATRIX M; - M.r[0] = vTemp; - // 0,Height,0,0 - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskY); - M.r[1] = vTemp; - // x=fRange,y=-fRange * NearZ,0,1.0f - vTemp = _mm_setzero_ps(); - vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); - // 0,0,fRange,1.0f - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); - M.r[2] = vTemp; - // 0,0,-fRange * NearZ,0.0f - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); - M.r[3] = vTemp; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH -( - float FovAngleY, - float AspectRatio, - float NearZ, - float FarZ -) -{ - assert(NearZ > 0.f && FarZ > 0.f); - assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); - assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); - assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); - -#if defined(_XM_NO_INTRINSICS_) - - float SinFov; - float CosFov; - XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); - - float Height = CosFov / SinFov; - float Width = Height / AspectRatio; - float fRange = FarZ / (NearZ-FarZ); - - XMMATRIX M; - M.m[0][0] = Width; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = Height; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = fRange; - M.m[2][3] = -1.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = fRange * NearZ; - M.m[3][3] = 0.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float SinFov; - float CosFov; - XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); - float fRange = FarZ / (NearZ-FarZ); - float Height = CosFov / SinFov; - float Width = Height / AspectRatio; - const XMVECTOR Zero = vdupq_n_f32(0); - - XMMATRIX M; - M.r[0] = vsetq_lane_f32( Width, Zero, 0 ); - M.r[1] = vsetq_lane_f32( Height, Zero, 1 ); - M.r[2] = vsetq_lane_f32( fRange, g_XMNegIdentityR3.v, 2 ); - M.r[3] = vsetq_lane_f32( fRange * NearZ, Zero, 2 ); - return M; -#elif defined(_XM_SSE_INTRINSICS_) - float SinFov; - float CosFov; - 
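// Height = CosFov / SinFov is the cotangent of the half field of view and becomes the vertical projection scale; the horizontal scale below divides Height by AspectRatio. -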
XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); - float fRange = FarZ / (NearZ-FarZ); - // Note: This is recorded on the stack - float Height = CosFov / SinFov; - XMVECTOR rMem = { - Height / AspectRatio, - Height, - fRange, - fRange * NearZ - }; - // Copy from memory to SSE register - XMVECTOR vValues = rMem; - XMVECTOR vTemp = _mm_setzero_ps(); - // Copy x only - vTemp = _mm_move_ss(vTemp,vValues); - // Height / AspectRatio,0,0,0 - XMMATRIX M; - M.r[0] = vTemp; - // 0,Height,0,0 - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskY); - M.r[1] = vTemp; - // x=fRange,y=fRange * NearZ,0,-1.0f - vTemp = _mm_setzero_ps(); - vValues = _mm_shuffle_ps(vValues,g_XMNegIdentityR3,_MM_SHUFFLE(3,2,3,2)); - // 0,0,fRange,-1.0f - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); - M.r[2] = vTemp; - // 0,0,fRange * NearZ,0.0f - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); - M.r[3] = vTemp; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH -( - float ViewLeft, - float ViewRight, - float ViewBottom, - float ViewTop, - float NearZ, - float FarZ -) -{ - assert(NearZ > 0.f && FarZ > 0.f); - assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); - assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); - assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); - -#if defined(_XM_NO_INTRINSICS_) - - float TwoNearZ = NearZ + NearZ; - float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = FarZ / (FarZ-NearZ); - - XMMATRIX M; - M.m[0][0] = TwoNearZ * ReciprocalWidth; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = TwoNearZ * ReciprocalHeight; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = -(ViewLeft + ViewRight) * ReciprocalWidth; - M.m[2][1] = -(ViewTop + ViewBottom) * ReciprocalHeight; - M.m[2][2] = fRange; - M.m[2][3] = 1.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = -fRange * NearZ; - M.m[3][3] = 0.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float TwoNearZ = NearZ + NearZ; - float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = FarZ / (FarZ-NearZ); - const XMVECTOR Zero = vdupq_n_f32(0); - - XMMATRIX M; - M.r[0] = vsetq_lane_f32( TwoNearZ * ReciprocalWidth, Zero, 0 ); - M.r[1] = vsetq_lane_f32( TwoNearZ * ReciprocalHeight, Zero, 1 ); - M.r[2] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, - -(ViewTop + ViewBottom) * ReciprocalHeight, - fRange, - 1.0f); - M.r[3] = vsetq_lane_f32( -fRange * NearZ, Zero, 2 ); - return M; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX M; - float TwoNearZ = NearZ+NearZ; - float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = FarZ / (FarZ-NearZ); - // Note: This is recorded on the stack - XMVECTOR rMem = { - TwoNearZ*ReciprocalWidth, - TwoNearZ*ReciprocalHeight, - -fRange * NearZ, - 0 - }; - // Copy from memory to SSE register - XMVECTOR vValues = rMem; - XMVECTOR vTemp = _mm_setzero_ps(); - // Copy x only - vTemp = _mm_move_ss(vTemp,vValues); - // TwoNearZ*ReciprocalWidth,0,0,0 - M.r[0] = vTemp; - // 0,TwoNearZ*ReciprocalHeight,0,0 - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskY); - M.r[1] = vTemp; - // -(ViewLeft + ViewRight) * ReciprocalWidth,-(ViewTop + ViewBottom) * ReciprocalHeight,fRange,1.0f - M.r[2] = XMVectorSet( -(ViewLeft + ViewRight) * ReciprocalWidth, - -(ViewTop 
+ ViewBottom) * ReciprocalHeight, - fRange, - 1.0f ); - // 0,0,-fRange * NearZ,0.0f - vValues = _mm_and_ps(vValues,g_XMMaskZ); - M.r[3] = vValues; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH -( - float ViewLeft, - float ViewRight, - float ViewBottom, - float ViewTop, - float NearZ, - float FarZ -) -{ - assert(NearZ > 0.f && FarZ > 0.f); - assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); - assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); - assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); - -#if defined(_XM_NO_INTRINSICS_) - - float TwoNearZ = NearZ + NearZ; - float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = FarZ / (NearZ-FarZ); - - XMMATRIX M; - M.m[0][0] = TwoNearZ * ReciprocalWidth; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = TwoNearZ * ReciprocalHeight; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = (ViewLeft + ViewRight) * ReciprocalWidth; - M.m[2][1] = (ViewTop + ViewBottom) * ReciprocalHeight; - M.m[2][2] = fRange; - M.m[2][3] = -1.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = fRange * NearZ; - M.m[3][3] = 0.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float TwoNearZ = NearZ + NearZ; - float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = FarZ / (NearZ-FarZ); - const XMVECTOR Zero = vdupq_n_f32(0); - - XMMATRIX M; - M.r[0] = vsetq_lane_f32( TwoNearZ * ReciprocalWidth, Zero, 0 ); - M.r[1] = vsetq_lane_f32( TwoNearZ * ReciprocalHeight, Zero, 1 ); - M.r[2] = XMVectorSet((ViewLeft + ViewRight) * ReciprocalWidth, - (ViewTop + ViewBottom) * ReciprocalHeight, - fRange, - -1.0f); - M.r[3] = vsetq_lane_f32( fRange * NearZ, Zero, 2 ); - return M; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX M; - float TwoNearZ = NearZ+NearZ; - float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = FarZ / (NearZ-FarZ); - // Note: This is recorded on the stack - XMVECTOR rMem = { - TwoNearZ*ReciprocalWidth, - TwoNearZ*ReciprocalHeight, - fRange * NearZ, - 0 - }; - // Copy from memory to SSE register - XMVECTOR vValues = rMem; - XMVECTOR vTemp = _mm_setzero_ps(); - // Copy x only - vTemp = _mm_move_ss(vTemp,vValues); - // TwoNearZ*ReciprocalWidth,0,0,0 - M.r[0] = vTemp; - // 0,TwoNearZ*ReciprocalHeight,0,0 - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskY); - M.r[1] = vTemp; - // (ViewLeft + ViewRight) * ReciprocalWidth,(ViewTop + ViewBottom) * ReciprocalHeight,fRange,-1.0f - M.r[2] = XMVectorSet( (ViewLeft + ViewRight) * ReciprocalWidth, - (ViewTop + ViewBottom) * ReciprocalHeight, - fRange, - -1.0f ); - // 0,0,fRange * NearZ,0.0f - vValues = _mm_and_ps(vValues,g_XMMaskZ); - M.r[3] = vValues; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixOrthographicLH -( - float ViewWidth, - float ViewHeight, - float NearZ, - float FarZ -) -{ - assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); - assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); - assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); - -#if defined(_XM_NO_INTRINSICS_) - - float fRange = 1.0f / (FarZ-NearZ); - - XMMATRIX M; - M.m[0][0] = 2.0f / ViewWidth; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = 2.0f / 
ViewHeight; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = fRange; - M.m[2][3] = 0.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = -fRange * NearZ; - M.m[3][3] = 1.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float fRange = 1.0f / (FarZ-NearZ); - - const XMVECTOR Zero = vdupq_n_f32(0); - XMMATRIX M; - M.r[0] = vsetq_lane_f32( 2.0f / ViewWidth, Zero, 0 ); - M.r[1] = vsetq_lane_f32( 2.0f / ViewHeight, Zero, 1 ); - M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); - M.r[3] = vsetq_lane_f32( -fRange * NearZ, g_XMIdentityR3.v, 2 ); - return M; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX M; - float fRange = 1.0f / (FarZ-NearZ); - // Note: This is recorded on the stack - XMVECTOR rMem = { - 2.0f / ViewWidth, - 2.0f / ViewHeight, - fRange, - -fRange * NearZ - }; - // Copy from memory to SSE register - XMVECTOR vValues = rMem; - XMVECTOR vTemp = _mm_setzero_ps(); - // Copy x only - vTemp = _mm_move_ss(vTemp,vValues); - // 2.0f / ViewWidth,0,0,0 - M.r[0] = vTemp; - // 0,2.0f / ViewHeight,0,0 - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskY); - M.r[1] = vTemp; - // x=fRange,y=-fRange * NearZ,0,1.0f - vTemp = _mm_setzero_ps(); - vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); - // 0,0,fRange,0.0f - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,0,0,0)); - M.r[2] = vTemp; - // 0,0,-fRange * NearZ,1.0f - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,1,0,0)); - M.r[3] = vTemp; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixOrthographicRH -( - float ViewWidth, - float ViewHeight, - float NearZ, - float FarZ -) -{ - assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); - assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); - assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); - -#if defined(_XM_NO_INTRINSICS_) - - float fRange = 1.0f / (NearZ-FarZ); - - XMMATRIX M; - M.m[0][0] = 2.0f / ViewWidth; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = 2.0f / ViewHeight; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = fRange; - M.m[2][3] = 0.0f; - - M.m[3][0] = 0.0f; - M.m[3][1] = 0.0f; - M.m[3][2] = fRange * NearZ; - M.m[3][3] = 1.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float fRange = 1.0f / (NearZ-FarZ); - - const XMVECTOR Zero = vdupq_n_f32(0); - XMMATRIX M; - M.r[0] = vsetq_lane_f32( 2.0f / ViewWidth, Zero, 0 ); - M.r[1] = vsetq_lane_f32( 2.0f / ViewHeight, Zero, 1 ); - M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); - M.r[3] = vsetq_lane_f32( fRange * NearZ, g_XMIdentityR3.v, 2 ); - return M; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX M; - float fRange = 1.0f / (NearZ-FarZ); - // Note: This is recorded on the stack - XMVECTOR rMem = { - 2.0f / ViewWidth, - 2.0f / ViewHeight, - fRange, - fRange * NearZ - }; - // Copy from memory to SSE register - XMVECTOR vValues = rMem; - XMVECTOR vTemp = _mm_setzero_ps(); - // Copy x only - vTemp = _mm_move_ss(vTemp,vValues); - // 2.0f / ViewWidth,0,0,0 - M.r[0] = vTemp; - // 0,2.0f / ViewHeight,0,0 - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskY); - M.r[1] = vTemp; - // x=fRange,y=fRange * NearZ,0,1.0f - vTemp = _mm_setzero_ps(); - vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); - // 0,0,fRange,0.0f - vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,0,0,0)); - M.r[2] = vTemp; - // 0,0,fRange * NearZ,1.0f - vTemp = 
_mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,1,0,0)); - M.r[3] = vTemp; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH -( - float ViewLeft, - float ViewRight, - float ViewBottom, - float ViewTop, - float NearZ, - float FarZ -) -{ - assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); - assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); - assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); - -#if defined(_XM_NO_INTRINSICS_) - - float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = 1.0f / (FarZ-NearZ); - - XMMATRIX M; - M.m[0][0] = ReciprocalWidth + ReciprocalWidth; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = ReciprocalHeight + ReciprocalHeight; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = fRange; - M.m[2][3] = 0.0f; - - M.m[3][0] = -(ViewLeft + ViewRight) * ReciprocalWidth; - M.m[3][1] = -(ViewTop + ViewBottom) * ReciprocalHeight; - M.m[3][2] = -fRange * NearZ; - M.m[3][3] = 1.0f; - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = 1.0f / (FarZ-NearZ); - const XMVECTOR Zero = vdupq_n_f32(0); - XMMATRIX M; - M.r[0] = vsetq_lane_f32( ReciprocalWidth + ReciprocalWidth, Zero, 0 ); - M.r[1] = vsetq_lane_f32( ReciprocalHeight + ReciprocalHeight, Zero, 1 ); - M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); - M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, - -(ViewTop + ViewBottom) * ReciprocalHeight, - -fRange * NearZ, - 1.0f); - return M; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX M; - float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = 1.0f / (FarZ-NearZ); - // Note: This is recorded on the stack - XMVECTOR rMem = { - fReciprocalWidth, - fReciprocalHeight, - fRange, - 1.0f - }; - XMVECTOR rMem2 = { - -(ViewLeft + ViewRight), - -(ViewTop + ViewBottom), - -NearZ, - 1.0f - }; - // Copy from memory to SSE register - XMVECTOR vValues = rMem; - XMVECTOR vTemp = _mm_setzero_ps(); - // Copy x only - vTemp = _mm_move_ss(vTemp,vValues); - // fReciprocalWidth*2,0,0,0 - vTemp = _mm_add_ss(vTemp,vTemp); - M.r[0] = vTemp; - // 0,fReciprocalHeight*2,0,0 - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskY); - vTemp = _mm_add_ps(vTemp,vTemp); - M.r[1] = vTemp; - // 0,0,fRange,0.0f - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskZ); - M.r[2] = vTemp; - // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f - vValues = _mm_mul_ps(vValues,rMem2); - M.r[3] = vValues; - return M; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH -( - float ViewLeft, - float ViewRight, - float ViewBottom, - float ViewTop, - float NearZ, - float FarZ -) -{ - assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); - assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); - assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); - -#if defined(_XM_NO_INTRINSICS_) - - float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = 1.0f / (NearZ-FarZ); - - XMMATRIX M; - 
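// fRange = 1/(NearZ-FarZ): right-handed view depth runs along -z, so z' = z*fRange + fRange*NearZ maps z = -NearZ to 0 and z = -FarZ to 1. -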
M.m[0][0] = ReciprocalWidth + ReciprocalWidth; - M.m[0][1] = 0.0f; - M.m[0][2] = 0.0f; - M.m[0][3] = 0.0f; - - M.m[1][0] = 0.0f; - M.m[1][1] = ReciprocalHeight + ReciprocalHeight; - M.m[1][2] = 0.0f; - M.m[1][3] = 0.0f; - - M.m[2][0] = 0.0f; - M.m[2][1] = 0.0f; - M.m[2][2] = fRange; - M.m[2][3] = 0.0f; - - M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, - -(ViewTop + ViewBottom) * ReciprocalHeight, - fRange * NearZ, - 1.0f); - return M; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = 1.0f / (NearZ-FarZ); - const XMVECTOR Zero = vdupq_n_f32(0); - XMMATRIX M; - M.r[0] = vsetq_lane_f32( ReciprocalWidth + ReciprocalWidth, Zero, 0 ); - M.r[1] = vsetq_lane_f32( ReciprocalHeight + ReciprocalHeight, Zero, 1 ); - M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); - M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, - -(ViewTop + ViewBottom) * ReciprocalHeight, - fRange * NearZ, - 1.0f); - return M; -#elif defined(_XM_SSE_INTRINSICS_) - XMMATRIX M; - float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft); - float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom); - float fRange = 1.0f / (NearZ-FarZ); - // Note: This is recorded on the stack - XMVECTOR rMem = { - fReciprocalWidth, - fReciprocalHeight, - fRange, - 1.0f - }; - XMVECTOR rMem2 = { - -(ViewLeft + ViewRight), - -(ViewTop + ViewBottom), - NearZ, - 1.0f - }; - // Copy from memory to SSE register - XMVECTOR vValues = rMem; - XMVECTOR vTemp = _mm_setzero_ps(); - // Copy x only - vTemp = _mm_move_ss(vTemp,vValues); - // fReciprocalWidth*2,0,0,0 - vTemp = _mm_add_ss(vTemp,vTemp); - M.r[0] = vTemp; - // 0,fReciprocalHeight*2,0,0 - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskY); - vTemp = _mm_add_ps(vTemp,vTemp); - M.r[1] = vTemp; - // 0,0,fRange,0.0f - vTemp = vValues; - vTemp = _mm_and_ps(vTemp,g_XMMaskZ); - M.r[2] = vTemp; - // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f - vValues = _mm_mul_ps(vValues,rMem2); - M.r[3] = vValues; - return M; -#endif -} - -#pragma prefast(pop) - -/**************************************************************************** - * - * XMMATRIX operators and methods - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline XMMATRIX::XMMATRIX -( - float m00, float m01, float m02, float m03, - float m10, float m11, float m12, float m13, - float m20, float m21, float m22, float m23, - float m30, float m31, float m32, float m33 -) -{ - r[0] = XMVectorSet(m00, m01, m02, m03); - r[1] = XMVectorSet(m10, m11, m12, m13); - r[2] = XMVectorSet(m20, m21, m22, m23); - r[3] = XMVectorSet(m30, m31, m32, m33); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMMATRIX::XMMATRIX -( - const float* pArray -) -{ - assert( pArray != nullptr ); - r[0] = XMLoadFloat4((const XMFLOAT4*)pArray); - r[1] = XMLoadFloat4((const XMFLOAT4*)(pArray + 4)); - r[2] = XMLoadFloat4((const XMFLOAT4*)(pArray + 8)); - r[3] = XMLoadFloat4((const XMFLOAT4*)(pArray + 12)); -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XMMATRIX::operator- () const -{ - XMMATRIX R; - R.r[0] = XMVectorNegate( r[0] ); - R.r[1] = XMVectorNegate( r[1] ); - R.r[2] = XMVectorNegate( r[2] ); - R.r[3] = XMVectorNegate( r[3] ); - 
return R; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX& XM_CALLCONV XMMATRIX::operator+= (FXMMATRIX M) -{ - r[0] = XMVectorAdd( r[0], M.r[0] ); - r[1] = XMVectorAdd( r[1], M.r[1] ); - r[2] = XMVectorAdd( r[2], M.r[2] ); - r[3] = XMVectorAdd( r[3], M.r[3] ); - return *this; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX& XM_CALLCONV XMMATRIX::operator-= (FXMMATRIX M) -{ - r[0] = XMVectorSubtract( r[0], M.r[0] ); - r[1] = XMVectorSubtract( r[1], M.r[1] ); - r[2] = XMVectorSubtract( r[2], M.r[2] ); - r[3] = XMVectorSubtract( r[3], M.r[3] ); - return *this; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX& XM_CALLCONV XMMATRIX::operator*=(FXMMATRIX M) -{ - *this = XMMatrixMultiply( *this, M ); - return *this; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX& XMMATRIX::operator*= (float S) -{ - r[0] = XMVectorScale( r[0], S ); - r[1] = XMVectorScale( r[1], S ); - r[2] = XMVectorScale( r[2], S ); - r[3] = XMVectorScale( r[3], S ); - return *this; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX& XMMATRIX::operator/= (float S) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vS = XMVectorReplicate( S ); - r[0] = XMVectorDivide( r[0], vS ); - r[1] = XMVectorDivide( r[1], vS ); - r[2] = XMVectorDivide( r[2], vS ); - r[3] = XMVectorDivide( r[3], vS ); - return *this; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // 2 iterations of Newton-Raphson refinement of reciprocal - float32x2_t vS = vdup_n_f32( S ); - float32x2_t R0 = vrecpe_f32( vS ); - float32x2_t S0 = vrecps_f32( R0, vS ); - R0 = vmul_f32( S0, R0 ); - S0 = vrecps_f32( R0, vS ); - R0 = vmul_f32( S0, R0 ); - float32x4_t Reciprocal = vcombine_u32(R0, R0); - r[0] = vmulq_f32( r[0], Reciprocal ); - r[1] = vmulq_f32( r[1], Reciprocal ); - r[2] = vmulq_f32( r[2], Reciprocal ); - r[3] = vmulq_f32( r[3], Reciprocal ); - return *this; -#elif defined(_XM_SSE_INTRINSICS_) - __m128 vS = _mm_set_ps1( S ); - r[0] = _mm_div_ps( r[0], vS ); - r[1] = _mm_div_ps( r[1], vS ); - r[2] = _mm_div_ps( r[2], vS ); - r[3] = _mm_div_ps( r[3], vS ); - return *this; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMATRIX::operator+ (FXMMATRIX M) const -{ - XMMATRIX R; - R.r[0] = XMVectorAdd( r[0], M.r[0] ); - R.r[1] = XMVectorAdd( r[1], M.r[1] ); - R.r[2] = XMVectorAdd( r[2], M.r[2] ); - R.r[3] = XMVectorAdd( r[3], M.r[3] ); - return R; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMATRIX::operator- (FXMMATRIX M) const -{ - XMMATRIX R; - R.r[0] = XMVectorSubtract( r[0], M.r[0] ); - R.r[1] = XMVectorSubtract( r[1], M.r[1] ); - R.r[2] = XMVectorSubtract( r[2], M.r[2] ); - R.r[3] = XMVectorSubtract( r[3], M.r[3] ); - return R; -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV XMMATRIX::operator*(FXMMATRIX M) const -{ - return XMMatrixMultiply(*this, M); -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XMMATRIX::operator* (float S) const -{ - XMMATRIX R; - R.r[0] = XMVectorScale( r[0], S ); - R.r[1] = XMVectorScale( r[1], S ); - R.r[2] = XMVectorScale( r[2], S ); - R.r[3] = XMVectorScale( r[3], S ); - return R; -} - 
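-// Illustrative usage (hypothetical 'world' matrix): the scalar operators act
-// element-wise on all 16 entries, so world * 0.5f and world / 2.0f agree up
-// to rounding (the NEON divide path below uses a refined reciprocal estimate):
-//   XMMATRIX halved     = world * 0.5f;
-//   XMMATRIX sameHalved = world / 2.0f;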
-//------------------------------------------------------------------------------ - -inline XMMATRIX XMMATRIX::operator/ (float S) const -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vS = XMVectorReplicate( S ); - XMMATRIX R; - R.r[0] = XMVectorDivide( r[0], vS ); - R.r[1] = XMVectorDivide( r[1], vS ); - R.r[2] = XMVectorDivide( r[2], vS ); - R.r[3] = XMVectorDivide( r[3], vS ); - return R; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // 2 iterations of Newton-Raphson refinement of reciprocal - float32x2_t vS = vdup_n_f32( S ); - float32x2_t R0 = vrecpe_f32( vS ); - float32x2_t S0 = vrecps_f32( R0, vS ); - R0 = vmul_f32( S0, R0 ); - S0 = vrecps_f32( R0, vS ); - R0 = vmul_f32( S0, R0 ); - float32x4_t Reciprocal = vcombine_u32(R0, R0); - XMMATRIX R; - R.r[0] = vmulq_f32( r[0], Reciprocal ); - R.r[1] = vmulq_f32( r[1], Reciprocal ); - R.r[2] = vmulq_f32( r[2], Reciprocal ); - R.r[3] = vmulq_f32( r[3], Reciprocal ); - return R; -#elif defined(_XM_SSE_INTRINSICS_) - __m128 vS = _mm_set_ps1( S ); - XMMATRIX R; - R.r[0] = _mm_div_ps( r[0], vS ); - R.r[1] = _mm_div_ps( r[1], vS ); - R.r[2] = _mm_div_ps( r[2], vS ); - R.r[3] = _mm_div_ps( r[3], vS ); - return R; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMMATRIX XM_CALLCONV operator* -( - float S, - FXMMATRIX M -) -{ - XMMATRIX R; - R.r[0] = XMVectorScale( M.r[0], S ); - R.r[1] = XMVectorScale( M.r[1], S ); - R.r[2] = XMVectorScale( M.r[2], S ); - R.r[3] = XMVectorScale( M.r[3], S ); - return R; -} - -/**************************************************************************** - * - * XMFLOAT3X3 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMFLOAT3X3::XMFLOAT3X3 -( - const float* pArray -) -{ - assert( pArray != nullptr ); - for (size_t Row = 0; Row < 3; Row++) - { - for (size_t Column = 0; Column < 3; Column++) - { - m[Row][Column] = pArray[Row * 3 + Column]; - } - } -} - -//------------------------------------------------------------------------------ - -inline XMFLOAT3X3& XMFLOAT3X3::operator= -( - const XMFLOAT3X3& Float3x3 -) -{ - _11 = Float3x3._11; - _12 = Float3x3._12; - _13 = Float3x3._13; - _21 = Float3x3._21; - _22 = Float3x3._22; - _23 = Float3x3._23; - _31 = Float3x3._31; - _32 = Float3x3._32; - _33 = Float3x3._33; - - return *this; -} - -/**************************************************************************** - * - * XMFLOAT4X3 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMFLOAT4X3::XMFLOAT4X3 -( - const float* pArray -) -{ - assert( pArray != nullptr ); - - m[0][0] = pArray[0]; - m[0][1] = pArray[1]; - m[0][2] = pArray[2]; - - m[1][0] = pArray[3]; - m[1][1] = pArray[4]; - m[1][2] = pArray[5]; - - m[2][0] = pArray[6]; - m[2][1] = pArray[7]; - m[2][2] = pArray[8]; - - m[3][0] = pArray[9]; - m[3][1] = pArray[10]; - m[3][2] = pArray[11]; -} - -//------------------------------------------------------------------------------ - -inline XMFLOAT4X3& XMFLOAT4X3::operator= -( - const XMFLOAT4X3& Float4x3 -) -{ - XMVECTOR V1 = XMLoadFloat4((const XMFLOAT4*)&Float4x3._11); - XMVECTOR V2 = XMLoadFloat4((const XMFLOAT4*)&Float4x3._22); - XMVECTOR V3 = XMLoadFloat4((const XMFLOAT4*)&Float4x3._33); - - XMStoreFloat4((XMFLOAT4*)&_11, V1); - 
XMStoreFloat4((XMFLOAT4*)&_22, V2); - XMStoreFloat4((XMFLOAT4*)&_33, V3); - - return *this; -} - -//------------------------------------------------------------------------------ - -inline XMFLOAT4X3A& XMFLOAT4X3A::operator= -( - const XMFLOAT4X3A& Float4x3 -) -{ - XMVECTOR V1 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x3._11); - XMVECTOR V2 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x3._22); - XMVECTOR V3 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x3._33); - - XMStoreFloat4A((XMFLOAT4A*)&_11, V1); - XMStoreFloat4A((XMFLOAT4A*)&_22, V2); - XMStoreFloat4A((XMFLOAT4A*)&_33, V3); - - return *this; -} - -/**************************************************************************** - * - * XMFLOAT4X4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMFLOAT4X4::XMFLOAT4X4 -( - const float* pArray -) -{ - assert( pArray != nullptr ); - - m[0][0] = pArray[0]; - m[0][1] = pArray[1]; - m[0][2] = pArray[2]; - m[0][3] = pArray[3]; - - m[1][0] = pArray[4]; - m[1][1] = pArray[5]; - m[1][2] = pArray[6]; - m[1][3] = pArray[7]; - - m[2][0] = pArray[8]; - m[2][1] = pArray[9]; - m[2][2] = pArray[10]; - m[2][3] = pArray[11]; - - m[3][0] = pArray[12]; - m[3][1] = pArray[13]; - m[3][2] = pArray[14]; - m[3][3] = pArray[15]; -} - -//------------------------------------------------------------------------------ - -inline XMFLOAT4X4& XMFLOAT4X4::operator= -( - const XMFLOAT4X4& Float4x4 -) -{ - XMVECTOR V1 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._11); - XMVECTOR V2 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._21); - XMVECTOR V3 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._31); - XMVECTOR V4 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._41); - - XMStoreFloat4((XMFLOAT4*)&_11, V1); - XMStoreFloat4((XMFLOAT4*)&_21, V2); - XMStoreFloat4((XMFLOAT4*)&_31, V3); - XMStoreFloat4((XMFLOAT4*)&_41, V4); - - return *this; -} - -//------------------------------------------------------------------------------ - -inline XMFLOAT4X4A& XMFLOAT4X4A::operator= -( - const XMFLOAT4X4A& Float4x4 -) -{ - XMVECTOR V1 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._11); - XMVECTOR V2 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._21); - XMVECTOR V3 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._31); - XMVECTOR V4 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._41); - - XMStoreFloat4A((XMFLOAT4A*)&_11, V1); - XMStoreFloat4A((XMFLOAT4A*)&_21, V2); - XMStoreFloat4A((XMFLOAT4A*)&_31, V3); - XMStoreFloat4A((XMFLOAT4A*)&_41, V4); - - return *this; -} - +//------------------------------------------------------------------------------------- +// DirectXMathMatrix.inl -- SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +/**************************************************************************** + * + * Matrix + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +// Return true if any entry in the matrix is NaN +inline bool XM_CALLCONV XMMatrixIsNaN +( + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + size_t i = 16; + const uint32_t *pWork = (const uint32_t *)(&M.m[0][0]); + do { + // Fetch value into integer unit + uint32_t uTest = pWork[0]; + // Remove sign + uTest &= 0x7FFFFFFFU; + // NaN is 0x7F800001 through 0x7FFFFFFF inclusive + uTest -= 0x7F800001U; + if (uTest<0x007FFFFFU) { + break; // NaN found + } + ++pWork; // Next entry + } while (--i); + return (i!=0); // i == 0 if nothing matched +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Load in registers + XMVECTOR vX = M.r[0]; + XMVECTOR vY = M.r[1]; + XMVECTOR vZ = M.r[2]; + XMVECTOR vW = M.r[3]; + // Test themselves to check for NaN + vX = vmvnq_u32(vceqq_f32(vX, vX)); + vY = vmvnq_u32(vceqq_f32(vY, vY)); + vZ = vmvnq_u32(vceqq_f32(vZ, vZ)); + vW = vmvnq_u32(vceqq_f32(vW, vW)); + // Or all the results + vX = vorrq_u32(vX,vZ); + vY = vorrq_u32(vY,vW); + vX = vorrq_u32(vX,vY); + // If any tested true, return true + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vX), vget_high_u8(vX)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + return (r != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Load in registers + XMVECTOR vX = M.r[0]; + XMVECTOR vY = M.r[1]; + XMVECTOR vZ = M.r[2]; + XMVECTOR vW = M.r[3]; + // Test themselves to check for NaN + vX = _mm_cmpneq_ps(vX,vX); + vY = _mm_cmpneq_ps(vY,vY); + vZ = _mm_cmpneq_ps(vZ,vZ); + vW = _mm_cmpneq_ps(vW,vW); + // Or all the results + vX = _mm_or_ps(vX,vZ); + vY = _mm_or_ps(vY,vW); + vX = _mm_or_ps(vX,vY); + // If any tested true, return true + return (_mm_movemask_ps(vX)!=0); +#else +#endif +} + +//------------------------------------------------------------------------------ + +// Return true if any entry in the matrix is +/-INF +inline bool XM_CALLCONV XMMatrixIsInfinite +( + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + size_t i = 16; + const uint32_t *pWork = (const uint32_t *)(&M.m[0][0]); + do { + // Fetch value into integer unit + uint32_t uTest = pWork[0]; + // Remove sign + uTest &= 0x7FFFFFFFU; + // INF is 0x7F800000 + if (uTest==0x7F800000U) { + break; // INF found + } + ++pWork; // Next entry + } while (--i); + return (i!=0); // i == 0 if nothing matched +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bits + XMVECTOR vTemp1 = vandq_u32(M.r[0],g_XMAbsMask); + XMVECTOR vTemp2 = vandq_u32(M.r[1],g_XMAbsMask); + XMVECTOR vTemp3 = vandq_u32(M.r[2],g_XMAbsMask); + XMVECTOR vTemp4 = vandq_u32(M.r[3],g_XMAbsMask); + // Compare to infinity + vTemp1 = vceqq_f32(vTemp1,g_XMInfinity); + vTemp2 = vceqq_f32(vTemp2,g_XMInfinity); + vTemp3 = vceqq_f32(vTemp3,g_XMInfinity); + vTemp4 = vceqq_f32(vTemp4,g_XMInfinity); + // Or the answers together + vTemp1 = vorrq_u32(vTemp1,vTemp2); + vTemp3 = vorrq_u32(vTemp3,vTemp4); + vTemp1 = vorrq_u32(vTemp1,vTemp3); + // If any are infinity, 
the signs are true. + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + return (r != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bits + XMVECTOR vTemp1 = _mm_and_ps(M.r[0],g_XMAbsMask); + XMVECTOR vTemp2 = _mm_and_ps(M.r[1],g_XMAbsMask); + XMVECTOR vTemp3 = _mm_and_ps(M.r[2],g_XMAbsMask); + XMVECTOR vTemp4 = _mm_and_ps(M.r[3],g_XMAbsMask); + // Compare to infinity + vTemp1 = _mm_cmpeq_ps(vTemp1,g_XMInfinity); + vTemp2 = _mm_cmpeq_ps(vTemp2,g_XMInfinity); + vTemp3 = _mm_cmpeq_ps(vTemp3,g_XMInfinity); + vTemp4 = _mm_cmpeq_ps(vTemp4,g_XMInfinity); + // Or the answers together + vTemp1 = _mm_or_ps(vTemp1,vTemp2); + vTemp3 = _mm_or_ps(vTemp3,vTemp4); + vTemp1 = _mm_or_ps(vTemp1,vTemp3); + // If any are infinity, the signs are true. + return (_mm_movemask_ps(vTemp1)!=0); +#endif +} + +//------------------------------------------------------------------------------ + +// Return true if the XMMatrix is equal to identity +inline bool XM_CALLCONV XMMatrixIsIdentity +( + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + // Use the integer pipeline to reduce branching to a minimum + const uint32_t *pWork = (const uint32_t*)(&M.m[0][0]); + // Convert 1.0f to zero and or them together + uint32_t uOne = pWork[0]^0x3F800000U; + // Or all the 0.0f entries together + uint32_t uZero = pWork[1]; + uZero |= pWork[2]; + uZero |= pWork[3]; + // 2nd row + uZero |= pWork[4]; + uOne |= pWork[5]^0x3F800000U; + uZero |= pWork[6]; + uZero |= pWork[7]; + // 3rd row + uZero |= pWork[8]; + uZero |= pWork[9]; + uOne |= pWork[10]^0x3F800000U; + uZero |= pWork[11]; + // 4th row + uZero |= pWork[12]; + uZero |= pWork[13]; + uZero |= pWork[14]; + uOne |= pWork[15]^0x3F800000U; + // If all zero entries are zero, the uZero==0 + uZero &= 0x7FFFFFFF; // Allow -0.0f + // If all 1.0f entries are 1.0f, then uOne==0 + uOne |= uZero; + return (uOne==0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vTemp1 = vceqq_f32(M.r[0],g_XMIdentityR0); + XMVECTOR vTemp2 = vceqq_f32(M.r[1],g_XMIdentityR1); + XMVECTOR vTemp3 = vceqq_f32(M.r[2],g_XMIdentityR2); + XMVECTOR vTemp4 = vceqq_f32(M.r[3],g_XMIdentityR3); + vTemp1 = vandq_u32(vTemp1,vTemp2); + vTemp3 = vandq_u32(vTemp3,vTemp4); + vTemp1 = vandq_u32(vTemp1,vTemp3); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + return ( r == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = _mm_cmpeq_ps(M.r[0],g_XMIdentityR0); + XMVECTOR vTemp2 = _mm_cmpeq_ps(M.r[1],g_XMIdentityR1); + XMVECTOR vTemp3 = _mm_cmpeq_ps(M.r[2],g_XMIdentityR2); + XMVECTOR vTemp4 = _mm_cmpeq_ps(M.r[3],g_XMIdentityR3); + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + vTemp3 = _mm_and_ps(vTemp3,vTemp4); + vTemp1 = _mm_and_ps(vTemp1,vTemp3); + return (_mm_movemask_ps(vTemp1)==0x0f); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Perform a 4x4 matrix multiply by a 4x4 matrix +inline XMMATRIX XM_CALLCONV XMMatrixMultiply +( + FXMMATRIX M1, + CXMMATRIX M2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX mResult; + // Cache the invariants in registers + float x = M1.m[0][0]; + float y = M1.m[0][1]; + float z = M1.m[0][2]; 
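+ // w completes the cached row; each result element below is the dot product of this M1 row with a column of M2.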
+ float w = M1.m[0][3]; + // Perform the operation on the first row + mResult.m[0][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); + mResult.m[0][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); + mResult.m[0][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); + mResult.m[0][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); + // Repeat for all the other rows + x = M1.m[1][0]; + y = M1.m[1][1]; + z = M1.m[1][2]; + w = M1.m[1][3]; + mResult.m[1][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); + mResult.m[1][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); + mResult.m[1][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); + mResult.m[1][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); + x = M1.m[2][0]; + y = M1.m[2][1]; + z = M1.m[2][2]; + w = M1.m[2][3]; + mResult.m[2][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); + mResult.m[2][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); + mResult.m[2][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); + mResult.m[2][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); + x = M1.m[3][0]; + y = M1.m[3][1]; + z = M1.m[3][2]; + w = M1.m[3][3]; + mResult.m[3][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); + mResult.m[3][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); + mResult.m[3][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); + mResult.m[3][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); + return mResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX mResult; + float32x2_t VL = vget_low_f32( M1.r[0] ); + float32x2_t VH = vget_high_f32( M1.r[0] ); + // Perform the operation on the first row + XMVECTOR vX = vmulq_lane_f32(M2.r[0], VL, 0); + XMVECTOR vY = vmulq_lane_f32(M2.r[1], VL, 1); + XMVECTOR vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + XMVECTOR vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[0] = vaddq_f32( vZ, vW ); + // Repeat for the other 3 rows + VL = vget_low_f32( M1.r[1] ); + VH = vget_high_f32( M1.r[1] ); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[1] = vaddq_f32( vZ, vW ); + VL = vget_low_f32( M1.r[2] ); + VH = vget_high_f32( M1.r[2] ); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[2] = vaddq_f32( vZ, vW ); + VL = vget_low_f32( M1.r[3] ); + VH = vget_high_f32( M1.r[3] ); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + mResult.r[3] = vaddq_f32( vZ, vW ); + return mResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX mResult; + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + // Perform a binary add to reduce cumulative errors + vX = _mm_add_ps(vX,vZ); + vY = 
_mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + mResult.r[0] = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + mResult.r[1] = vX; + vW = M1.r[2]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + mResult.r[2] = vX; + vW = M1.r[3]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + mResult.r[3] = vX; + return mResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose +( + FXMMATRIX M1, + CXMMATRIX M2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX mResult; + // Cache the invariants in registers + float x = M2.m[0][0]; + float y = M2.m[1][0]; + float z = M2.m[2][0]; + float w = M2.m[3][0]; + // Perform the operation on the first row + mResult.m[0][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); + mResult.m[0][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); + mResult.m[0][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); + mResult.m[0][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); + // Repeat for all the other rows + x = M2.m[0][1]; + y = M2.m[1][1]; + z = M2.m[2][1]; + w = M2.m[3][1]; + mResult.m[1][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); + mResult.m[1][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); + mResult.m[1][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); + mResult.m[1][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); + x = M2.m[0][2]; + y = M2.m[1][2]; + z = M2.m[2][2]; + w = M2.m[3][2]; + mResult.m[2][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); + mResult.m[2][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); + mResult.m[2][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); + mResult.m[2][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); + x = M2.m[0][3]; + y = M2.m[1][3]; + z = M2.m[2][3]; + w = M2.m[3][3]; + mResult.m[3][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); + mResult.m[3][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); + mResult.m[3][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); + mResult.m[3][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); + return mResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( M1.r[0] ); + float32x2_t VH = vget_high_f32( M1.r[0] ); + // Perform the operation on the 
first row + XMVECTOR vX = vmulq_lane_f32(M2.r[0], VL, 0); + XMVECTOR vY = vmulq_lane_f32(M2.r[1], VL, 1); + XMVECTOR vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + XMVECTOR vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r0 = vaddq_f32( vZ, vW ); + // Repeat for the other 3 rows + VL = vget_low_f32( M1.r[1] ); + VH = vget_high_f32( M1.r[1] ); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r1 = vaddq_f32( vZ, vW ); + VL = vget_low_f32( M1.r[2] ); + VH = vget_high_f32( M1.r[2] ); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r2 = vaddq_f32( vZ, vW ); + VL = vget_low_f32( M1.r[3] ); + VH = vget_high_f32( M1.r[3] ); + vX = vmulq_lane_f32(M2.r[0], VL, 0); + vY = vmulq_lane_f32(M2.r[1], VL, 1); + vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0); + vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1); + float32x4_t r3 = vaddq_f32( vZ, vW ); + + // Transpose result + float32x4x2_t P0 = vzipq_f32( r0, r2 ); + float32x4x2_t P1 = vzipq_f32( r1, r3 ); + + float32x4x2_t T0 = vzipq_f32( P0.val[0], P1.val[0] ); + float32x4x2_t T1 = vzipq_f32( P0.val[1], P1.val[1] ); + + XMMATRIX mResult; + mResult.r[0] = T0.val[0]; + mResult.r[1] = T0.val[1]; + mResult.r[2] = T1.val[0]; + mResult.r[3] = T1.val[1]; + return mResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + // Perform a binary add to reduce cumulative errors + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + __m128 r0 = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + __m128 r1 = vX; + vW = M1.r[2]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + __m128 r2 = vX; + vW = M1.r[3]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + __m128 r3 = vX; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0)); + // x.z,x.w,y.z,y.w + 
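// (two-level shuffle transpose of the product rows r0..r3, identical to XMMatrixTranspose below) +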
XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2)); + + XMMATRIX mResult; + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTranspose +( + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + // Original matrix: + // + // m00m01m02m03 + // m10m11m12m13 + // m20m21m22m23 + // m30m31m32m33 + + XMMATRIX P; + P.r[0] = XMVectorMergeXY(M.r[0], M.r[2]); // m00m20m01m21 + P.r[1] = XMVectorMergeXY(M.r[1], M.r[3]); // m10m30m11m31 + P.r[2] = XMVectorMergeZW(M.r[0], M.r[2]); // m02m22m03m23 + P.r[3] = XMVectorMergeZW(M.r[1], M.r[3]); // m12m32m13m33 + + XMMATRIX MT; + MT.r[0] = XMVectorMergeXY(P.r[0], P.r[1]); // m00m10m20m30 + MT.r[1] = XMVectorMergeZW(P.r[0], P.r[1]); // m01m11m21m31 + MT.r[2] = XMVectorMergeXY(P.r[2], P.r[3]); // m02m12m22m32 + MT.r[3] = XMVectorMergeZW(P.r[2], P.r[3]); // m03m13m23m33 + return MT; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4x2_t P0 = vzipq_f32( M.r[0], M.r[2] ); + float32x4x2_t P1 = vzipq_f32( M.r[1], M.r[3] ); + + float32x4x2_t T0 = vzipq_f32( P0.val[0], P1.val[0] ); + float32x4x2_t T1 = vzipq_f32( P0.val[1], P1.val[1] ); + + XMMATRIX mResult; + mResult.r[0] = T0.val[0]; + mResult.r[1] = T0.val[1]; + mResult.r[2] = T1.val[0]; + mResult.r[3] = T1.val[1]; + return mResult; +#elif defined(_XM_SSE_INTRINSICS_) + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0],M.r[1],_MM_SHUFFLE(1,0,1,0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0],M.r[1],_MM_SHUFFLE(3,2,3,2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2],M.r[3],_MM_SHUFFLE(1,0,1,0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2],M.r[3],_MM_SHUFFLE(3,2,3,2)); + XMMATRIX mResult; + + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ +// Return the inverse and the determinant of a 4x4 matrix +_Use_decl_annotations_ +inline XMMATRIX XM_CALLCONV XMMatrixInverse +( + XMVECTOR* pDeterminant, + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMMATRIX MT = XMMatrixTranspose(M); + + XMVECTOR V0[4], V1[4]; + V0[0] = XMVectorSwizzle(MT.r[2]); + V1[0] = XMVectorSwizzle(MT.r[3]); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorSwizzle(MT.r[1]); + V0[2] = XMVectorPermute(MT.r[2], MT.r[0]); + V1[2] = XMVectorPermute(MT.r[3], MT.r[1]); + + XMVECTOR D0 = XMVectorMultiply(V0[0], V1[0]); + XMVECTOR D1 = XMVectorMultiply(V0[1], V1[1]); + XMVECTOR D2 = XMVectorMultiply(V0[2], V1[2]); + + V0[0] = XMVectorSwizzle(MT.r[2]); + V1[0] = XMVectorSwizzle(MT.r[3]); + 
V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorSwizzle(MT.r[1]); + V0[2] = XMVectorPermute(MT.r[2], MT.r[0]); + V1[2] = XMVectorPermute(MT.r[3], MT.r[1]); + + D0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], D0); + D1 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], D1); + D2 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], D2); + + V0[0] = XMVectorSwizzle(MT.r[1]); + V1[0] = XMVectorPermute(D0, D2); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorPermute(D0, D2); + V0[2] = XMVectorSwizzle(MT.r[3]); + V1[2] = XMVectorPermute(D1, D2); + V0[3] = XMVectorSwizzle(MT.r[2]); + V1[3] = XMVectorPermute(D1, D2); + + XMVECTOR C0 = XMVectorMultiply(V0[0], V1[0]); + XMVECTOR C2 = XMVectorMultiply(V0[1], V1[1]); + XMVECTOR C4 = XMVectorMultiply(V0[2], V1[2]); + XMVECTOR C6 = XMVectorMultiply(V0[3], V1[3]); + + V0[0] = XMVectorSwizzle(MT.r[1]); + V1[0] = XMVectorPermute(D0, D2); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorPermute(D0, D2); + V0[2] = XMVectorSwizzle(MT.r[3]); + V1[2] = XMVectorPermute(D1, D2); + V0[3] = XMVectorSwizzle(MT.r[2]); + V1[3] = XMVectorPermute(D1, D2); + + C0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0); + C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2); + C4 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4); + C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6); + + V0[0] = XMVectorSwizzle(MT.r[1]); + V1[0] = XMVectorPermute(D0, D2); + V0[1] = XMVectorSwizzle(MT.r[0]); + V1[1] = XMVectorPermute(D0, D2); + V0[2] = XMVectorSwizzle(MT.r[3]); + V1[2] = XMVectorPermute(D1, D2); + V0[3] = XMVectorSwizzle(MT.r[2]); + V1[3] = XMVectorPermute(D1, D2); + + XMVECTOR C1 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0); + C0 = XMVectorMultiplyAdd(V0[0], V1[0], C0); + XMVECTOR C3 = XMVectorMultiplyAdd(V0[1], V1[1], C2); + C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2); + XMVECTOR C5 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4); + C4 = XMVectorMultiplyAdd(V0[2], V1[2], C4); + XMVECTOR C7 = XMVectorMultiplyAdd(V0[3], V1[3], C6); + C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6); + + XMMATRIX R; + R.r[0] = XMVectorSelect(C0, C1, g_XMSelect0101.v); + R.r[1] = XMVectorSelect(C2, C3, g_XMSelect0101.v); + R.r[2] = XMVectorSelect(C4, C5, g_XMSelect0101.v); + R.r[3] = XMVectorSelect(C6, C7, g_XMSelect0101.v); + + XMVECTOR Determinant = XMVector4Dot(R.r[0], MT.r[0]); + + if (pDeterminant != nullptr) + *pDeterminant = Determinant; + + XMVECTOR Reciprocal = XMVectorReciprocal(Determinant); + + XMMATRIX Result; + Result.r[0] = XMVectorMultiply(R.r[0], Reciprocal); + Result.r[1] = XMVectorMultiply(R.r[1], Reciprocal); + Result.r[2] = XMVectorMultiply(R.r[2], Reciprocal); + Result.r[3] = XMVectorMultiply(R.r[3], Reciprocal); + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX MT = XMMatrixTranspose(M); + XMVECTOR V00 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(1,1,0,0)); + XMVECTOR V10 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(3,2,3,2)); + XMVECTOR V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(1,1,0,0)); + XMVECTOR V11 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(3,2,3,2)); + XMVECTOR V02 = _mm_shuffle_ps(MT.r[2], MT.r[0],_MM_SHUFFLE(2,0,2,0)); + XMVECTOR V12 = _mm_shuffle_ps(MT.r[3], MT.r[1],_MM_SHUFFLE(3,1,3,1)); + + XMVECTOR D0 = _mm_mul_ps(V00,V10); + XMVECTOR D1 = _mm_mul_ps(V01,V11); + XMVECTOR D2 = _mm_mul_ps(V02,V12); + + V00 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(3,2,3,2)); + V10 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(1,1,0,0)); + V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(3,2,3,2)); + V11 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(1,1,0,0)); + V02 = 
_mm_shuffle_ps(MT.r[2],MT.r[0],_MM_SHUFFLE(3,1,3,1)); + V12 = _mm_shuffle_ps(MT.r[3],MT.r[1],_MM_SHUFFLE(2,0,2,0)); + + V00 = _mm_mul_ps(V00,V10); + V01 = _mm_mul_ps(V01,V11); + V02 = _mm_mul_ps(V02,V12); + D0 = _mm_sub_ps(D0,V00); + D1 = _mm_sub_ps(D1,V01); + D2 = _mm_sub_ps(D2,V02); + // V11 = D0Y,D0W,D2Y,D2Y + V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,1,3,1)); + V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(1,0,2,1)); + V10 = _mm_shuffle_ps(V11,D0,_MM_SHUFFLE(0,3,0,2)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(0,1,0,2)); + V11 = _mm_shuffle_ps(V11,D0,_MM_SHUFFLE(2,1,2,1)); + // V13 = D1Y,D1W,D2W,D2W + XMVECTOR V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,3,3,1)); + V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(1,0,2,1)); + V12 = _mm_shuffle_ps(V13,D1,_MM_SHUFFLE(0,3,0,2)); + XMVECTOR V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(0,1,0,2)); + V13 = _mm_shuffle_ps(V13,D1,_MM_SHUFFLE(2,1,2,1)); + + XMVECTOR C0 = _mm_mul_ps(V00,V10); + XMVECTOR C2 = _mm_mul_ps(V01,V11); + XMVECTOR C4 = _mm_mul_ps(V02,V12); + XMVECTOR C6 = _mm_mul_ps(V03,V13); + + // V11 = D0X,D0Y,D2X,D2X + V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(0,0,1,0)); + V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(2,1,3,2)); + V10 = _mm_shuffle_ps(D0,V11,_MM_SHUFFLE(2,1,0,3)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(1,3,2,3)); + V11 = _mm_shuffle_ps(D0,V11,_MM_SHUFFLE(0,2,1,2)); + // V13 = D1X,D1Y,D2Z,D2Z + V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(2,2,1,0)); + V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(2,1,3,2)); + V12 = _mm_shuffle_ps(D1,V13,_MM_SHUFFLE(2,1,0,3)); + V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(1,3,2,3)); + V13 = _mm_shuffle_ps(D1,V13,_MM_SHUFFLE(0,2,1,2)); + + V00 = _mm_mul_ps(V00,V10); + V01 = _mm_mul_ps(V01,V11); + V02 = _mm_mul_ps(V02,V12); + V03 = _mm_mul_ps(V03,V13); + C0 = _mm_sub_ps(C0,V00); + C2 = _mm_sub_ps(C2,V01); + C4 = _mm_sub_ps(C4,V02); + C6 = _mm_sub_ps(C6,V03); + + V00 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(0,3,0,3)); + // V10 = D0Z,D0Z,D2X,D2Y + V10 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,0,2,2)); + V10 = XM_PERMUTE_PS(V10,_MM_SHUFFLE(0,2,3,0)); + V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(2,0,3,1)); + // V11 = D0X,D0W,D2X,D2Y + V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,0,3,0)); + V11 = XM_PERMUTE_PS(V11,_MM_SHUFFLE(2,1,0,3)); + V02 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(0,3,0,3)); + // V12 = D1Z,D1Z,D2Z,D2W + V12 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,2,2,2)); + V12 = XM_PERMUTE_PS(V12,_MM_SHUFFLE(0,2,3,0)); + V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(2,0,3,1)); + // V13 = D1X,D1W,D2Z,D2W + V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,2,3,0)); + V13 = XM_PERMUTE_PS(V13,_MM_SHUFFLE(2,1,0,3)); + + V00 = _mm_mul_ps(V00,V10); + V01 = _mm_mul_ps(V01,V11); + V02 = _mm_mul_ps(V02,V12); + V03 = _mm_mul_ps(V03,V13); + XMVECTOR C1 = _mm_sub_ps(C0,V00); + C0 = _mm_add_ps(C0,V00); + XMVECTOR C3 = _mm_add_ps(C2,V01); + C2 = _mm_sub_ps(C2,V01); + XMVECTOR C5 = _mm_sub_ps(C4,V02); + C4 = _mm_add_ps(C4,V02); + XMVECTOR C7 = _mm_add_ps(C6,V03); + C6 = _mm_sub_ps(C6,V03); + + C0 = _mm_shuffle_ps(C0,C1,_MM_SHUFFLE(3,1,2,0)); + C2 = _mm_shuffle_ps(C2,C3,_MM_SHUFFLE(3,1,2,0)); + C4 = _mm_shuffle_ps(C4,C5,_MM_SHUFFLE(3,1,2,0)); + C6 = _mm_shuffle_ps(C6,C7,_MM_SHUFFLE(3,1,2,0)); + C0 = XM_PERMUTE_PS(C0,_MM_SHUFFLE(3,1,2,0)); + C2 = XM_PERMUTE_PS(C2,_MM_SHUFFLE(3,1,2,0)); + C4 = XM_PERMUTE_PS(C4,_MM_SHUFFLE(3,1,2,0)); + C6 = XM_PERMUTE_PS(C6,_MM_SHUFFLE(3,1,2,0)); + // Get the determinate + XMVECTOR vTemp = XMVector4Dot(C0,MT.r[0]); + if (pDeterminant != nullptr) + *pDeterminant = vTemp; + vTemp = _mm_div_ps(g_XMOne,vTemp); + XMMATRIX mResult; + mResult.r[0] = 
_mm_mul_ps(C0,vTemp); + mResult.r[1] = _mm_mul_ps(C2,vTemp); + mResult.r[2] = _mm_mul_ps(C4,vTemp); + mResult.r[3] = _mm_mul_ps(C6,vTemp); + return mResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMMatrixDeterminant +( + FXMMATRIX M +) +{ + static const XMVECTORF32 Sign = {1.0f, -1.0f, 1.0f, -1.0f}; + + XMVECTOR V0 = XMVectorSwizzle(M.r[2]); + XMVECTOR V1 = XMVectorSwizzle(M.r[3]); + XMVECTOR V2 = XMVectorSwizzle(M.r[2]); + XMVECTOR V3 = XMVectorSwizzle(M.r[3]); + XMVECTOR V4 = XMVectorSwizzle(M.r[2]); + XMVECTOR V5 = XMVectorSwizzle(M.r[3]); + + XMVECTOR P0 = XMVectorMultiply(V0, V1); + XMVECTOR P1 = XMVectorMultiply(V2, V3); + XMVECTOR P2 = XMVectorMultiply(V4, V5); + + V0 = XMVectorSwizzle(M.r[2]); + V1 = XMVectorSwizzle(M.r[3]); + V2 = XMVectorSwizzle(M.r[2]); + V3 = XMVectorSwizzle(M.r[3]); + V4 = XMVectorSwizzle(M.r[2]); + V5 = XMVectorSwizzle(M.r[3]); + + P0 = XMVectorNegativeMultiplySubtract(V0, V1, P0); + P1 = XMVectorNegativeMultiplySubtract(V2, V3, P1); + P2 = XMVectorNegativeMultiplySubtract(V4, V5, P2); + + V0 = XMVectorSwizzle(M.r[1]); + V1 = XMVectorSwizzle(M.r[1]); + V2 = XMVectorSwizzle(M.r[1]); + + XMVECTOR S = XMVectorMultiply(M.r[0], Sign.v); + XMVECTOR R = XMVectorMultiply(V0, P0); + R = XMVectorNegativeMultiplySubtract(V1, P1, R); + R = XMVectorMultiplyAdd(V2, P2, R); + + return XMVector4Dot(S, R); +} + +#define XM3RANKDECOMPOSE(a, b, c, x, y, z) \ + if((x) < (y)) \ + { \ + if((y) < (z)) \ + { \ + (a) = 2; \ + (b) = 1; \ + (c) = 0; \ + } \ + else \ + { \ + (a) = 1; \ + \ + if((x) < (z)) \ + { \ + (b) = 2; \ + (c) = 0; \ + } \ + else \ + { \ + (b) = 0; \ + (c) = 2; \ + } \ + } \ + } \ + else \ + { \ + if((x) < (z)) \ + { \ + (a) = 2; \ + (b) = 0; \ + (c) = 1; \ + } \ + else \ + { \ + (a) = 0; \ + \ + if((y) < (z)) \ + { \ + (b) = 2; \ + (c) = 1; \ + } \ + else \ + { \ + (b) = 1; \ + (c) = 2; \ + } \ + } \ + } + +#define XM3_DECOMP_EPSILON 0.0001f + +_Use_decl_annotations_ +inline bool XM_CALLCONV XMMatrixDecompose +( + XMVECTOR *outScale, + XMVECTOR *outRotQuat, + XMVECTOR *outTrans, + FXMMATRIX M +) +{ + static const XMVECTOR *pvCanonicalBasis[3] = { + &g_XMIdentityR0.v, + &g_XMIdentityR1.v, + &g_XMIdentityR2.v + }; + + assert( outScale != nullptr ); + assert( outRotQuat != nullptr ); + assert( outTrans != nullptr ); + + // Get the translation + outTrans[0] = M.r[3]; + + XMVECTOR *ppvBasis[3]; + XMMATRIX matTemp; + ppvBasis[0] = &matTemp.r[0]; + ppvBasis[1] = &matTemp.r[1]; + ppvBasis[2] = &matTemp.r[2]; + + matTemp.r[0] = M.r[0]; + matTemp.r[1] = M.r[1]; + matTemp.r[2] = M.r[2]; + matTemp.r[3] = g_XMIdentityR3.v; + + float *pfScales = (float *)outScale; + + size_t a, b, c; + XMVectorGetXPtr(&pfScales[0],XMVector3Length(ppvBasis[0][0])); + XMVectorGetXPtr(&pfScales[1],XMVector3Length(ppvBasis[1][0])); + XMVectorGetXPtr(&pfScales[2],XMVector3Length(ppvBasis[2][0])); + pfScales[3] = 0.f; + + XM3RANKDECOMPOSE(a, b, c, pfScales[0], pfScales[1], pfScales[2]) + + if(pfScales[a] < XM3_DECOMP_EPSILON) + { + ppvBasis[a][0] = pvCanonicalBasis[a][0]; + } + ppvBasis[a][0] = XMVector3Normalize(ppvBasis[a][0]); + + if(pfScales[b] < XM3_DECOMP_EPSILON) + { + size_t aa, bb, cc; + float fAbsX, fAbsY, fAbsZ; + + fAbsX = fabsf(XMVectorGetX(ppvBasis[a][0])); + fAbsY = fabsf(XMVectorGetY(ppvBasis[a][0])); + fAbsZ = fabsf(XMVectorGetZ(ppvBasis[a][0])); + + XM3RANKDECOMPOSE(aa, bb, cc, fAbsX, fAbsY, fAbsZ) + + ppvBasis[b][0] = XMVector3Cross(ppvBasis[a][0],pvCanonicalBasis[cc][0]); + } + + ppvBasis[b][0] 
= XMVector3Normalize(ppvBasis[b][0]); + + if(pfScales[c] < XM3_DECOMP_EPSILON) + { + ppvBasis[c][0] = XMVector3Cross(ppvBasis[a][0],ppvBasis[b][0]); + } + + ppvBasis[c][0] = XMVector3Normalize(ppvBasis[c][0]); + + float fDet = XMVectorGetX(XMMatrixDeterminant(matTemp)); + + // use Kramer's rule to check for handedness of coordinate system + if(fDet < 0.0f) + { + // switch coordinate system by negating the scale and inverting the basis vector on the x-axis + pfScales[a] = -pfScales[a]; + ppvBasis[a][0] = XMVectorNegate(ppvBasis[a][0]); + + fDet = -fDet; + } + + fDet -= 1.0f; + fDet *= fDet; + + if(XM3_DECOMP_EPSILON < fDet) + { + // Non-SRT matrix encountered + return false; + } + + // generate the quaternion from the matrix + outRotQuat[0] = XMQuaternionRotationMatrix(matTemp); + return true; +} + +#undef XM3_DECOMP_EPSILON +#undef XM3RANKDECOMPOSE + +//------------------------------------------------------------------------------ +// Transformation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixIdentity() +{ + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = g_XMIdentityR3.v; + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixSet +( + float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33 +) +{ + XMMATRIX M; +#if defined(_XM_NO_INTRINSICS_) + M.m[0][0] = m00; M.m[0][1] = m01; M.m[0][2] = m02; M.m[0][3] = m03; + M.m[1][0] = m10; M.m[1][1] = m11; M.m[1][2] = m12; M.m[1][3] = m13; + M.m[2][0] = m20; M.m[2][1] = m21; M.m[2][2] = m22; M.m[2][3] = m23; + M.m[3][0] = m30; M.m[3][1] = m31; M.m[3][2] = m32; M.m[3][3] = m33; +#else + M.r[0] = XMVectorSet(m00, m01, m02, m03); + M.r[1] = XMVectorSet(m10, m11, m12, m13); + M.r[2] = XMVectorSet(m20, m21, m22, m23); + M.r[3] = XMVectorSet(m30, m31, m32, m33); +#endif + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTranslation +( + float OffsetX, + float OffsetY, + float OffsetZ +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = OffsetX; + M.m[3][1] = OffsetY; + M.m[3][2] = OffsetZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = XMVectorSet(OffsetX, OffsetY, OffsetZ, 1.f ); + return M; +#endif +} + + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTranslationFromVector +( + FXMVECTOR Offset +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = Offset.vector4_f32[0]; + 
M.m[3][1] = Offset.vector4_f32[1]; + M.m[3][2] = Offset.vector4_f32[2]; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = g_XMIdentityR0.v; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = XMVectorSelect( g_XMIdentityR3.v, Offset, g_XMSelect1110.v ); + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixScaling +( + float ScaleX, + float ScaleY, + float ScaleZ +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = ScaleX; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ScaleY; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = ScaleZ; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( ScaleX, Zero, 0 ); + M.r[1] = vsetq_lane_f32( ScaleY, Zero, 1 ); + M.r[2] = vsetq_lane_f32( ScaleZ, Zero, 2 ); + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_set_ps( 0, 0, 0, ScaleX ); + M.r[1] = _mm_set_ps( 0, 0, ScaleY, 0 ); + M.r[2] = _mm_set_ps( 0, ScaleZ, 0, 0 ); + M.r[3] = g_XMIdentityR3.v; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixScalingFromVector +( + FXMVECTOR Scale +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.m[0][0] = Scale.vector4_f32[0]; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Scale.vector4_f32[1]; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = Scale.vector4_f32[2]; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = vandq_u32(Scale,g_XMMaskX); + M.r[1] = vandq_u32(Scale,g_XMMaskY); + M.r[2] = vandq_u32(Scale,g_XMMaskZ); + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_and_ps(Scale,g_XMMaskX); + M.r[1] = _mm_and_ps(Scale,g_XMMaskY); + M.r[2] = _mm_and_ps(Scale,g_XMMaskZ); + M.r[3] = g_XMIdentityR3.v; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationX +( + float Angle +) +{ +#if defined(_XM_NO_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMMATRIX M; + M.m[0][0] = 1.0f; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = fCosAngle; + M.m[1][2] = fSinAngle; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = -fSinAngle; + M.m[2][2] = fCosAngle; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + const XMVECTOR Zero = vdupq_n_f32(0); + + XMVECTOR T1 = vsetq_lane_f32( fCosAngle, Zero, 1 ); + T1 = vsetq_lane_f32( fSinAngle, T1, 2 ); + + XMVECTOR T2 = vsetq_lane_f32( -fSinAngle, Zero, 1 ); + T2 = vsetq_lane_f32( fCosAngle, T2, 2 ); + + XMMATRIX M; + M.r[0] = 
g_XMIdentityR0.v; + M.r[1] = T1; + M.r[2] = T2; + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinAngle; + float CosAngle; + XMScalarSinCos(&SinAngle, &CosAngle, Angle); + + XMVECTOR vSin = _mm_set_ss(SinAngle); + XMVECTOR vCos = _mm_set_ss(CosAngle); + // x = 0,y = cos,z = sin, w = 0 + vCos = _mm_shuffle_ps(vCos,vSin,_MM_SHUFFLE(3,0,0,3)); + XMMATRIX M; + M.r[0] = g_XMIdentityR0; + M.r[1] = vCos; + // x = 0,y = sin,z = cos, w = 0 + vCos = XM_PERMUTE_PS(vCos,_MM_SHUFFLE(3,1,2,0)); + // x = 0,y = -sin,z = cos, w = 0 + vCos = _mm_mul_ps(vCos,g_XMNegateY); + M.r[2] = vCos; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationY +( + float Angle +) +{ +#if defined(_XM_NO_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMMATRIX M; + M.m[0][0] = fCosAngle; + M.m[0][1] = 0.0f; + M.m[0][2] = -fSinAngle; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 1.0f; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = fSinAngle; + M.m[2][1] = 0.0f; + M.m[2][2] = fCosAngle; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + const XMVECTOR Zero = vdupq_n_f32(0); + + XMVECTOR T0 = vsetq_lane_f32( fCosAngle, Zero, 0 ); + T0 = vsetq_lane_f32( -fSinAngle, T0, 2 ); + + XMVECTOR T2 = vsetq_lane_f32( fSinAngle, Zero, 0 ); + T2 = vsetq_lane_f32( fCosAngle, T2, 2 ); + + XMMATRIX M; + M.r[0] = T0; + M.r[1] = g_XMIdentityR1.v; + M.r[2] = T2; + M.r[3] = g_XMIdentityR3.v; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinAngle; + float CosAngle; + XMScalarSinCos(&SinAngle, &CosAngle, Angle); + + XMVECTOR vSin = _mm_set_ss(SinAngle); + XMVECTOR vCos = _mm_set_ss(CosAngle); + // x = sin,y = 0,z = cos, w = 0 + vSin = _mm_shuffle_ps(vSin,vCos,_MM_SHUFFLE(3,0,3,0)); + XMMATRIX M; + M.r[2] = vSin; + M.r[1] = g_XMIdentityR1; + // x = cos,y = 0,z = sin, w = 0 + vSin = XM_PERMUTE_PS(vSin,_MM_SHUFFLE(3,0,1,2)); + // x = cos,y = 0,z = -sin, w = 0 + vSin = _mm_mul_ps(vSin,g_XMNegateZ); + M.r[0] = vSin; + M.r[3] = g_XMIdentityR3; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixRotationZ +( + float Angle +) +{ +#if defined(_XM_NO_INTRINSICS_) + + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + XMMATRIX M; + M.m[0][0] = fCosAngle; + M.m[0][1] = fSinAngle; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = -fSinAngle; + M.m[1][1] = fCosAngle; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = 1.0f; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = 0.0f; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fSinAngle; + float fCosAngle; + XMScalarSinCos(&fSinAngle, &fCosAngle, Angle); + + const XMVECTOR Zero = vdupq_n_f32(0); + + XMVECTOR T0 = vsetq_lane_f32( fCosAngle, Zero, 0 ); + T0 = vsetq_lane_f32( fSinAngle, T0, 1 ); + + XMVECTOR T1 = vsetq_lane_f32( -fSinAngle, Zero, 0 ); + T1 = vsetq_lane_f32( fCosAngle, T1, 1 ); + + XMMATRIX M; + M.r[0] = T0; + M.r[1] = T1; + M.r[2] = g_XMIdentityR2.v; + M.r[3] = g_XMIdentityR3.v; + return M; +#elif 
defined(_XM_SSE_INTRINSICS_)
+    float SinAngle;
+    float CosAngle;
+    XMScalarSinCos(&SinAngle, &CosAngle, Angle);
+
+    XMVECTOR vSin = _mm_set_ss(SinAngle);
+    XMVECTOR vCos = _mm_set_ss(CosAngle);
+    // x = cos,y = sin,z = 0, w = 0
+    vCos = _mm_unpacklo_ps(vCos,vSin);
+    XMMATRIX M;
+    M.r[0] = vCos;
+    // x = sin,y = cos,z = 0, w = 0
+    vCos = XM_PERMUTE_PS(vCos,_MM_SHUFFLE(3,2,0,1));
+    // x = -sin,y = cos,z = 0, w = 0
+    vCos = _mm_mul_ps(vCos,g_XMNegateX);
+    M.r[1] = vCos;
+    M.r[2] = g_XMIdentityR2;
+    M.r[3] = g_XMIdentityR3;
+    return M;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw
+(
+    float Pitch,
+    float Yaw,
+    float Roll
+)
+{
+    XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f);
+    return XMMatrixRotationRollPitchYawFromVector(Angles);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYawFromVector
+(
+    FXMVECTOR Angles // <Pitch, Yaw, Roll, undefined>
+)
+{
+    XMVECTOR Q = XMQuaternionRotationRollPitchYawFromVector(Angles);
+    return XMMatrixRotationQuaternion(Q);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixRotationNormal
+(
+    FXMVECTOR NormalAxis,
+    float Angle
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    float fSinAngle;
+    float fCosAngle;
+    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
+
+    XMVECTOR A = XMVectorSet(fSinAngle, fCosAngle, 1.0f - fCosAngle, 0.0f);
+
+    XMVECTOR C2 = XMVectorSplatZ(A);
+    XMVECTOR C1 = XMVectorSplatY(A);
+    XMVECTOR C0 = XMVectorSplatX(A);
+
+    XMVECTOR N0 = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_W>(NormalAxis);
+    XMVECTOR N1 = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_W>(NormalAxis);
+
+    XMVECTOR V0 = XMVectorMultiply(C2, N0);
+    V0 = XMVectorMultiply(V0, N1);
+
+    XMVECTOR R0 = XMVectorMultiply(C2, NormalAxis);
+    R0 = XMVectorMultiplyAdd(R0, NormalAxis, C1);
+
+    XMVECTOR R1 = XMVectorMultiplyAdd(C0, NormalAxis, V0);
+    XMVECTOR R2 = XMVectorNegativeMultiplySubtract(C0, NormalAxis, V0);
+
+    V0 = XMVectorSelect(A, R0, g_XMSelect1110.v);
+    XMVECTOR V1 = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_1Y, XM_PERMUTE_1Z, XM_PERMUTE_0X>(R1, R2);
+    XMVECTOR V2 = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_0Y, XM_PERMUTE_1X>(R1, R2);
+
+    XMMATRIX M;
+    M.r[0] = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0W>(V0, V1);
+    M.r[1] = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_1W, XM_PERMUTE_0W>(V0, V1);
+    M.r[2] = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_0W>(V0, V2);
+    M.r[3] = g_XMIdentityR3.v;
+    return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+    float fSinAngle;
+    float fCosAngle;
+    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
+
+    XMVECTOR C2 = _mm_set_ps1(1.0f - fCosAngle);
+    XMVECTOR C1 = _mm_set_ps1(fCosAngle);
+    XMVECTOR C0 = _mm_set_ps1(fSinAngle);
+
+    XMVECTOR N0 = XM_PERMUTE_PS(NormalAxis,_MM_SHUFFLE(3,0,2,1));
+    XMVECTOR N1 = XM_PERMUTE_PS(NormalAxis,_MM_SHUFFLE(3,1,0,2));
+
+    XMVECTOR V0 = _mm_mul_ps(C2, N0);
+    V0 = _mm_mul_ps(V0, N1);
+
+    XMVECTOR R0 = _mm_mul_ps(C2, NormalAxis);
+    R0 = _mm_mul_ps(R0, NormalAxis);
+    R0 = _mm_add_ps(R0, C1);
+
+    XMVECTOR R1 = _mm_mul_ps(C0, NormalAxis);
+    R1 = _mm_add_ps(R1, V0);
+    XMVECTOR R2 = _mm_mul_ps(C0, NormalAxis);
+    R2 = _mm_sub_ps(V0,R2);
+
+    V0 = _mm_and_ps(R0,g_XMMask3);
+    XMVECTOR V1 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(2,1,2,0));
+    V1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(0,3,2,1));
+    XMVECTOR V2 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(0,0,1,1));
+    V2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,0,2,0));
+
+    R2 = _mm_shuffle_ps(V0,V1,_MM_SHUFFLE(1,0,3,0));
+    R2 = XM_PERMUTE_PS(R2,_MM_SHUFFLE(1,3,2,0));
+
+    XMMATRIX M;
+    M.r[0] = R2;
+
+    R2 = _mm_shuffle_ps(V0,V1,_MM_SHUFFLE(3,2,3,1));
+    R2 = XM_PERMUTE_PS(R2,_MM_SHUFFLE(1,3,0,2));
+    M.r[1] = R2;
+
+    V2 = _mm_shuffle_ps(V2,V0,_MM_SHUFFLE(3,2,1,0));
+    M.r[2] = V2;
+    M.r[3] = g_XMIdentityR3.v;
+    return M;
+#endif
+}
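+
+// XMMatrixRotationNormal is the Rodrigues rotation formula,
+// M = (cos a)I + (1 - cos a) n n^T + (sin a)[n]x, laid out for DirectXMath's
+// row-vector (v' = v*M) convention; the axis must already be unit length,
+// which is what distinguishes it from XMMatrixRotationAxis below.
+// Usage sketch (the axis and angle are hypothetical example values):
+//
+//     XMVECTOR n = XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f); // unit +Y axis
+//     XMMATRIX r = XMMatrixRotationNormal(n, XM_PIDIV2); // quarter turn about n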
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixRotationAxis
+(
+    FXMVECTOR Axis,
+    float Angle
+)
+{
+    assert(!XMVector3Equal(Axis, XMVectorZero()));
+    assert(!XMVector3IsInfinite(Axis));
+
+    XMVECTOR Normal = XMVector3Normalize(Axis);
+    return XMMatrixRotationNormal(Normal, Angle);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixRotationQuaternion
+(
+    FXMVECTOR Quaternion
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    static const XMVECTORF32 Constant1110 = {1.0f, 1.0f, 1.0f, 0.0f};
+
+    XMVECTOR Q0 = XMVectorAdd(Quaternion, Quaternion);
+    XMVECTOR Q1 = XMVectorMultiply(Quaternion, Q0);
+
+    XMVECTOR V0 = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_1W>(Q1, Constant1110.v);
+    XMVECTOR V1 = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_1W>(Q1, Constant1110.v);
+    XMVECTOR R0 = XMVectorSubtract(Constant1110, V0);
+    R0 = XMVectorSubtract(R0, V1);
+
+    V0 = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_W>(Quaternion);
+    V1 = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_W>(Q0);
+    V0 = XMVectorMultiply(V0, V1);
+
+    V1 = XMVectorSplatW(Quaternion);
+    XMVECTOR V2 = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_W>(Q0);
+    V1 = XMVectorMultiply(V1, V2);
+
+    XMVECTOR R1 = XMVectorAdd(V0, V1);
+    XMVECTOR R2 = XMVectorSubtract(V0, V1);
+
+    V0 = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z>(R1, R2);
+    V1 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1Z, XM_PERMUTE_0X, XM_PERMUTE_1Z>(R1, R2);
+
+    XMMATRIX M;
+    M.r[0] = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0W>(R0, V0);
+    M.r[1] = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_1W, XM_PERMUTE_0W>(R0, V0);
+    M.r[2] = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_0W>(R0, V1);
+    M.r[3] = g_XMIdentityR3.v;
+    return M;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 Constant1110 = {1.0f, 1.0f, 1.0f, 0.0f};
+
+    XMVECTOR Q0 = _mm_add_ps(Quaternion,Quaternion);
+    XMVECTOR Q1 = _mm_mul_ps(Quaternion,Q0);
+
+    XMVECTOR V0 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(3,0,0,1));
+    V0 = _mm_and_ps(V0,g_XMMask3);
+    XMVECTOR V1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(3,1,2,2));
+    V1 = _mm_and_ps(V1,g_XMMask3);
+    XMVECTOR R0 = _mm_sub_ps(Constant1110,V0);
+    R0 = _mm_sub_ps(R0, V1);
+
+    V0 = XM_PERMUTE_PS(Quaternion,_MM_SHUFFLE(3,1,0,0));
+    V1 = XM_PERMUTE_PS(Q0,_MM_SHUFFLE(3,2,1,2));
+    V0 = _mm_mul_ps(V0, V1);
+
+    V1 = XM_PERMUTE_PS(Quaternion,_MM_SHUFFLE(3,3,3,3));
+    XMVECTOR V2 = XM_PERMUTE_PS(Q0,_MM_SHUFFLE(3,0,2,1));
+    V1 = _mm_mul_ps(V1, V2);
+
+    XMVECTOR R1 = _mm_add_ps(V0, V1);
+    XMVECTOR R2 = _mm_sub_ps(V0, V1);
+
+    V0 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(1,0,2,1));
+    V0 = XM_PERMUTE_PS(V0,_MM_SHUFFLE(1,3,2,0));
+    V1 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(2,2,0,0));
+    V1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(2,0,2,0));
+
+    Q1 = _mm_shuffle_ps(R0,V0,_MM_SHUFFLE(1,0,3,0));
+    Q1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(1,3,2,0));
+
+    XMMATRIX M;
+    M.r[0] = Q1;
+
+    Q1 = _mm_shuffle_ps(R0,V0,_MM_SHUFFLE(3,2,3,1));
+    Q1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(1,3,0,2));
+    M.r[1] = Q1;
+
+    Q1 = _mm_shuffle_ps(V1,R0,_MM_SHUFFLE(3,2,1,0));
+    M.r[2] = Q1;
+    M.r[3] = g_XMIdentityR3;
+    return M;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMMATRIX XM_CALLCONV XMMatrixTransformation2D
+(
+    FXMVECTOR ScalingOrigin,
+    float ScalingOrientation,
+    FXMVECTOR Scaling,
+    FXMVECTOR RotationOrigin,
+    float Rotation,
+    GXMVECTOR Translation
+)
+{
+    // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation *
+    //         MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation;
+
+    XMVECTOR VScalingOrigin = 
XMVectorSelect(g_XMSelect1100.v, ScalingOrigin, g_XMSelect1100.v); + XMVECTOR NegScalingOrigin = XMVectorNegate(VScalingOrigin); + + XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); + XMMATRIX MScalingOrientation = XMMatrixRotationZ(ScalingOrientation); + XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); + XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v); + XMMATRIX MScaling = XMMatrixScalingFromVector(VScaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v); + XMMATRIX MRotation = XMMatrixRotationZ(Rotation); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation,g_XMSelect1100.v); + + XMMATRIX M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT); + M = XMMatrixMultiply(M, MScaling); + M = XMMatrixMultiply(M, MScalingOrientation); + M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixTransformation +( + FXMVECTOR ScalingOrigin, + FXMVECTOR ScalingOrientationQuaternion, + FXMVECTOR Scaling, + GXMVECTOR RotationOrigin, + HXMVECTOR RotationQuaternion, + HXMVECTOR Translation +) +{ + // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation * + // MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMVECTOR VScalingOrigin = XMVectorSelect(g_XMSelect1110.v, ScalingOrigin, g_XMSelect1110.v); + XMVECTOR NegScalingOrigin = XMVectorNegate(ScalingOrigin); + + XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin); + XMMATRIX MScalingOrientation = XMMatrixRotationQuaternion(ScalingOrientationQuaternion); + XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation); + XMMATRIX MScaling = XMMatrixScalingFromVector(Scaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin, g_XMSelect1110.v); + XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation, g_XMSelect1110.v); + + XMMATRIX M; + M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT); + M = XMMatrixMultiply(M, MScaling); + M = XMMatrixMultiply(M, MScalingOrientation); + M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin); + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixAffineTransformation2D +( + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + float Rotation, + FXMVECTOR Translation +) +{ + // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v); + XMMATRIX MScaling = XMMatrixScalingFromVector(VScaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v); + XMMATRIX MRotation = XMMatrixRotationZ(Rotation); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation,g_XMSelect1100.v); + + XMMATRIX M; + M = MScaling; + M.r[3] = 
XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixAffineTransformation +( + FXMVECTOR Scaling, + FXMVECTOR RotationOrigin, + FXMVECTOR RotationQuaternion, + GXMVECTOR Translation +) +{ + // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation; + + XMMATRIX MScaling = XMMatrixScalingFromVector(Scaling); + XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin,g_XMSelect1110.v); + XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion); + XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation,g_XMSelect1110.v); + + XMMATRIX M; + M = MScaling; + M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin); + M = XMMatrixMultiply(M, MRotation); + M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin); + M.r[3] = XMVectorAdd(M.r[3], VTranslation); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixReflect +( + FXMVECTOR ReflectionPlane +) +{ + assert(!XMVector3Equal(ReflectionPlane, XMVectorZero())); + assert(!XMPlaneIsInfinite(ReflectionPlane)); + + static const XMVECTORF32 NegativeTwo = {-2.0f, -2.0f, -2.0f, 0.0f}; + + XMVECTOR P = XMPlaneNormalize(ReflectionPlane); + XMVECTOR S = XMVectorMultiply(P, NegativeTwo); + + XMVECTOR A = XMVectorSplatX(P); + XMVECTOR B = XMVectorSplatY(P); + XMVECTOR C = XMVectorSplatZ(P); + XMVECTOR D = XMVectorSplatW(P); + + XMMATRIX M; + M.r[0] = XMVectorMultiplyAdd(A, S, g_XMIdentityR0.v); + M.r[1] = XMVectorMultiplyAdd(B, S, g_XMIdentityR1.v); + M.r[2] = XMVectorMultiplyAdd(C, S, g_XMIdentityR2.v); + M.r[3] = XMVectorMultiplyAdd(D, S, g_XMIdentityR3.v); + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixShadow +( + FXMVECTOR ShadowPlane, + FXMVECTOR LightPosition +) +{ + static const XMVECTORU32 Select0001 = {XM_SELECT_0, XM_SELECT_0, XM_SELECT_0, XM_SELECT_1}; + + assert(!XMVector3Equal(ShadowPlane, XMVectorZero())); + assert(!XMPlaneIsInfinite(ShadowPlane)); + + XMVECTOR P = XMPlaneNormalize(ShadowPlane); + XMVECTOR Dot = XMPlaneDot(P, LightPosition); + P = XMVectorNegate(P); + XMVECTOR D = XMVectorSplatW(P); + XMVECTOR C = XMVectorSplatZ(P); + XMVECTOR B = XMVectorSplatY(P); + XMVECTOR A = XMVectorSplatX(P); + Dot = XMVectorSelect(Select0001.v, Dot, Select0001.v); + + XMMATRIX M; + M.r[3] = XMVectorMultiplyAdd(D, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[2] = XMVectorMultiplyAdd(C, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[1] = XMVectorMultiplyAdd(B, LightPosition, Dot); + Dot = XMVectorRotateLeft(Dot, 1); + M.r[0] = XMVectorMultiplyAdd(A, LightPosition, Dot); + return M; +} + +//------------------------------------------------------------------------------ +// View and projection initialization operations +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookAtLH +( + FXMVECTOR EyePosition, + FXMVECTOR FocusPosition, + FXMVECTOR UpDirection +) +{ + XMVECTOR EyeDirection = XMVectorSubtract(FocusPosition, EyePosition); + return XMMatrixLookToLH(EyePosition, EyeDirection, UpDirection); +} + 
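+// The look-at helpers only differ in how the view direction is formed; both
+// reduce to XMMatrixLookToLH. Usage sketch (camera values below are
+// hypothetical examples):
+//
+//     XMVECTOR eye   = XMVectorSet(0.0f, 2.0f, -5.0f, 1.0f);
+//     XMVECTOR focus = XMVectorSet(0.0f, 0.0f,  0.0f, 1.0f);
+//     XMVECTOR up    = XMVectorSet(0.0f, 1.0f,  0.0f, 0.0f);
+//     XMMATRIX view  = XMMatrixLookAtLH(eye, focus, up);
+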
+//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookAtRH +( + FXMVECTOR EyePosition, + FXMVECTOR FocusPosition, + FXMVECTOR UpDirection +) +{ + XMVECTOR NegEyeDirection = XMVectorSubtract(EyePosition, FocusPosition); + return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookToLH +( + FXMVECTOR EyePosition, + FXMVECTOR EyeDirection, + FXMVECTOR UpDirection +) +{ + assert(!XMVector3Equal(EyeDirection, XMVectorZero())); + assert(!XMVector3IsInfinite(EyeDirection)); + assert(!XMVector3Equal(UpDirection, XMVectorZero())); + assert(!XMVector3IsInfinite(UpDirection)); + + XMVECTOR R2 = XMVector3Normalize(EyeDirection); + + XMVECTOR R0 = XMVector3Cross(UpDirection, R2); + R0 = XMVector3Normalize(R0); + + XMVECTOR R1 = XMVector3Cross(R2, R0); + + XMVECTOR NegEyePosition = XMVectorNegate(EyePosition); + + XMVECTOR D0 = XMVector3Dot(R0, NegEyePosition); + XMVECTOR D1 = XMVector3Dot(R1, NegEyePosition); + XMVECTOR D2 = XMVector3Dot(R2, NegEyePosition); + + XMMATRIX M; + M.r[0] = XMVectorSelect(D0, R0, g_XMSelect1110.v); + M.r[1] = XMVectorSelect(D1, R1, g_XMSelect1110.v); + M.r[2] = XMVectorSelect(D2, R2, g_XMSelect1110.v); + M.r[3] = g_XMIdentityR3.v; + + M = XMMatrixTranspose(M); + + return M; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixLookToRH +( + FXMVECTOR EyePosition, + FXMVECTOR EyeDirection, + FXMVECTOR UpDirection +) +{ + XMVECTOR NegEyeDirection = XMVectorNegate(EyeDirection); + return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection); +} + +//------------------------------------------------------------------------------ + +#pragma prefast(push) +#pragma prefast(disable:28931, "PREfast noise: Esp:1266") + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (FarZ - NearZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (FarZ - NearZ); + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( TwoNearZ / ViewWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( TwoNearZ / ViewHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, g_XMIdentityR3.v, 2 ); + M.r[3] = vsetq_lane_f32( -fRange * NearZ, Zero, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (FarZ - NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ / ViewWidth, + TwoNearZ / ViewHeight, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = 
_mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // TwoNearZ / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,1.0f + vTemp = _mm_setzero_ps(); + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0 + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); + M.r[3] = vTemp; + + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (NearZ - FarZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (NearZ - FarZ); + const XMVECTOR Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32( TwoNearZ / ViewWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( TwoNearZ / ViewHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, g_XMNegIdentityR3.v, 2 ); + M.r[3] = vsetq_lane_f32( fRange * NearZ, Zero, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ + NearZ; + float fRange = FarZ / (NearZ-FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ / ViewWidth, + TwoNearZ / ViewHeight, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // TwoNearZ / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,-1.0f + vValues = _mm_shuffle_ps(vValues,g_XMNegIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,-1.0f + vTemp = _mm_setzero_ps(); + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0 + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH +( + float FovAngleY, + float AspectRatio, + float NearZ, + float FarZ +) +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + float fRange = 
FarZ / (FarZ-NearZ); + + XMMATRIX M; + M.m[0][0] = Width; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Height; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float fRange = FarZ / (FarZ-NearZ); + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + const XMVECTOR Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32( Width, Zero, 0 ); + M.r[1] = vsetq_lane_f32( Height, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, g_XMIdentityR3.v, 2 ); + M.r[3] = vsetq_lane_f32( -fRange * NearZ, Zero, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float fRange = FarZ / (FarZ-NearZ); + // Note: This is recorded on the stack + float Height = CosFov / SinFov; + XMVECTOR rMem = { + Height / AspectRatio, + Height, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // CosFov / SinFov,0,0,0 + XMMATRIX M; + M.r[0] = vTemp; + // 0,Height / AspectRatio,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,1.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH +( + float FovAngleY, + float AspectRatio, + float NearZ, + float FarZ +) +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + float fRange = FarZ / (NearZ-FarZ); + + XMMATRIX M; + M.m[0][0] = Width; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Height; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + float fRange = FarZ / (NearZ-FarZ); + float Height = CosFov / SinFov; + float Width = Height / AspectRatio; + const XMVECTOR Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32( Width, Zero, 0 ); + M.r[1] = vsetq_lane_f32( Height, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, g_XMNegIdentityR3.v, 2 ); + M.r[3] = vsetq_lane_f32( fRange * NearZ, Zero, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinFov; + float CosFov; + 
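+    // Note: fRange = FarZ / (NearZ - FarZ) is negative in this right-handed
+    // form; combined with w' = -z it maps view-space z = -NearZ to depth 0 and
+    // z = -FarZ to depth 1, the usual [0,1] clip range.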
XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + float fRange = FarZ / (NearZ-FarZ); + // Note: This is recorded on the stack + float Height = CosFov / SinFov; + XMVECTOR rMem = { + Height / AspectRatio, + Height, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // CosFov / SinFov,0,0,0 + XMMATRIX M; + M.r[0] = vTemp; + // 0,Height / AspectRatio,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,-1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues,g_XMNegIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,-1.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); + M.r[2] = vTemp; + // 0,0,fRange * NearZ,0.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ-NearZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ * ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ * ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = -(ViewLeft + ViewRight) * ReciprocalWidth; + M.m[2][1] = -(ViewTop + ViewBottom) * ReciprocalHeight; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ-NearZ); + const XMVECTOR Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32( TwoNearZ * ReciprocalWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( TwoNearZ * ReciprocalHeight, Zero, 1 ); + M.r[2] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + 1.0f); + M.r[3] = vsetq_lane_f32( -fRange * NearZ, Zero, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ+NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ-NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ*ReciprocalWidth, + TwoNearZ*ReciprocalHeight, + -fRange * NearZ, + 0 + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // TwoNearZ*ReciprocalWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ*ReciprocalHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // 0,0,fRange,1.0f + M.r[2] = XMVectorSet( -(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop 
+ ViewBottom) * ReciprocalHeight, + fRange, + 1.0f ); + // 0,0,-fRange * NearZ,0.0f + vValues = _mm_and_ps(vValues,g_XMMaskZ); + M.r[3] = vValues; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) +{ + assert(NearZ > 0.f && FarZ > 0.f); + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ-FarZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ * ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ * ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = (ViewLeft + ViewRight) * ReciprocalWidth; + M.m[2][1] = (ViewTop + ViewBottom) * ReciprocalHeight; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ-FarZ); + const XMVECTOR Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32( TwoNearZ * ReciprocalWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( TwoNearZ * ReciprocalHeight, Zero, 1 ); + M.r[2] = XMVectorSet((ViewLeft + ViewRight) * ReciprocalWidth, + (ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + -1.0f); + M.r[3] = vsetq_lane_f32( fRange * NearZ, Zero, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ+NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ-FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ*ReciprocalWidth, + TwoNearZ*ReciprocalHeight, + fRange * NearZ, + 0 + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // TwoNearZ*ReciprocalWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ*ReciprocalHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // 0,0,fRange,1.0f + M.r[2] = XMVectorSet( (ViewLeft + ViewRight) * ReciprocalWidth, + (ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + -1.0f ); + // 0,0,-fRange * NearZ,0.0f + vValues = _mm_and_ps(vValues,g_XMMaskZ); + M.r[3] = vValues; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicLH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) +{ + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float fRange = 1.0f / (FarZ-NearZ); + + XMMATRIX M; + M.m[0][0] = 2.0f / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 2.0f / 
ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fRange = 1.0f / (FarZ-NearZ); + + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( 2.0f / ViewWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( 2.0f / ViewHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); + M.r[3] = vsetq_lane_f32( -fRange * NearZ, g_XMIdentityR3.v, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fRange = 1.0f / (FarZ-NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + 2.0f / ViewWidth, + 2.0f / ViewHeight, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // 2.0f / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,2.0f / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,0.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,0,0,0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,1.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,1,0,0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicRH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) +{ + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float fRange = 1.0f / (NearZ-FarZ); + + XMMATRIX M; + M.m[0][0] = 2.0f / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 2.0f / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fRange = 1.0f / (NearZ-FarZ); + + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( 2.0f / ViewWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( 2.0f / ViewHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); + M.r[3] = vsetq_lane_f32( fRange * NearZ, g_XMIdentityR3.v, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fRange = 1.0f / (NearZ-FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + 2.0f / ViewWidth, + 2.0f / ViewHeight, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // 2.0f / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,2.0f / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,0.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,0,0,0)); + M.r[2] = vTemp; + // 0,0,fRange * NearZ,1.0f + vTemp = 
_mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,1,0,0)); + M.r[3] = vTemp; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) +{ + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ-NearZ); + + XMMATRIX M; + M.m[0][0] = ReciprocalWidth + ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ReciprocalHeight + ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = -(ViewLeft + ViewRight) * ReciprocalWidth; + M.m[3][1] = -(ViewTop + ViewBottom) * ReciprocalHeight; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ-NearZ); + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( ReciprocalWidth + ReciprocalWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( ReciprocalHeight + ReciprocalHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + -fRange * NearZ, + 1.0f); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ-NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + fReciprocalWidth, + fReciprocalHeight, + fRange, + 1.0f + }; + XMVECTOR rMem2 = { + -(ViewLeft + ViewRight), + -(ViewTop + ViewBottom), + -NearZ, + 1.0f + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // fReciprocalWidth*2,0,0,0 + vTemp = _mm_add_ss(vTemp,vTemp); + M.r[0] = vTemp; + // 0,fReciprocalHeight*2,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + vTemp = _mm_add_ps(vTemp,vTemp); + M.r[1] = vTemp; + // 0,0,fRange,0.0f + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskZ); + M.r[2] = vTemp; + // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f + vValues = _mm_mul_ps(vValues,rMem2); + M.r[3] = vValues; + return M; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) +{ + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ-FarZ); + + XMMATRIX M; + 
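+    // Note: the off-center orthographic form maps x in [ViewLeft, ViewRight]
+    // to [-1, 1], y in [ViewBottom, ViewTop] to [-1, 1], and (in this
+    // right-handed variant) z = -NearZ..-FarZ to depth 0..1.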
M.m[0][0] = ReciprocalWidth + ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ReciprocalHeight + ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange * NearZ, + 1.0f); + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ-FarZ); + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( ReciprocalWidth + ReciprocalWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( ReciprocalHeight + ReciprocalHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange * NearZ, + 1.0f); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ-FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + fReciprocalWidth, + fReciprocalHeight, + fRange, + 1.0f + }; + XMVECTOR rMem2 = { + -(ViewLeft + ViewRight), + -(ViewTop + ViewBottom), + NearZ, + 1.0f + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // fReciprocalWidth*2,0,0,0 + vTemp = _mm_add_ss(vTemp,vTemp); + M.r[0] = vTemp; + // 0,fReciprocalHeight*2,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + vTemp = _mm_add_ps(vTemp,vTemp); + M.r[1] = vTemp; + // 0,0,fRange,0.0f + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskZ); + M.r[2] = vTemp; + // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f + vValues = _mm_mul_ps(vValues,rMem2); + M.r[3] = vValues; + return M; +#endif +} + +#pragma prefast(pop) + +/**************************************************************************** + * + * XMMATRIX operators and methods + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMMATRIX::XMMATRIX +( + float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33 +) +{ + r[0] = XMVectorSet(m00, m01, m02, m03); + r[1] = XMVectorSet(m10, m11, m12, m13); + r[2] = XMVectorSet(m20, m21, m22, m23); + r[3] = XMVectorSet(m30, m31, m32, m33); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX::XMMATRIX +( + const float* pArray +) +{ + assert( pArray != nullptr ); + r[0] = XMLoadFloat4((const XMFLOAT4*)pArray); + r[1] = XMLoadFloat4((const XMFLOAT4*)(pArray + 4)); + r[2] = XMLoadFloat4((const XMFLOAT4*)(pArray + 8)); + r[3] = XMLoadFloat4((const XMFLOAT4*)(pArray + 12)); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMATRIX::operator- () const +{ + XMMATRIX R; + R.r[0] = XMVectorNegate( r[0] ); + R.r[1] = XMVectorNegate( r[1] ); + R.r[2] = XMVectorNegate( r[2] ); + R.r[3] = XMVectorNegate( r[3] ); + 
return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX& XM_CALLCONV XMMATRIX::operator+= (FXMMATRIX M) +{ + r[0] = XMVectorAdd( r[0], M.r[0] ); + r[1] = XMVectorAdd( r[1], M.r[1] ); + r[2] = XMVectorAdd( r[2], M.r[2] ); + r[3] = XMVectorAdd( r[3], M.r[3] ); + return *this; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX& XM_CALLCONV XMMATRIX::operator-= (FXMMATRIX M) +{ + r[0] = XMVectorSubtract( r[0], M.r[0] ); + r[1] = XMVectorSubtract( r[1], M.r[1] ); + r[2] = XMVectorSubtract( r[2], M.r[2] ); + r[3] = XMVectorSubtract( r[3], M.r[3] ); + return *this; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX& XM_CALLCONV XMMATRIX::operator*=(FXMMATRIX M) +{ + *this = XMMatrixMultiply( *this, M ); + return *this; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX& XMMATRIX::operator*= (float S) +{ + r[0] = XMVectorScale( r[0], S ); + r[1] = XMVectorScale( r[1], S ); + r[2] = XMVectorScale( r[2], S ); + r[3] = XMVectorScale( r[3], S ); + return *this; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX& XMMATRIX::operator/= (float S) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vS = XMVectorReplicate( S ); + r[0] = XMVectorDivide( r[0], vS ); + r[1] = XMVectorDivide( r[1], vS ); + r[2] = XMVectorDivide( r[2], vS ); + r[3] = XMVectorDivide( r[3], vS ); + return *this; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x2_t vS = vdup_n_f32( S ); + float32x2_t R0 = vrecpe_f32( vS ); + float32x2_t S0 = vrecps_f32( R0, vS ); + R0 = vmul_f32( S0, R0 ); + S0 = vrecps_f32( R0, vS ); + R0 = vmul_f32( S0, R0 ); + float32x4_t Reciprocal = vcombine_u32(R0, R0); + r[0] = vmulq_f32( r[0], Reciprocal ); + r[1] = vmulq_f32( r[1], Reciprocal ); + r[2] = vmulq_f32( r[2], Reciprocal ); + r[3] = vmulq_f32( r[3], Reciprocal ); + return *this; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 vS = _mm_set_ps1( S ); + r[0] = _mm_div_ps( r[0], vS ); + r[1] = _mm_div_ps( r[1], vS ); + r[2] = _mm_div_ps( r[2], vS ); + r[3] = _mm_div_ps( r[3], vS ); + return *this; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMATRIX::operator+ (FXMMATRIX M) const +{ + XMMATRIX R; + R.r[0] = XMVectorAdd( r[0], M.r[0] ); + R.r[1] = XMVectorAdd( r[1], M.r[1] ); + R.r[2] = XMVectorAdd( r[2], M.r[2] ); + R.r[3] = XMVectorAdd( r[3], M.r[3] ); + return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMATRIX::operator- (FXMMATRIX M) const +{ + XMMATRIX R; + R.r[0] = XMVectorSubtract( r[0], M.r[0] ); + R.r[1] = XMVectorSubtract( r[1], M.r[1] ); + R.r[2] = XMVectorSubtract( r[2], M.r[2] ); + R.r[3] = XMVectorSubtract( r[3], M.r[3] ); + return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV XMMATRIX::operator*(FXMMATRIX M) const +{ + return XMMatrixMultiply(*this, M); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMATRIX::operator* (float S) const +{ + XMMATRIX R; + R.r[0] = XMVectorScale( r[0], S ); + R.r[1] = XMVectorScale( r[1], S ); + R.r[2] = XMVectorScale( r[2], S ); + R.r[3] = XMVectorScale( r[3], S ); + return R; +} + 
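+// The scalar operators apply S to every row with XMVectorScale, so all sixteen
+// elements are affected, including row 3. Usage sketch (hypothetical values):
+//
+//     XMMATRIX a = XMMatrixScaling(2.0f, 2.0f, 2.0f);
+//     XMMATRIX b = a * 0.5f;  // every element halves, including the 1.0f at m33
+//     XMMATRIX c = 0.5f * a;  // the free operator*(float, FXMMATRIX) below matches
+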
+//------------------------------------------------------------------------------ + +inline XMMATRIX XMMATRIX::operator/ (float S) const +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vS = XMVectorReplicate( S ); + XMMATRIX R; + R.r[0] = XMVectorDivide( r[0], vS ); + R.r[1] = XMVectorDivide( r[1], vS ); + R.r[2] = XMVectorDivide( r[2], vS ); + R.r[3] = XMVectorDivide( r[3], vS ); + return R; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x2_t vS = vdup_n_f32( S ); + float32x2_t R0 = vrecpe_f32( vS ); + float32x2_t S0 = vrecps_f32( R0, vS ); + R0 = vmul_f32( S0, R0 ); + S0 = vrecps_f32( R0, vS ); + R0 = vmul_f32( S0, R0 ); + float32x4_t Reciprocal = vcombine_u32(R0, R0); + XMMATRIX R; + R.r[0] = vmulq_f32( r[0], Reciprocal ); + R.r[1] = vmulq_f32( r[1], Reciprocal ); + R.r[2] = vmulq_f32( r[2], Reciprocal ); + R.r[3] = vmulq_f32( r[3], Reciprocal ); + return R; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 vS = _mm_set_ps1( S ); + XMMATRIX R; + R.r[0] = _mm_div_ps( r[0], vS ); + R.r[1] = _mm_div_ps( r[1], vS ); + R.r[2] = _mm_div_ps( r[2], vS ); + R.r[3] = _mm_div_ps( r[3], vS ); + return R; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XM_CALLCONV operator* +( + float S, + FXMMATRIX M +) +{ + XMMATRIX R; + R.r[0] = XMVectorScale( M.r[0], S ); + R.r[1] = XMVectorScale( M.r[1], S ); + R.r[2] = XMVectorScale( M.r[2], S ); + R.r[3] = XMVectorScale( M.r[3], S ); + return R; +} + +/**************************************************************************** + * + * XMFLOAT3X3 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT3X3::XMFLOAT3X3 +( + const float* pArray +) +{ + assert( pArray != nullptr ); + for (size_t Row = 0; Row < 3; Row++) + { + for (size_t Column = 0; Column < 3; Column++) + { + m[Row][Column] = pArray[Row * 3 + Column]; + } + } +} + +//------------------------------------------------------------------------------ + +inline XMFLOAT3X3& XMFLOAT3X3::operator= +( + const XMFLOAT3X3& Float3x3 +) +{ + _11 = Float3x3._11; + _12 = Float3x3._12; + _13 = Float3x3._13; + _21 = Float3x3._21; + _22 = Float3x3._22; + _23 = Float3x3._23; + _31 = Float3x3._31; + _32 = Float3x3._32; + _33 = Float3x3._33; + + return *this; +} + +/**************************************************************************** + * + * XMFLOAT4X3 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4X3::XMFLOAT4X3 +( + const float* pArray +) +{ + assert( pArray != nullptr ); + + m[0][0] = pArray[0]; + m[0][1] = pArray[1]; + m[0][2] = pArray[2]; + + m[1][0] = pArray[3]; + m[1][1] = pArray[4]; + m[1][2] = pArray[5]; + + m[2][0] = pArray[6]; + m[2][1] = pArray[7]; + m[2][2] = pArray[8]; + + m[3][0] = pArray[9]; + m[3][1] = pArray[10]; + m[3][2] = pArray[11]; +} + +//------------------------------------------------------------------------------ + +inline XMFLOAT4X3& XMFLOAT4X3::operator= +( + const XMFLOAT4X3& Float4x3 +) +{ + XMVECTOR V1 = XMLoadFloat4((const XMFLOAT4*)&Float4x3._11); + XMVECTOR V2 = XMLoadFloat4((const XMFLOAT4*)&Float4x3._22); + XMVECTOR V3 = XMLoadFloat4((const XMFLOAT4*)&Float4x3._33); + + XMStoreFloat4((XMFLOAT4*)&_11, V1); + 
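+    // Note: the 12 floats of the 4x3 are copied as three overlapping XMFLOAT4
+    // loads/stores anchored at _11, _22, and _33; each 4-float block spans the
+    // tail of one row and the head of the next, covering the struct exactly.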
XMStoreFloat4((XMFLOAT4*)&_22, V2); + XMStoreFloat4((XMFLOAT4*)&_33, V3); + + return *this; +} + +//------------------------------------------------------------------------------ + +inline XMFLOAT4X3A& XMFLOAT4X3A::operator= +( + const XMFLOAT4X3A& Float4x3 +) +{ + XMVECTOR V1 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x3._11); + XMVECTOR V2 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x3._22); + XMVECTOR V3 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x3._33); + + XMStoreFloat4A((XMFLOAT4A*)&_11, V1); + XMStoreFloat4A((XMFLOAT4A*)&_22, V2); + XMStoreFloat4A((XMFLOAT4A*)&_33, V3); + + return *this; +} + +/**************************************************************************** + * + * XMFLOAT4X4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4X4::XMFLOAT4X4 +( + const float* pArray +) +{ + assert( pArray != nullptr ); + + m[0][0] = pArray[0]; + m[0][1] = pArray[1]; + m[0][2] = pArray[2]; + m[0][3] = pArray[3]; + + m[1][0] = pArray[4]; + m[1][1] = pArray[5]; + m[1][2] = pArray[6]; + m[1][3] = pArray[7]; + + m[2][0] = pArray[8]; + m[2][1] = pArray[9]; + m[2][2] = pArray[10]; + m[2][3] = pArray[11]; + + m[3][0] = pArray[12]; + m[3][1] = pArray[13]; + m[3][2] = pArray[14]; + m[3][3] = pArray[15]; +} + +//------------------------------------------------------------------------------ + +inline XMFLOAT4X4& XMFLOAT4X4::operator= +( + const XMFLOAT4X4& Float4x4 +) +{ + XMVECTOR V1 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._11); + XMVECTOR V2 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._21); + XMVECTOR V3 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._31); + XMVECTOR V4 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._41); + + XMStoreFloat4((XMFLOAT4*)&_11, V1); + XMStoreFloat4((XMFLOAT4*)&_21, V2); + XMStoreFloat4((XMFLOAT4*)&_31, V3); + XMStoreFloat4((XMFLOAT4*)&_41, V4); + + return *this; +} + +//------------------------------------------------------------------------------ + +inline XMFLOAT4X4A& XMFLOAT4X4A::operator= +( + const XMFLOAT4X4A& Float4x4 +) +{ + XMVECTOR V1 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._11); + XMVECTOR V2 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._21); + XMVECTOR V3 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._31); + XMVECTOR V4 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._41); + + XMStoreFloat4A((XMFLOAT4A*)&_11, V1); + XMStoreFloat4A((XMFLOAT4A*)&_21, V2); + XMStoreFloat4A((XMFLOAT4A*)&_31, V3); + XMStoreFloat4A((XMFLOAT4A*)&_41, V4); + + return *this; +} + diff --git a/Inc/DirectXMathMisc.inl b/Inc/DirectXMathMisc.inl index ed6d423..69acff3 100644 --- a/Inc/DirectXMathMisc.inl +++ b/Inc/DirectXMathMisc.inl @@ -1,2512 +1,2512 @@ -//------------------------------------------------------------------------------------- -// DirectXMathMisc.inl -- SIMD C++ Math library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. 
-// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#pragma once - -/**************************************************************************** - * - * Quaternion - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ -// Comparison operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMQuaternionEqual -( - FXMVECTOR Q1, - FXMVECTOR Q2 -) -{ - return XMVector4Equal(Q1, Q2); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMQuaternionNotEqual -( - FXMVECTOR Q1, - FXMVECTOR Q2 -) -{ - return XMVector4NotEqual(Q1, Q2); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMQuaternionIsNaN -( - FXMVECTOR Q -) -{ - return XMVector4IsNaN(Q); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMQuaternionIsInfinite -( - FXMVECTOR Q -) -{ - return XMVector4IsInfinite(Q); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMQuaternionIsIdentity -( - FXMVECTOR Q -) -{ - return XMVector4Equal(Q, g_XMIdentityR3.v); -} - -//------------------------------------------------------------------------------ -// Computation operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionDot -( - FXMVECTOR Q1, - FXMVECTOR Q2 -) -{ - return XMVector4Dot(Q1, Q2); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionMultiply -( - FXMVECTOR Q1, - FXMVECTOR Q2 -) -{ - // Returns the product Q2*Q1 (which is the concatenation of a rotation Q1 followed by the rotation Q2) - - // [ (Q2.w * Q1.x) + (Q2.x * Q1.w) + (Q2.y * Q1.z) - (Q2.z * Q1.y), - // (Q2.w * Q1.y) - (Q2.x * Q1.z) + (Q2.y * Q1.w) + (Q2.z * Q1.x), - // (Q2.w * Q1.z) + (Q2.x * Q1.y) - (Q2.y * Q1.x) + (Q2.z * Q1.w), - // (Q2.w * Q1.w) - (Q2.x * Q1.x) - (Q2.y * Q1.y) - (Q2.z * Q1.z) ] - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result = { - (Q2.vector4_f32[3] * Q1.vector4_f32[0]) + (Q2.vector4_f32[0] * Q1.vector4_f32[3]) + (Q2.vector4_f32[1] * Q1.vector4_f32[2]) - (Q2.vector4_f32[2] * Q1.vector4_f32[1]), - (Q2.vector4_f32[3] * Q1.vector4_f32[1]) - (Q2.vector4_f32[0] * Q1.vector4_f32[2]) + (Q2.vector4_f32[1] * Q1.vector4_f32[3]) + (Q2.vector4_f32[2] * Q1.vector4_f32[0]), - (Q2.vector4_f32[3] * Q1.vector4_f32[2]) + (Q2.vector4_f32[0] * Q1.vector4_f32[1]) - (Q2.vector4_f32[1] * Q1.vector4_f32[0]) + (Q2.vector4_f32[2] * Q1.vector4_f32[3]), - (Q2.vector4_f32[3] * Q1.vector4_f32[3]) - (Q2.vector4_f32[0] * Q1.vector4_f32[0]) - (Q2.vector4_f32[1] * Q1.vector4_f32[1]) - (Q2.vector4_f32[2] * Q1.vector4_f32[2]) }; - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 ControlWZYX = { 1.0f,-1.0f, 1.0f,-1.0f}; - static const XMVECTORF32 ControlZWXY = { 1.0f, 1.0f,-1.0f,-1.0f}; - static const XMVECTORF32 ControlYXWZ = {-1.0f, 1.0f, 1.0f,-1.0f}; - - float32x2_t Q2L = vget_low_f32(Q2); - float32x2_t Q2H = 
vget_high_f32(Q2); - - float32x4_t Q2X = vdupq_lane_f32( Q2L, 0 ); - float32x4_t Q2Y = vdupq_lane_f32( Q2L, 1 ); - float32x4_t Q2Z = vdupq_lane_f32( Q2H, 0 ); - XMVECTOR vResult = vmulq_lane_f32(Q1, Q2H, 1); - - // Mul by Q1WZYX - float32x4_t vTemp = vrev64q_f32(Q1); - vTemp = vcombine_f32( vget_high_f32(vTemp), vget_low_f32(vTemp) ); - Q2X = vmulq_f32(Q2X,vTemp); - vResult = vmlaq_f32( vResult, Q2X, ControlWZYX ); - - // Mul by Q1ZWXY - vTemp = vrev64q_u32(vTemp); - Q2Y = vmulq_f32(Q2Y,vTemp); - vResult = vmlaq_f32(vResult, Q2Y, ControlZWXY); - - // Mul by Q1YXWZ - vTemp = vrev64q_u32(vTemp); - vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp)); - Q2Z = vmulq_f32(Q2Z,vTemp); - vResult = vmlaq_f32(vResult, Q2Z, ControlYXWZ); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 ControlWZYX = { 1.0f,-1.0f, 1.0f,-1.0f}; - static const XMVECTORF32 ControlZWXY = { 1.0f, 1.0f,-1.0f,-1.0f}; - static const XMVECTORF32 ControlYXWZ = {-1.0f, 1.0f, 1.0f,-1.0f}; - // Copy to SSE registers and use as few as possible for x86 - XMVECTOR Q2X = Q2; - XMVECTOR Q2Y = Q2; - XMVECTOR Q2Z = Q2; - XMVECTOR vResult = Q2; - // Splat with one instruction - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,3,3,3)); - Q2X = XM_PERMUTE_PS(Q2X,_MM_SHUFFLE(0,0,0,0)); - Q2Y = XM_PERMUTE_PS(Q2Y,_MM_SHUFFLE(1,1,1,1)); - Q2Z = XM_PERMUTE_PS(Q2Z,_MM_SHUFFLE(2,2,2,2)); - // Retire Q1 and perform Q1*Q2W - vResult = _mm_mul_ps(vResult,Q1); - XMVECTOR Q1Shuffle = Q1; - // Shuffle the copies of Q1 - Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(0,1,2,3)); - // Mul by Q1WZYX - Q2X = _mm_mul_ps(Q2X,Q1Shuffle); - Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(2,3,0,1)); - // Flip the signs on y and z - Q2X = _mm_mul_ps(Q2X,ControlWZYX); - // Mul by Q1ZWXY - Q2Y = _mm_mul_ps(Q2Y,Q1Shuffle); - Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(0,1,2,3)); - // Flip the signs on z and w - Q2Y = _mm_mul_ps(Q2Y,ControlZWXY); - // Mul by Q1YXWZ - Q2Z = _mm_mul_ps(Q2Z,Q1Shuffle); - vResult = _mm_add_ps(vResult,Q2X); - // Flip the signs on x and w - Q2Z = _mm_mul_ps(Q2Z,ControlYXWZ); - Q2Y = _mm_add_ps(Q2Y,Q2Z); - vResult = _mm_add_ps(vResult,Q2Y); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionLengthSq -( - FXMVECTOR Q -) -{ - return XMVector4LengthSq(Q); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength -( - FXMVECTOR Q -) -{ - return XMVector4ReciprocalLength(Q); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionLength -( - FXMVECTOR Q -) -{ - return XMVector4Length(Q); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst -( - FXMVECTOR Q -) -{ - return XMVector4NormalizeEst(Q); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionNormalize -( - FXMVECTOR Q -) -{ - return XMVector4Normalize(Q); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionConjugate -( - FXMVECTOR Q -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result = { - -Q.vector4_f32[0], - -Q.vector4_f32[1], - -Q.vector4_f32[2], - Q.vector4_f32[3] - }; - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - 
static const XMVECTORF32 NegativeOne3 = {-1.0f,-1.0f,-1.0f,1.0f}; - return vmulq_f32(Q, NegativeOne3.v ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 NegativeOne3 = {-1.0f,-1.0f,-1.0f,1.0f}; - return _mm_mul_ps(Q,NegativeOne3); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionInverse -( - FXMVECTOR Q -) -{ - const XMVECTOR Zero = XMVectorZero(); - - XMVECTOR L = XMVector4LengthSq(Q); - XMVECTOR Conjugate = XMQuaternionConjugate(Q); - - XMVECTOR Control = XMVectorLessOrEqual(L, g_XMEpsilon.v); - - XMVECTOR Result = XMVectorDivide(Conjugate, L); - - Result = XMVectorSelect(Result, Zero, Control); - - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionLn -( - FXMVECTOR Q -) -{ - static const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f}; - - XMVECTOR QW = XMVectorSplatW(Q); - XMVECTOR Q0 = XMVectorSelect(g_XMSelect1110.v, Q, g_XMSelect1110.v); - - XMVECTOR ControlW = XMVectorInBounds(QW, OneMinusEpsilon.v); - - XMVECTOR Theta = XMVectorACos(QW); - XMVECTOR SinTheta = XMVectorSin(Theta); - - XMVECTOR S = XMVectorDivide(Theta,SinTheta); - - XMVECTOR Result = XMVectorMultiply(Q0, S); - Result = XMVectorSelect(Q0, Result, ControlW); - - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionExp -( - FXMVECTOR Q -) -{ - XMVECTOR Theta = XMVector3Length(Q); - - XMVECTOR SinTheta, CosTheta; - XMVectorSinCos(&SinTheta, &CosTheta, Theta); - - XMVECTOR S = XMVectorDivide(SinTheta, Theta); - - XMVECTOR Result = XMVectorMultiply(Q, S); - - const XMVECTOR Zero = XMVectorZero(); - XMVECTOR Control = XMVectorNearEqual(Theta, Zero, g_XMEpsilon.v); - Result = XMVectorSelect(Result, Q, Control); - - Result = XMVectorSelect(CosTheta, Result, g_XMSelect1110.v); - - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionSlerp -( - FXMVECTOR Q0, - FXMVECTOR Q1, - float t -) -{ - XMVECTOR T = XMVectorReplicate(t); - return XMQuaternionSlerpV(Q0, Q1, T); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionSlerpV -( - FXMVECTOR Q0, - FXMVECTOR Q1, - FXMVECTOR T -) -{ - assert((XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T))); - - // Result = Q0 * sin((1.0 - t) * Omega) / sin(Omega) + Q1 * sin(t * Omega) / sin(Omega) - -#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) - - const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f}; - - XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1); - - const XMVECTOR Zero = XMVectorZero(); - XMVECTOR Control = XMVectorLess(CosOmega, Zero); - XMVECTOR Sign = XMVectorSelect(g_XMOne.v, g_XMNegativeOne.v, Control); - - CosOmega = XMVectorMultiply(CosOmega, Sign); - - Control = XMVectorLess(CosOmega, OneMinusEpsilon); - - XMVECTOR SinOmega = XMVectorNegativeMultiplySubtract(CosOmega, CosOmega, g_XMOne.v); - SinOmega = XMVectorSqrt(SinOmega); - - XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega); - - XMVECTOR SignMask = XMVectorSplatSignMask(); - XMVECTOR V01 = XMVectorShiftLeft(T, Zero, 2); - SignMask = XMVectorShiftLeft(SignMask, Zero, 3); - V01 = 
XMVectorXorInt(V01, SignMask); - V01 = XMVectorAdd(g_XMIdentityR0.v, V01); - - XMVECTOR InvSinOmega = XMVectorReciprocal(SinOmega); - - XMVECTOR S0 = XMVectorMultiply(V01, Omega); - S0 = XMVectorSin(S0); - S0 = XMVectorMultiply(S0, InvSinOmega); - - S0 = XMVectorSelect(V01, S0, Control); - - XMVECTOR S1 = XMVectorSplatY(S0); - S0 = XMVectorSplatX(S0); - - S1 = XMVectorMultiply(S1, Sign); - - XMVECTOR Result = XMVectorMultiply(Q0, S0); - Result = XMVectorMultiplyAdd(Q1, S1, Result); - - return Result; - -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f}; - static const XMVECTORU32 SignMask2 = {0x80000000,0x00000000,0x00000000,0x00000000}; - - XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1); - - const XMVECTOR Zero = XMVectorZero(); - XMVECTOR Control = XMVectorLess(CosOmega, Zero); - XMVECTOR Sign = XMVectorSelect(g_XMOne, g_XMNegativeOne, Control); - - CosOmega = _mm_mul_ps(CosOmega, Sign); - - Control = XMVectorLess(CosOmega, OneMinusEpsilon); - - XMVECTOR SinOmega = _mm_mul_ps(CosOmega,CosOmega); - SinOmega = _mm_sub_ps(g_XMOne,SinOmega); - SinOmega = _mm_sqrt_ps(SinOmega); - - XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega); - - XMVECTOR V01 = XM_PERMUTE_PS(T,_MM_SHUFFLE(2,3,0,1)); - V01 = _mm_and_ps(V01,g_XMMaskXY); - V01 = _mm_xor_ps(V01,SignMask2); - V01 = _mm_add_ps(g_XMIdentityR0, V01); - - XMVECTOR S0 = _mm_mul_ps(V01, Omega); - S0 = XMVectorSin(S0); - S0 = _mm_div_ps(S0, SinOmega); - - S0 = XMVectorSelect(V01, S0, Control); - - XMVECTOR S1 = XMVectorSplatY(S0); - S0 = XMVectorSplatX(S0); - - S1 = _mm_mul_ps(S1, Sign); - XMVECTOR Result = _mm_mul_ps(Q0, S0); - S1 = _mm_mul_ps(S1, Q1); - Result = _mm_add_ps(Result,S1); - return Result; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionSquad -( - FXMVECTOR Q0, - FXMVECTOR Q1, - FXMVECTOR Q2, - GXMVECTOR Q3, - float t -) -{ - XMVECTOR T = XMVectorReplicate(t); - return XMQuaternionSquadV(Q0, Q1, Q2, Q3, T); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMQuaternionSquadV -( - FXMVECTOR Q0, - FXMVECTOR Q1, - FXMVECTOR Q2, - GXMVECTOR Q3, - HXMVECTOR T -) -{ - assert( (XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T)) ); - - XMVECTOR TP = T; - const XMVECTOR Two = XMVectorSplatConstant(2, 0); - - XMVECTOR Q03 = XMQuaternionSlerpV(Q0, Q3, T); - XMVECTOR Q12 = XMQuaternionSlerpV(Q1, Q2, T); - - TP = XMVectorNegativeMultiplySubtract(TP, TP, TP); - TP = XMVectorMultiply(TP, Two); - - XMVECTOR Result = XMQuaternionSlerpV(Q03, Q12, TP); - - return Result; -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV XMQuaternionSquadSetup -( - XMVECTOR* pA, - XMVECTOR* pB, - XMVECTOR* pC, - FXMVECTOR Q0, - FXMVECTOR Q1, - FXMVECTOR Q2, - GXMVECTOR Q3 -) -{ - assert(pA); - assert(pB); - assert(pC); - - XMVECTOR LS12 = XMQuaternionLengthSq(XMVectorAdd(Q1, Q2)); - XMVECTOR LD12 = XMQuaternionLengthSq(XMVectorSubtract(Q1, Q2)); - XMVECTOR SQ2 = XMVectorNegate(Q2); - - XMVECTOR Control1 = XMVectorLess(LS12, LD12); - SQ2 = XMVectorSelect(Q2, SQ2, Control1); - - XMVECTOR LS01 = XMQuaternionLengthSq(XMVectorAdd(Q0, Q1)); - XMVECTOR LD01 = XMQuaternionLengthSq(XMVectorSubtract(Q0, Q1)); - XMVECTOR SQ0 = XMVectorNegate(Q0); - - XMVECTOR LS23 = 
XMQuaternionLengthSq(XMVectorAdd(SQ2, Q3));
- XMVECTOR LD23 = XMQuaternionLengthSq(XMVectorSubtract(SQ2, Q3));
- XMVECTOR SQ3 = XMVectorNegate(Q3);
-
- XMVECTOR Control0 = XMVectorLess(LS01, LD01);
- XMVECTOR Control2 = XMVectorLess(LS23, LD23);
-
- SQ0 = XMVectorSelect(Q0, SQ0, Control0);
- SQ3 = XMVectorSelect(Q3, SQ3, Control2);
-
- XMVECTOR InvQ1 = XMQuaternionInverse(Q1);
- XMVECTOR InvQ2 = XMQuaternionInverse(SQ2);
-
- XMVECTOR LnQ0 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ0));
- XMVECTOR LnQ2 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ2));
- XMVECTOR LnQ1 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, Q1));
- XMVECTOR LnQ3 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, SQ3));
-
- const XMVECTOR NegativeOneQuarter = XMVectorSplatConstant(-1, 2);
-
- XMVECTOR ExpQ02 = XMVectorMultiply(XMVectorAdd(LnQ0, LnQ2), NegativeOneQuarter);
- XMVECTOR ExpQ13 = XMVectorMultiply(XMVectorAdd(LnQ1, LnQ3), NegativeOneQuarter);
- ExpQ02 = XMQuaternionExp(ExpQ02);
- ExpQ13 = XMQuaternionExp(ExpQ13);
-
- *pA = XMQuaternionMultiply(Q1, ExpQ02);
- *pB = XMQuaternionMultiply(SQ2, ExpQ13);
- *pC = SQ2;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentric
-(
- FXMVECTOR Q0,
- FXMVECTOR Q1,
- FXMVECTOR Q2,
- float f,
- float g
-)
-{
- float s = f + g;
-
- XMVECTOR Result;
- if ((s < 0.00001f) && (s > -0.00001f))
- {
- Result = Q0;
- }
- else
- {
- XMVECTOR Q01 = XMQuaternionSlerp(Q0, Q1, s);
- XMVECTOR Q02 = XMQuaternionSlerp(Q0, Q2, s);
-
- Result = XMQuaternionSlerp(Q01, Q02, g / s);
- }
-
- return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV
-(
- FXMVECTOR Q0,
- FXMVECTOR Q1,
- FXMVECTOR Q2,
- GXMVECTOR F,
- HXMVECTOR G
-)
-{
- assert( (XMVectorGetY(F) == XMVectorGetX(F)) && (XMVectorGetZ(F) == XMVectorGetX(F)) && (XMVectorGetW(F) == XMVectorGetX(F)) );
- assert( (XMVectorGetY(G) == XMVectorGetX(G)) && (XMVectorGetZ(G) == XMVectorGetX(G)) && (XMVectorGetW(G) == XMVectorGetX(G)) );
-
- const XMVECTOR Epsilon = XMVectorSplatConstant(1, 16);
-
- XMVECTOR S = XMVectorAdd(F, G);
-
- XMVECTOR Result;
- if (XMVector4InBounds(S, Epsilon))
- {
- Result = Q0;
- }
- else
- {
- XMVECTOR Q01 = XMQuaternionSlerpV(Q0, Q1, S);
- XMVECTOR Q02 = XMQuaternionSlerpV(Q0, Q2, S);
- XMVECTOR GS = XMVectorReciprocal(S);
- GS = XMVectorMultiply(G, GS);
-
- Result = XMQuaternionSlerpV(Q01, Q02, GS);
- }
-
- return Result;
-}
-
-//------------------------------------------------------------------------------
-// Transformation operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionIdentity()
-{
- return g_XMIdentityR3.v;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYaw
-(
- float Pitch,
- float Yaw,
- float Roll
-)
-{
- XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f);
- XMVECTOR Q = XMQuaternionRotationRollPitchYawFromVector(Angles);
- return Q;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYawFromVector
-(
- FXMVECTOR Angles // <Pitch, Yaw, Roll, 0>
-)
-{
- static const XMVECTORF32 Sign = {1.0f, -1.0f, -1.0f, 1.0f};
-
- XMVECTOR HalfAngles = XMVectorMultiply(Angles, g_XMOneHalf.v);
-
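- // The six permutes below gather the half-angle terms so that each lane is a
- // product of three sin/cos factors (one per axis); Sign then applies the
- // +/- pattern of the roll/pitch/yaw quaternion composition.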
- XMVECTOR SinAngles, CosAngles;
- XMVectorSinCos(&SinAngles, &CosAngles, HalfAngles);
-
- XMVECTOR P0 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(SinAngles, CosAngles);
- XMVECTOR Y0 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(SinAngles, CosAngles);
- XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(SinAngles, CosAngles);
- XMVECTOR P1 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(CosAngles, SinAngles);
- XMVECTOR Y1 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(CosAngles, SinAngles);
- XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(CosAngles, SinAngles);
-
- XMVECTOR Q1 = XMVectorMultiply(P1, Sign.v);
- XMVECTOR Q0 = XMVectorMultiply(P0, Y0);
- Q1 = XMVectorMultiply(Q1, Y1);
- Q0 = XMVectorMultiply(Q0, R0);
- XMVECTOR Q = XMVectorMultiplyAdd(Q1, R1, Q0);
-
- return Q;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionRotationNormal
-(
- FXMVECTOR NormalAxis,
- float Angle
-)
-{
-#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
-
- XMVECTOR N = XMVectorSelect(g_XMOne.v, NormalAxis, g_XMSelect1110.v);
-
- float SinV, CosV;
- XMScalarSinCos(&SinV, &CosV, 0.5f * Angle);
-
- XMVECTOR Scale = XMVectorSet( SinV, SinV, SinV, CosV );
- return XMVectorMultiply(N, Scale);
-#elif defined(_XM_SSE_INTRINSICS_)
- XMVECTOR N = _mm_and_ps(NormalAxis,g_XMMask3);
- N = _mm_or_ps(N,g_XMIdentityR3);
- XMVECTOR Scale = _mm_set_ps1(0.5f * Angle);
- XMVECTOR vSine;
- XMVECTOR vCosine;
- XMVectorSinCos(&vSine,&vCosine,Scale);
- Scale = _mm_and_ps(vSine,g_XMMask3);
- vCosine = _mm_and_ps(vCosine,g_XMMaskW);
- Scale = _mm_or_ps(Scale,vCosine);
- N = _mm_mul_ps(N,Scale);
- return N;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionRotationAxis
-(
- FXMVECTOR Axis,
- float Angle
-)
-{
- assert(!XMVector3Equal(Axis, XMVectorZero()));
- assert(!XMVector3IsInfinite(Axis));
-
- XMVECTOR Normal = XMVector3Normalize(Axis);
- XMVECTOR Q = XMQuaternionRotationNormal(Normal, Angle);
- return Q;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix
-(
- FXMMATRIX M
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-
- XMVECTORF32 q;
- float r22 = M.m[2][2];
- if (r22 <= 0.f) // x^2 + y^2 >= z^2 + w^2
- {
- float dif10 = M.m[1][1] - M.m[0][0];
- float omr22 = 1.f - r22;
- if (dif10 <= 0.f) // x^2 >= y^2
- {
- float fourXSqr = omr22 - dif10;
- float inv4x = 0.5f / sqrtf(fourXSqr);
- q.f[0] = fourXSqr*inv4x;
- q.f[1] = (M.m[0][1] + M.m[1][0])*inv4x;
- q.f[2] = (M.m[0][2] + M.m[2][0])*inv4x;
- q.f[3] = (M.m[1][2] - M.m[2][1])*inv4x;
- }
- else // y^2 >= x^2
- {
- float fourYSqr = omr22 + dif10;
- float inv4y = 0.5f / sqrtf(fourYSqr);
- q.f[0] = (M.m[0][1] + M.m[1][0])*inv4y;
- q.f[1] = fourYSqr*inv4y;
- q.f[2] = (M.m[1][2] + M.m[2][1])*inv4y;
- q.f[3] = (M.m[2][0] - M.m[0][2])*inv4y;
- }
- }
- else // z^2 + w^2 >= x^2 + y^2
- {
- float sum10 = M.m[1][1] + M.m[0][0];
- float opr22 = 1.f + r22;
- if (sum10 <= 0.f) // z^2 >= w^2
- {
- float fourZSqr = opr22 - sum10;
- float inv4z = 0.5f / sqrtf(fourZSqr);
- q.f[0] = (M.m[0][2] + M.m[2][0])*inv4z;
- q.f[1] = (M.m[1][2] + M.m[2][1])*inv4z;
- q.f[2] = fourZSqr*inv4z;
- q.f[3] = (M.m[0][1] - M.m[1][0])*inv4z;
- }
- else // w^2 >= z^2
- {
- float fourWSqr = opr22 + sum10;
- float inv4w = 0.5f / sqrtf(fourWSqr);
- q.f[0] = (M.m[1][2] - M.m[2][1])*inv4w;
- q.f[1] = (M.m[2][0] - M.m[0][2])*inv4w;
- q.f[2] = (M.m[0][1] - M.m[1][0])*inv4w;
- q.f[3] = fourWSqr*inv4w;
- }
- }
- return q.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
- static const XMVECTORF32 XMPMMP =
{+1.0f, -1.0f, -1.0f, +1.0f}; - static const XMVECTORF32 XMMPMP = {-1.0f, +1.0f, -1.0f, +1.0f}; - static const XMVECTORF32 XMMMPP = {-1.0f, -1.0f, +1.0f, +1.0f}; - static const XMVECTORU32 Select0110 = { XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0 }; - static const XMVECTORU32 Select0010 = { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 }; - - XMVECTOR r0 = M.r[0]; - XMVECTOR r1 = M.r[1]; - XMVECTOR r2 = M.r[2]; - - XMVECTOR r00 = vdupq_lane_f32(vget_low_f32(r0), 0); - XMVECTOR r11 = vdupq_lane_f32(vget_low_f32(r1), 1); - XMVECTOR r22 = vdupq_lane_f32(vget_high_f32(r2), 0); - - // x^2 >= y^2 equivalent to r11 - r00 <= 0 - XMVECTOR r11mr00 = vsubq_f32(r11, r00); - XMVECTOR x2gey2 = vcleq_f32(r11mr00, g_XMZero); - - // z^2 >= w^2 equivalent to r11 + r00 <= 0 - XMVECTOR r11pr00 = vaddq_f32(r11, r00); - XMVECTOR z2gew2 = vcleq_f32(r11pr00, g_XMZero); - - // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0 - XMVECTOR x2py2gez2pw2 = vcleq_f32(r22, g_XMZero); - - // (4*x^2, 4*y^2, 4*z^2, 4*w^2) - XMVECTOR t0 = vmulq_f32( XMPMMP, r00 ); - XMVECTOR x2y2z2w2 = vmlaq_f32( t0, XMMPMP, r11 ); - x2y2z2w2 = vmlaq_f32( x2y2z2w2, XMMMPP, r22 ); - x2y2z2w2 = vaddq_f32( x2y2z2w2, g_XMOne ); - - // (r01, r02, r12, r11) - t0 = vextq_f32(r0, r0, 1); - XMVECTOR t1 = vextq_f32(r1, r1, 1); - t0 = vcombine_f32( vget_low_f32(t0), vrev64_f32( vget_low_f32( t1 ) ) ); - - // (r10, r20, r21, r10) - t1 = vextq_f32(r2, r2, 3); - XMVECTOR r10 = vdupq_lane_f32( vget_low_f32(r1), 0 ); - t1 = vbslq_f32( Select0110, t1, r10 ); - - // (4*x*y, 4*x*z, 4*y*z, unused) - XMVECTOR xyxzyz = vaddq_f32(t0, t1); - - // (r21, r20, r10, r10) - t0 = vcombine_f32( vrev64_f32( vget_low_f32(r2) ), vget_low_f32(r10) ); - - // (r12, r02, r01, r12) - XMVECTOR t2 = vcombine_f32( vrev64_f32( vget_high_f32(r0) ), vrev64_f32( vget_low_f32(r0) ) ); - XMVECTOR t3 = vdupq_lane_f32( vget_high_f32(r1), 0 ); - t1 = vbslq_f32( Select0110, t2, t3 ); - - // (4*x*w, 4*y*w, 4*z*w, unused) - XMVECTOR xwywzw = vsubq_f32(t0, t1); - xwywzw = vmulq_f32(XMMPMP, xwywzw); - - // (4*x*x, 4*x*y, 4*x*z, 4*x*w) - t0 = vextq_f32( xyxzyz, xyxzyz, 3 ); - t1 = vbslq_f32( Select0110, t0, x2y2z2w2 ); - t2 = vdupq_lane_f32( vget_low_f32(xwywzw), 0 ); - XMVECTOR tensor0 = vbslq_f32( g_XMSelect1110, t1, t2 ); - - // (4*y*x, 4*y*y, 4*y*z, 4*y*w) - t0 = vbslq_f32( g_XMSelect1011, xyxzyz, x2y2z2w2 ); - t1 = vdupq_lane_f32( vget_low_f32(xwywzw), 1 ); - XMVECTOR tensor1 = vbslq_f32( g_XMSelect1110, t0, t1 ); - - // (4*z*x, 4*z*y, 4*z*z, 4*z*w) - t0 = vextq_f32(xyxzyz, xyxzyz, 1); - t1 = vcombine_f32( vget_low_f32(t0), vrev64_f32( vget_high_f32(xwywzw) ) ); - XMVECTOR tensor2 = vbslq_f32( Select0010, x2y2z2w2, t1 ); - - // (4*w*x, 4*w*y, 4*w*z, 4*w*w) - XMVECTOR tensor3 = vbslq_f32( g_XMSelect1110, xwywzw, x2y2z2w2 ); - - // Select the row of the tensor-product matrix that has the largest - // magnitude. - t0 = vbslq_f32( x2gey2, tensor0, tensor1 ); - t1 = vbslq_f32( z2gew2, tensor2, tensor3 ); - t2 = vbslq_f32( x2py2gez2pw2, t0, t1 ); - - // Normalize the row. No division by zero is possible because the - // quaternion is unit-length (and the row is a nonzero multiple of - // the quaternion). 
- t0 = XMVector4Length(t2); - return XMVectorDivide(t2, t0); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 XMPMMP = {+1.0f, -1.0f, -1.0f, +1.0f}; - static const XMVECTORF32 XMMPMP = {-1.0f, +1.0f, -1.0f, +1.0f}; - static const XMVECTORF32 XMMMPP = {-1.0f, -1.0f, +1.0f, +1.0f}; - - XMVECTOR r0 = M.r[0]; // (r00, r01, r02, 0) - XMVECTOR r1 = M.r[1]; // (r10, r11, r12, 0) - XMVECTOR r2 = M.r[2]; // (r20, r21, r22, 0) - - // (r00, r00, r00, r00) - XMVECTOR r00 = XM_PERMUTE_PS(r0, _MM_SHUFFLE(0,0,0,0)); - // (r11, r11, r11, r11) - XMVECTOR r11 = XM_PERMUTE_PS(r1, _MM_SHUFFLE(1,1,1,1)); - // (r22, r22, r22, r22) - XMVECTOR r22 = XM_PERMUTE_PS(r2, _MM_SHUFFLE(2,2,2,2)); - - // x^2 >= y^2 equivalent to r11 - r00 <= 0 - // (r11 - r00, r11 - r00, r11 - r00, r11 - r00) - XMVECTOR r11mr00 = _mm_sub_ps(r11, r00); - XMVECTOR x2gey2 = _mm_cmple_ps(r11mr00, g_XMZero); - - // z^2 >= w^2 equivalent to r11 + r00 <= 0 - // (r11 + r00, r11 + r00, r11 + r00, r11 + r00) - XMVECTOR r11pr00 = _mm_add_ps(r11, r00); - XMVECTOR z2gew2 = _mm_cmple_ps(r11pr00, g_XMZero); - - // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0 - XMVECTOR x2py2gez2pw2 = _mm_cmple_ps(r22, g_XMZero); - - // (+r00, -r00, -r00, +r00) - XMVECTOR t0 = _mm_mul_ps(XMPMMP, r00); - - // (-r11, +r11, -r11, +r11) - XMVECTOR t1 = _mm_mul_ps(XMMPMP, r11); - - // (-r22, -r22, +r22, +r22) - XMVECTOR t2 = _mm_mul_ps(XMMMPP, r22); - - // (4*x^2, 4*y^2, 4*z^2, 4*w^2) - XMVECTOR x2y2z2w2 = _mm_add_ps(t0, t1); - x2y2z2w2 = _mm_add_ps(t2, x2y2z2w2); - x2y2z2w2 = _mm_add_ps(x2y2z2w2, g_XMOne); - - // (r01, r02, r12, r11) - t0 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1,2,2,1)); - // (r10, r10, r20, r21) - t1 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1,0,0,0)); - // (r10, r20, r21, r10) - t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1,3,2,0)); - // (4*x*y, 4*x*z, 4*y*z, unused) - XMVECTOR xyxzyz = _mm_add_ps(t0, t1); - - // (r21, r20, r10, r10) - t0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0,0,0,1)); - // (r12, r12, r02, r01) - t1 = _mm_shuffle_ps(r1, r0, _MM_SHUFFLE(1,2,2,2)); - // (r12, r02, r01, r12) - t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1,3,2,0)); - // (4*x*w, 4*y*w, 4*z*w, unused) - XMVECTOR xwywzw = _mm_sub_ps(t0, t1); - xwywzw = _mm_mul_ps(XMMPMP, xwywzw); - - // (4*x^2, 4*y^2, 4*x*y, unused) - t0 = _mm_shuffle_ps(x2y2z2w2, xyxzyz, _MM_SHUFFLE(0,0,1,0)); - // (4*z^2, 4*w^2, 4*z*w, unused) - t1 = _mm_shuffle_ps(x2y2z2w2, xwywzw, _MM_SHUFFLE(0,2,3,2)); - // (4*x*z, 4*y*z, 4*x*w, 4*y*w) - t2 = _mm_shuffle_ps(xyxzyz, xwywzw, _MM_SHUFFLE(1,0,2,1)); - - // (4*x*x, 4*x*y, 4*x*z, 4*x*w) - XMVECTOR tensor0 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,0,2,0)); - // (4*y*x, 4*y*y, 4*y*z, 4*y*w) - XMVECTOR tensor1 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,1,1,2)); - // (4*z*x, 4*z*y, 4*z*z, 4*z*w) - XMVECTOR tensor2 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2,0,1,0)); - // (4*w*x, 4*w*y, 4*w*z, 4*w*w) - XMVECTOR tensor3 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(1,2,3,2)); - - // Select the row of the tensor-product matrix that has the largest - // magnitude. - t0 = _mm_and_ps(x2gey2, tensor0); - t1 = _mm_andnot_ps(x2gey2, tensor1); - t0 = _mm_or_ps(t0, t1); - t1 = _mm_and_ps(z2gew2, tensor2); - t2 = _mm_andnot_ps(z2gew2, tensor3); - t1 = _mm_or_ps(t1, t2); - t0 = _mm_and_ps(x2py2gez2pw2, t0); - t1 = _mm_andnot_ps(x2py2gez2pw2, t1); - t2 = _mm_or_ps(t0, t1); - - // Normalize the row. No division by zero is possible because the - // quaternion is unit-length (and the row is a nonzero multiple of - // the quaternion). 
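- // (Each tensor row equals 4*q[i]*q, so the division returns q up to an
- // overall sign, which represents the same rotation.)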
- t0 = XMVector4Length(t2); - return _mm_div_ps(t2, t0); -#endif -} - -//------------------------------------------------------------------------------ -// Conversion operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV XMQuaternionToAxisAngle -( - XMVECTOR* pAxis, - float* pAngle, - FXMVECTOR Q -) -{ - assert(pAxis); - assert(pAngle); - - *pAxis = Q; - - *pAngle = 2.0f * XMScalarACos(XMVectorGetW(Q)); -} - -/**************************************************************************** - * - * Plane - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ -// Comparison operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMPlaneEqual -( - FXMVECTOR P1, - FXMVECTOR P2 -) -{ - return XMVector4Equal(P1, P2); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMPlaneNearEqual -( - FXMVECTOR P1, - FXMVECTOR P2, - FXMVECTOR Epsilon -) -{ - XMVECTOR NP1 = XMPlaneNormalize(P1); - XMVECTOR NP2 = XMPlaneNormalize(P2); - return XMVector4NearEqual(NP1, NP2, Epsilon); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMPlaneNotEqual -( - FXMVECTOR P1, - FXMVECTOR P2 -) -{ - return XMVector4NotEqual(P1, P2); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMPlaneIsNaN -( - FXMVECTOR P -) -{ - return XMVector4IsNaN(P); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMPlaneIsInfinite -( - FXMVECTOR P -) -{ - return XMVector4IsInfinite(P); -} - -//------------------------------------------------------------------------------ -// Computation operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMPlaneDot -( - FXMVECTOR P, - FXMVECTOR V -) -{ - return XMVector4Dot(P, V); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMPlaneDotCoord -( - FXMVECTOR P, - FXMVECTOR V -) -{ - // Result = P[0] * V[0] + P[1] * V[1] + P[2] * V[2] + P[3] - - XMVECTOR V3 = XMVectorSelect(g_XMOne.v, V, g_XMSelect1110.v); - XMVECTOR Result = XMVector4Dot(P, V3); - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMPlaneDotNormal -( - FXMVECTOR P, - FXMVECTOR V -) -{ - return XMVector3Dot(P, V); -} - -//------------------------------------------------------------------------------ -// XMPlaneNormalizeEst uses a reciprocal estimate and -// returns QNaN on zero and infinite vectors. 
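-// Example (illustrative): P = (0,0,2,4) normalizes to approximately
-// (0,0,1,2) -- the same plane, now with a unit-length normal.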
- -inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst -( - FXMVECTOR P -) -{ -#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) - - XMVECTOR Result = XMVector3ReciprocalLengthEst(P); - return XMVectorMultiply(P, Result); - -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( P, P, 0x7f ); - XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); - return _mm_mul_ps(vResult, P); -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product - XMVECTOR vDot = _mm_mul_ps(P,P); - // x=Dot.y, y=Dot.z - XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); - // Result.x = x+y - vDot = _mm_add_ss(vDot,vTemp); - // x=Dot.z - vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); - // Result.x = (x+y)+z - vDot = _mm_add_ss(vDot,vTemp); - // Splat x - vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); - // Get the reciprocal - vDot = _mm_rsqrt_ps(vDot); - // Get the reciprocal - vDot = _mm_mul_ps(vDot,P); - return vDot; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMPlaneNormalize -( - FXMVECTOR P -) -{ -#if defined(_XM_NO_INTRINSICS_) - float fLengthSq = sqrtf((P.vector4_f32[0]*P.vector4_f32[0])+(P.vector4_f32[1]*P.vector4_f32[1])+(P.vector4_f32[2]*P.vector4_f32[2])); - // Prevent divide by zero - if (fLengthSq) { - fLengthSq = 1.0f/fLengthSq; - } - { - XMVECTOR vResult = { - P.vector4_f32[0]*fLengthSq, - P.vector4_f32[1]*fLengthSq, - P.vector4_f32[2]*fLengthSq, - P.vector4_f32[3]*fLengthSq - }; - return vResult; - } -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR vLength = XMVector3ReciprocalLength(P); - return XMVectorMultiply( P, vLength ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vLengthSq = _mm_dp_ps( P, P, 0x7f ); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Reciprocal mul to perform the normalization - vResult = _mm_div_ps(P,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vLengthSq); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y and z only - XMVECTOR vLengthSq = _mm_mul_ps(P,P); - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,1,2,1)); - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Reciprocal mul to perform the normalization - vResult = _mm_div_ps(P,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vLengthSq); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMPlaneIntersectLine -( - FXMVECTOR P, - FXMVECTOR LinePoint1, - FXMVECTOR LinePoint2 -) -{ - XMVECTOR V1 = XMVector3Dot(P, LinePoint1); - XMVECTOR V2 = XMVector3Dot(P, LinePoint2); - XMVECTOR D = XMVectorSubtract(V1, V2); - - XMVECTOR VT = XMPlaneDotCoord(P, LinePoint1); - VT = XMVectorDivide(VT, D); - - XMVECTOR Point = XMVectorSubtract(LinePoint2, LinePoint1); - Point = XMVectorMultiplyAdd(Point, VT, LinePoint1); - - const XMVECTOR Zero = 
XMVectorZero(); - XMVECTOR Control = XMVectorNearEqual(D, Zero, g_XMEpsilon.v); - - return XMVectorSelect(Point, g_XMQNaN.v, Control); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV XMPlaneIntersectPlane -( - XMVECTOR* pLinePoint1, - XMVECTOR* pLinePoint2, - FXMVECTOR P1, - FXMVECTOR P2 -) -{ - assert(pLinePoint1); - assert(pLinePoint2); - - XMVECTOR V1 = XMVector3Cross(P2, P1); - - XMVECTOR LengthSq = XMVector3LengthSq(V1); - - XMVECTOR V2 = XMVector3Cross(P2, V1); - - XMVECTOR P1W = XMVectorSplatW(P1); - XMVECTOR Point = XMVectorMultiply(V2, P1W); - - XMVECTOR V3 = XMVector3Cross(V1, P1); - - XMVECTOR P2W = XMVectorSplatW(P2); - Point = XMVectorMultiplyAdd(V3, P2W, Point); - - XMVECTOR LinePoint1 = XMVectorDivide(Point, LengthSq); - - XMVECTOR LinePoint2 = XMVectorAdd(LinePoint1, V1); - - XMVECTOR Control = XMVectorLessOrEqual(LengthSq, g_XMEpsilon.v); - *pLinePoint1 = XMVectorSelect(LinePoint1,g_XMQNaN.v, Control); - *pLinePoint2 = XMVectorSelect(LinePoint2,g_XMQNaN.v, Control); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMPlaneTransform -( - FXMVECTOR P, - FXMMATRIX M -) -{ - XMVECTOR W = XMVectorSplatW(P); - XMVECTOR Z = XMVectorSplatZ(P); - XMVECTOR Y = XMVectorSplatY(P); - XMVECTOR X = XMVectorSplatX(P); - - XMVECTOR Result = XMVectorMultiply(W, M.r[3]); - Result = XMVectorMultiplyAdd(Z, M.r[2], Result); - Result = XMVectorMultiplyAdd(Y, M.r[1], Result); - Result = XMVectorMultiplyAdd(X, M.r[0], Result); - return Result; -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMFLOAT4* XM_CALLCONV XMPlaneTransformStream -( - XMFLOAT4* pOutputStream, - size_t OutputStride, - const XMFLOAT4* pInputStream, - size_t InputStride, - size_t PlaneCount, - FXMMATRIX M -) -{ - return XMVector4TransformStream(pOutputStream, - OutputStride, - pInputStream, - InputStride, - PlaneCount, - M); -} - -//------------------------------------------------------------------------------ -// Conversion operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMPlaneFromPointNormal -( - FXMVECTOR Point, - FXMVECTOR Normal -) -{ - XMVECTOR W = XMVector3Dot(Point, Normal); - W = XMVectorNegate(W); - return XMVectorSelect(W, Normal, g_XMSelect1110.v); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMPlaneFromPoints -( - FXMVECTOR Point1, - FXMVECTOR Point2, - FXMVECTOR Point3 -) -{ - XMVECTOR V21 = XMVectorSubtract(Point1, Point2); - XMVECTOR V31 = XMVectorSubtract(Point1, Point3); - - XMVECTOR N = XMVector3Cross(V21, V31); - N = XMVector3Normalize(N); - - XMVECTOR D = XMPlaneDotNormal(N, Point1); - D = XMVectorNegate(D); - - XMVECTOR Result = XMVectorSelect(D, N, g_XMSelect1110.v); - - return Result; -} - -/**************************************************************************** - * - * Color - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ -// Comparison operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline bool 
XM_CALLCONV XMColorEqual -( - FXMVECTOR C1, - FXMVECTOR C2 -) -{ - return XMVector4Equal(C1, C2); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMColorNotEqual -( - FXMVECTOR C1, - FXMVECTOR C2 -) -{ - return XMVector4NotEqual(C1, C2); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMColorGreater -( - FXMVECTOR C1, - FXMVECTOR C2 -) -{ - return XMVector4Greater(C1, C2); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMColorGreaterOrEqual -( - FXMVECTOR C1, - FXMVECTOR C2 -) -{ - return XMVector4GreaterOrEqual(C1, C2); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMColorLess -( - FXMVECTOR C1, - FXMVECTOR C2 -) -{ - return XMVector4Less(C1, C2); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMColorLessOrEqual -( - FXMVECTOR C1, - FXMVECTOR C2 -) -{ - return XMVector4LessOrEqual(C1, C2); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMColorIsNaN -( - FXMVECTOR C -) -{ - return XMVector4IsNaN(C); -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMColorIsInfinite -( - FXMVECTOR C -) -{ - return XMVector4IsInfinite(C); -} - -//------------------------------------------------------------------------------ -// Computation operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorNegative -( - FXMVECTOR vColor -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - 1.0f - vColor.vector4_f32[0], - 1.0f - vColor.vector4_f32[1], - 1.0f - vColor.vector4_f32[2], - vColor.vector4_f32[3] - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR vTemp = veorq_u32(vColor,g_XMNegate3); - return vaddq_f32(vTemp,g_XMOne3); -#elif defined(_XM_SSE_INTRINSICS_) - // Negate only x,y and z. 
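- // (XOR flips the sign bits of x,y,z giving (-r,-g,-b,a); adding (1,1,1,0)
- // then yields (1-r,1-g,1-b,a), the complement with alpha preserved.)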
- XMVECTOR vTemp = _mm_xor_ps(vColor,g_XMNegate3); - // Add 1,1,1,0 to -x,-y,-z,w - return _mm_add_ps(vTemp,g_XMOne3); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorModulate -( - FXMVECTOR C1, - FXMVECTOR C2 -) -{ - return XMVectorMultiply(C1, C2); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorAdjustSaturation -( - FXMVECTOR vColor, - float fSaturation -) -{ - // Luminance = 0.2125f * C[0] + 0.7154f * C[1] + 0.0721f * C[2]; - // Result = (C - Luminance) * Saturation + Luminance; - -#if defined(_XM_NO_INTRINSICS_) - const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f}; - - float fLuminance = (vColor.vector4_f32[0]*gvLuminance.f[0])+(vColor.vector4_f32[1]*gvLuminance.f[1])+(vColor.vector4_f32[2]*gvLuminance.f[2]); - XMVECTOR vResult; - vResult.vector4_f32[0] = ((vColor.vector4_f32[0] - fLuminance)*fSaturation)+fLuminance; - vResult.vector4_f32[1] = ((vColor.vector4_f32[1] - fLuminance)*fSaturation)+fLuminance; - vResult.vector4_f32[2] = ((vColor.vector4_f32[2] - fLuminance)*fSaturation)+fLuminance; - vResult.vector4_f32[3] = vColor.vector4_f32[3]; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f}; - XMVECTOR vLuminance = XMVector3Dot( vColor, gvLuminance ); - XMVECTOR vResult = vsubq_f32(vColor, vLuminance); - vResult = vmlaq_n_f32( vLuminance, vResult, fSaturation ); - return vbslq_f32( g_XMSelect1110, vResult, vColor ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f}; - XMVECTOR vLuminance = XMVector3Dot( vColor, gvLuminance ); -// Splat fSaturation - XMVECTOR vSaturation = _mm_set_ps1(fSaturation); -// vResult = ((vColor-vLuminance)*vSaturation)+vLuminance; - XMVECTOR vResult = _mm_sub_ps(vColor,vLuminance); - vResult = _mm_mul_ps(vResult,vSaturation); - vResult = _mm_add_ps(vResult,vLuminance); -// Retain w from the source color - vLuminance = _mm_shuffle_ps(vResult,vColor,_MM_SHUFFLE(3,2,2,2)); // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w - vResult = _mm_shuffle_ps(vResult,vLuminance,_MM_SHUFFLE(3,0,1,0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorAdjustContrast -( - FXMVECTOR vColor, - float fContrast -) -{ - // Result = (vColor - 0.5f) * fContrast + 0.5f; - -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - ((vColor.vector4_f32[0]-0.5f) * fContrast) + 0.5f, - ((vColor.vector4_f32[1]-0.5f) * fContrast) + 0.5f, - ((vColor.vector4_f32[2]-0.5f) * fContrast) + 0.5f, - vColor.vector4_f32[3] // Leave W untouched - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR vResult = vsubq_f32(vColor, g_XMOneHalf.v); - vResult = vmlaq_n_f32( g_XMOneHalf.v, vResult, fContrast ); - return vbslq_f32( g_XMSelect1110, vResult, vColor ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vScale = _mm_set_ps1(fContrast); // Splat the scale - XMVECTOR vResult = _mm_sub_ps(vColor,g_XMOneHalf); // Subtract 0.5f from the source (Saving source) - vResult = _mm_mul_ps(vResult,vScale); // Mul by scale - vResult = _mm_add_ps(vResult,g_XMOneHalf); // Add 0.5f -// Retain w from the source color - vScale = _mm_shuffle_ps(vResult,vColor,_MM_SHUFFLE(3,2,2,2)); // x = vResult.z,y = vResult.z,z = 
vColor.z,w=vColor.w - vResult = _mm_shuffle_ps(vResult,vScale,_MM_SHUFFLE(3,0,1,0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorRGBToHSL( FXMVECTOR rgb ) -{ - XMVECTOR r = XMVectorSplatX( rgb ); - XMVECTOR g = XMVectorSplatY( rgb ); - XMVECTOR b = XMVectorSplatZ( rgb ); - - XMVECTOR min = XMVectorMin( r, XMVectorMin( g, b ) ); - XMVECTOR max = XMVectorMax( r, XMVectorMax( g, b ) ); - - XMVECTOR l = XMVectorMultiply( XMVectorAdd( min, max ), g_XMOneHalf ); - - XMVECTOR d = XMVectorSubtract( max, min ); - - XMVECTOR la = XMVectorSelect( rgb, l, g_XMSelect1110 ); - - if ( XMVector3Less( d, g_XMEpsilon ) ) - { - // Achromatic, assume H and S of 0 - return XMVectorSelect( la, g_XMZero, g_XMSelect1100 ); - } - else - { - XMVECTOR s, h; - - XMVECTOR d2 = XMVectorAdd( min, max ); - - if ( XMVector3Greater( l, g_XMOneHalf ) ) - { - // d / (2-max-min) - s = XMVectorDivide( d, XMVectorSubtract( g_XMTwo, d2 ) ); - } - else - { - // d / (max+min) - s = XMVectorDivide( d, d2 ); - } - - if ( XMVector3Equal( r, max ) ) - { - // Red is max - h = XMVectorDivide( XMVectorSubtract( g, b ), d ); - } - else if ( XMVector3Equal( g, max ) ) - { - // Green is max - h = XMVectorDivide( XMVectorSubtract( b, r ), d ); - h = XMVectorAdd( h, g_XMTwo ); - } - else - { - // Blue is max - h = XMVectorDivide( XMVectorSubtract( r, g ), d ); - h = XMVectorAdd( h, g_XMFour ); - } - - h = XMVectorDivide( h, g_XMSix ); - - if ( XMVector3Less( h, g_XMZero ) ) - h = XMVectorAdd( h, g_XMOne ); - - XMVECTOR lha = XMVectorSelect( la, h, g_XMSelect1100 ); - return XMVectorSelect( s, lha, g_XMSelect1011 ); - } -} - -//------------------------------------------------------------------------------ - -namespace Internal -{ - -inline XMVECTOR XM_CALLCONV XMColorHue2Clr( FXMVECTOR p, FXMVECTOR q, FXMVECTOR h ) -{ - static const XMVECTORF32 oneSixth = { 1.0f/6.0f, 1.0f/6.0f, 1.0f/6.0f, 1.0f/6.0f }; - static const XMVECTORF32 twoThirds = { 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f }; - - XMVECTOR t = h; - - if ( XMVector3Less( t, g_XMZero ) ) - t = XMVectorAdd( t, g_XMOne ); - - if ( XMVector3Greater( t, g_XMOne ) ) - t = XMVectorSubtract( t, g_XMOne ); - - if ( XMVector3Less( t, oneSixth ) ) - { - // p + (q - p) * 6 * t - XMVECTOR t1 = XMVectorSubtract( q, p ); - XMVECTOR t2 = XMVectorMultiply( g_XMSix, t ); - return XMVectorMultiplyAdd( t1, t2, p ); - } - - if ( XMVector3Less( t, g_XMOneHalf ) ) - return q; - - if ( XMVector3Less( t, twoThirds ) ) - { - // p + (q - p) * 6 * (2/3 - t) - XMVECTOR t1 = XMVectorSubtract( q, p ); - XMVECTOR t2 = XMVectorMultiply( g_XMSix, XMVectorSubtract( twoThirds, t ) ); - return XMVectorMultiplyAdd( t1, t2, p ); - } - - return p; -} - -}; // namespace Internal - -inline XMVECTOR XM_CALLCONV XMColorHSLToRGB( FXMVECTOR hsl ) -{ - static const XMVECTORF32 oneThird = { 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f }; - - XMVECTOR s = XMVectorSplatY( hsl ); - XMVECTOR l = XMVectorSplatZ( hsl ); - - if ( XMVector3NearEqual( s, g_XMZero, g_XMEpsilon ) ) - { - // Achromatic - return XMVectorSelect( hsl, l, g_XMSelect1110 ); - } - else - { - XMVECTOR h = XMVectorSplatX( hsl ); - - XMVECTOR q; - if ( XMVector3Less( l, g_XMOneHalf ) ) - { - q = XMVectorMultiply( l, XMVectorAdd ( g_XMOne, s ) ); - } - else - { - q = XMVectorSubtract( XMVectorAdd( l, s ), XMVectorMultiply( l, s ) ); - } - - XMVECTOR p = XMVectorSubtract( XMVectorMultiply( g_XMTwo, l ), q ); - - 
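// One channel per call below: the hue is shifted by +1/3 for red, used as-is
- // for green, and shifted by -1/3 for blue; XMColorHue2Clr wraps the shifted
- // hue back into [0,1].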
- XMVECTOR r = DirectX::Internal::XMColorHue2Clr( p, q, XMVectorAdd( h, oneThird ) );
- XMVECTOR g = DirectX::Internal::XMColorHue2Clr( p, q, h );
- XMVECTOR b = DirectX::Internal::XMColorHue2Clr( p, q, XMVectorSubtract( h, oneThird ) );
-
- XMVECTOR rg = XMVectorSelect( g, r, g_XMSelect1000 );
- XMVECTOR ba = XMVectorSelect( hsl, b, g_XMSelect1110 );
-
- return XMVectorSelect( ba, rg, g_XMSelect1100 );
- }
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMColorRGBToHSV( FXMVECTOR rgb )
-{
- XMVECTOR r = XMVectorSplatX( rgb );
- XMVECTOR g = XMVectorSplatY( rgb );
- XMVECTOR b = XMVectorSplatZ( rgb );
-
- XMVECTOR min = XMVectorMin( r, XMVectorMin( g, b ) );
- XMVECTOR v = XMVectorMax( r, XMVectorMax( g, b ) );
-
- XMVECTOR d = XMVectorSubtract( v, min );
-
- XMVECTOR s = ( XMVector3NearEqual( v, g_XMZero, g_XMEpsilon ) ) ? g_XMZero : XMVectorDivide( d, v );
-
- if ( XMVector3Less( d, g_XMEpsilon ) )
- {
- // Achromatic, assume H of 0
- XMVECTOR hv = XMVectorSelect( v, g_XMZero, g_XMSelect1000 );
- XMVECTOR hva = XMVectorSelect( rgb, hv, g_XMSelect1110 );
- return XMVectorSelect( s, hva, g_XMSelect1011 );
- }
- else
- {
- XMVECTOR h;
-
- if ( XMVector3Equal( r, v ) )
- {
- // Red is max
- h = XMVectorDivide( XMVectorSubtract( g, b ), d );
-
- if ( XMVector3Less( g, b ) )
- h = XMVectorAdd( h, g_XMSix );
- }
- else if ( XMVector3Equal( g, v ) )
- {
- // Green is max
- h = XMVectorDivide( XMVectorSubtract( b, r ), d );
- h = XMVectorAdd( h, g_XMTwo );
- }
- else
- {
- // Blue is max
- h = XMVectorDivide( XMVectorSubtract( r, g ), d );
- h = XMVectorAdd( h, g_XMFour );
- }
-
- h = XMVectorDivide( h, g_XMSix );
-
- XMVECTOR hv = XMVectorSelect( v, h, g_XMSelect1000 );
- XMVECTOR hva = XMVectorSelect( rgb, hv, g_XMSelect1110 );
- return XMVectorSelect( s, hva, g_XMSelect1011 );
- }
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMColorHSVToRGB( FXMVECTOR hsv )
-{
- XMVECTOR h = XMVectorSplatX( hsv );
- XMVECTOR s = XMVectorSplatY( hsv );
- XMVECTOR v = XMVectorSplatZ( hsv );
-
- XMVECTOR h6 = XMVectorMultiply( h, g_XMSix );
-
- XMVECTOR i = XMVectorFloor( h6 );
- XMVECTOR f = XMVectorSubtract( h6, i );
-
- // p = v* (1-s)
- XMVECTOR p = XMVectorMultiply( v, XMVectorSubtract( g_XMOne, s ) );
-
- // q = v*(1-f*s)
- XMVECTOR q = XMVectorMultiply( v, XMVectorSubtract( g_XMOne, XMVectorMultiply( f, s ) ) );
-
- // t = v*(1 - (1-f)*s)
- XMVECTOR t = XMVectorMultiply( v, XMVectorSubtract( g_XMOne, XMVectorMultiply( XMVectorSubtract( g_XMOne, f ), s ) ) );
-
- int ii = static_cast<int>( XMVectorGetX( XMVectorMod( i, g_XMSix ) ) );
-
- XMVECTOR _rgb;
-
- switch (ii)
- {
- case 0: // rgb = vtp
- {
- XMVECTOR vt = XMVectorSelect( t, v, g_XMSelect1000 );
- _rgb = XMVectorSelect( p, vt, g_XMSelect1100 );
- }
- break;
- case 1: // rgb = qvp
- {
- XMVECTOR qv = XMVectorSelect( v, q, g_XMSelect1000 );
- _rgb = XMVectorSelect( p, qv, g_XMSelect1100 );
- }
- break;
- case 2: // rgb = pvt
- {
- XMVECTOR pv = XMVectorSelect( v, p, g_XMSelect1000 );
- _rgb = XMVectorSelect( t, pv, g_XMSelect1100 );
- }
- break;
- case 3: // rgb = pqv
- {
- XMVECTOR pq = XMVectorSelect( q, p, g_XMSelect1000 );
- _rgb = XMVectorSelect( v, pq, g_XMSelect1100 );
- }
- break;
- case 4: // rgb = tpv
- {
- XMVECTOR tp = XMVectorSelect( p, t, g_XMSelect1000 );
- _rgb = XMVectorSelect( v, tp, g_XMSelect1100 );
- }
- break;
- default: // rgb = vpq
- {
- XMVECTOR vp = XMVectorSelect( p, v, g_XMSelect1000 );
-
_rgb = XMVectorSelect( q, vp, g_XMSelect1100 ); - } - break; - } - - return XMVectorSelect( hsv, _rgb, g_XMSelect1110 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorRGBToYUV( FXMVECTOR rgb ) -{ - static const XMVECTORF32 Scale0 = { 0.299f, -0.147f, 0.615f, 0.0f }; - static const XMVECTORF32 Scale1 = { 0.587f, -0.289f, -0.515f, 0.0f }; - static const XMVECTORF32 Scale2 = { 0.114f, 0.436f, -0.100f, 0.0f }; - - XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); - XMVECTOR clr = XMVector3Transform( rgb, M ); - - return XMVectorSelect( rgb, clr, g_XMSelect1110 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorYUVToRGB( FXMVECTOR yuv ) -{ - static const XMVECTORF32 Scale1 = { 0.0f, -0.395f, 2.032f, 0.0f }; - static const XMVECTORF32 Scale2 = { 1.140f, -0.581f, 0.0f, 0.0f }; - - XMMATRIX M( g_XMOne, Scale1, Scale2, g_XMZero ); - XMVECTOR clr = XMVector3Transform( yuv, M ); - - return XMVectorSelect( yuv, clr, g_XMSelect1110 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD( FXMVECTOR rgb ) -{ - static const XMVECTORF32 Scale0 = { 0.2126f, -0.0997f, 0.6150f, 0.0f }; - static const XMVECTORF32 Scale1 = { 0.7152f, -0.3354f, -0.5586f, 0.0f }; - static const XMVECTORF32 Scale2 = { 0.0722f, 0.4351f, -0.0564f, 0.0f }; - - XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); - XMVECTOR clr = XMVector3Transform( rgb, M ); - - return XMVectorSelect( rgb, clr, g_XMSelect1110 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorYUVToRGB_HD( FXMVECTOR yuv ) -{ - static const XMVECTORF32 Scale1 = { 0.0f, -0.2153f, 2.1324f, 0.0f }; - static const XMVECTORF32 Scale2 = { 1.2803f, -0.3806f, 0.0f, 0.0f }; - - XMMATRIX M( g_XMOne, Scale1, Scale2, g_XMZero ); - XMVECTOR clr = XMVector3Transform( yuv, M ); - - return XMVectorSelect( yuv, clr, g_XMSelect1110 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorRGBToXYZ( FXMVECTOR rgb ) -{ - static const XMVECTORF32 Scale0 = { 0.4887180f, 0.1762044f, 0.0000000f, 0.0f }; - static const XMVECTORF32 Scale1 = { 0.3106803f, 0.8129847f, 0.0102048f, 0.0f }; - static const XMVECTORF32 Scale2 = { 0.2006017f, 0.0108109f, 0.9897952f, 0.0f }; - static const XMVECTORF32 Scale = { 1.f/0.17697f, 1.f/0.17697f, 1.f/0.17697f, 0.0f }; - - XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); - XMVECTOR clr = XMVectorMultiply( XMVector3Transform( rgb, M ), Scale ); - - return XMVectorSelect( rgb, clr, g_XMSelect1110 ); -} - -inline XMVECTOR XM_CALLCONV XMColorXYZToRGB( FXMVECTOR xyz ) -{ - static const XMVECTORF32 Scale0 = { 2.3706743f, -0.5138850f, 0.0052982f, 0.0f }; - static const XMVECTORF32 Scale1 = { -0.9000405f, 1.4253036f, -0.0146949f, 0.0f }; - static const XMVECTORF32 Scale2 = { -0.4706338f, 0.0885814f, 1.0093968f, 0.0f }; - static const XMVECTORF32 Scale = { 0.17697f, 0.17697f, 0.17697f, 0.0f }; - - XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); - XMVECTOR clr = XMVector3Transform( XMVectorMultiply( xyz, Scale ), M ); - - return XMVectorSelect( xyz, clr, g_XMSelect1110 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorXYZToSRGB( FXMVECTOR xyz ) -{ - static const XMVECTORF32 Scale0 = { 3.2406f, -0.9689f, 0.0557f, 0.0f }; - 
static const XMVECTORF32 Scale1 = { -1.5372f, 1.8758f, -0.2040f, 0.0f }; - static const XMVECTORF32 Scale2 = { -0.4986f, 0.0415f, 1.0570f, 0.0f }; - static const XMVECTORF32 Cutoff = { 0.0031308f, 0.0031308f, 0.0031308f, 0.0f }; - static const XMVECTORF32 Exp = { 1.0f/2.4f, 1.0f/2.4f, 1.0f/2.4f, 1.0f }; - - XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); - XMVECTOR lclr = XMVector3Transform( xyz, M ); - - XMVECTOR sel = XMVectorGreater( lclr, Cutoff ); - - // clr = 12.92 * lclr for lclr <= 0.0031308f - XMVECTOR smallC = XMVectorMultiply( lclr, g_XMsrgbScale ); - - // clr = (1+a)*pow(lclr, 1/2.4) - a for lclr > 0.0031308 (where a = 0.055) - XMVECTOR largeC = XMVectorSubtract( XMVectorMultiply( g_XMsrgbA1, XMVectorPow( lclr, Exp ) ), g_XMsrgbA ); - - XMVECTOR clr = XMVectorSelect( smallC, largeC, sel ); - - return XMVectorSelect( xyz, clr, g_XMSelect1110 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorSRGBToXYZ( FXMVECTOR srgb ) -{ - static const XMVECTORF32 Scale0 = { 0.4124f, 0.2126f, 0.0193f, 0.0f }; - static const XMVECTORF32 Scale1 = { 0.3576f, 0.7152f, 0.1192f, 0.0f }; - static const XMVECTORF32 Scale2 = { 0.1805f, 0.0722f, 0.9505f, 0.0f }; - static const XMVECTORF32 Cutoff = { 0.04045f, 0.04045f, 0.04045f, 0.0f }; - static const XMVECTORF32 Exp = { 2.4f, 2.4f, 2.4f, 1.0f }; - - XMVECTOR sel = XMVectorGreater( srgb, Cutoff ); - - // lclr = clr / 12.92 - XMVECTOR smallC = XMVectorDivide( srgb, g_XMsrgbScale ); - - // lclr = pow( (clr + a) / (1+a), 2.4 ) - XMVECTOR largeC = XMVectorPow( XMVectorDivide( XMVectorAdd( srgb, g_XMsrgbA ), g_XMsrgbA1 ), Exp ); - - XMVECTOR lclr = XMVectorSelect( smallC, largeC, sel ); - - XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); - XMVECTOR clr = XMVector3Transform( lclr, M ); - - return XMVectorSelect( srgb, clr, g_XMSelect1110 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorRGBToSRGB( FXMVECTOR rgb ) -{ - static const XMVECTORF32 Cutoff = { 0.0031308f, 0.0031308f, 0.0031308f, 1.f }; - static const XMVECTORF32 Linear = { 12.92f, 12.92f, 12.92f, 1.f }; - static const XMVECTORF32 Scale = { 1.055f, 1.055f, 1.055f, 1.f }; - static const XMVECTORF32 Bias = { 0.055f, 0.055f, 0.055f, 0.f }; - static const XMVECTORF32 InvGamma = { 1.0f/2.4f, 1.0f/2.4f, 1.0f/2.4f, 1.f }; - - XMVECTOR V = XMVectorSaturate(rgb); - XMVECTOR V0 = XMVectorMultiply( V, Linear ); - XMVECTOR V1 = Scale * XMVectorPow( V, InvGamma ) - Bias; - XMVECTOR select = XMVectorLess( V, Cutoff ); - V = XMVectorSelect( V1, V0, select ); - return XMVectorSelect( rgb, V, g_XMSelect1110 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMColorSRGBToRGB( FXMVECTOR srgb ) -{ - static const XMVECTORF32 Cutoff = { 0.04045f, 0.04045f, 0.04045f, 1.f }; - static const XMVECTORF32 ILinear = { 1.f/12.92f, 1.f/12.92f, 1.f/12.92f, 1.f }; - static const XMVECTORF32 Scale = { 1.f/1.055f, 1.f/1.055f, 1.f/1.055f, 1.f }; - static const XMVECTORF32 Bias = { 0.055f, 0.055f, 0.055f, 0.f }; - static const XMVECTORF32 Gamma = { 2.4f, 2.4f, 2.4f, 1.f }; - - XMVECTOR V = XMVectorSaturate(srgb); - XMVECTOR V0 = XMVectorMultiply( V, ILinear ); - XMVECTOR V1 = XMVectorPow( (V + Bias) * Scale, Gamma ); - XMVECTOR select = XMVectorGreater( V, Cutoff ); - V = XMVectorSelect( V0, V1, select ); - return XMVectorSelect( srgb, V, g_XMSelect1110 ); -} - 
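-// Illustrative round trip for the two helpers above (a sketch; the variable
-// names and input values are arbitrary):
-//
-//   XMVECTOR linear  = XMVectorSet(0.5f, 0.25f, 0.125f, 1.f);
-//   XMVECTOR encoded = XMColorRGBToSRGB(linear);   // gamma-encodes r,g,b; w kept
-//   XMVECTOR decoded = XMColorSRGBToRGB(encoded);  // decoded ~= linear
-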
-/**************************************************************************** - * - * Miscellaneous - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline bool XMVerifyCPUSupport() -{ -#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - int CPUInfo[4] = { -1 }; - __cpuid(CPUInfo, 0); - -#ifdef __AVX2__ - if (CPUInfo[0] < 7) - return false; -#else - if (CPUInfo[0] < 1) - return false; -#endif - - __cpuid(CPUInfo, 1); - -#ifdef __AVX2__ - // The compiler can emit FMA3 instructions even without explicit intrinsics use - if ((CPUInfo[2] & 0x38081001) != 0x38081001) - return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support -#elif defined(_XM_F16C_INTRINSICS_) - if ((CPUInfo[2] & 0x38080001) != 0x38080001) - return false; // No F16C/AVX/OSXSAVE/SSE4.1/SSE3 support -#elif defined(__AVX__) || defined(_XM_AVX_INTRINSICS_) - if ((CPUInfo[2] & 0x18080001) != 0x18080001) - return false; // No AVX/OSXSAVE/SSE4.1/SSE3 support -#elif defined(_XM_SSE4_INTRINSICS_) - if ((CPUInfo[2] & 0x80001) != 0x80001) - return false; // No SSE3/SSE4.1 support -#elif defined(_XM_SSE3_INTRINSICS_) - if (!(CPUInfo[2] & 0x1)) - return false; // No SSE3 support -#endif - - // The x64 processor model requires SSE2 support, but no harm in checking - if ((CPUInfo[3] & 0x6000000) != 0x6000000) - return false; // No SSE2/SSE support - -#ifdef __AVX2__ - __cpuidex(CPUInfo, 7, 0); - if (!(CPUInfo[1] & 0x20)) - return false; // No AVX2 support -#endif - - return true; -#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - // ARM-NEON support is required for the Windows on ARM platform - return true; -#else - // No intrinsics path always supported - return true; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMFresnelTerm -( - FXMVECTOR CosIncidentAngle, - FXMVECTOR RefractionIndex -) -{ - assert(!XMVector4IsInfinite(CosIncidentAngle)); - - // Result = 0.5f * (g - c)^2 / (g + c)^2 * ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) where - // c = CosIncidentAngle - // g = sqrt(c^2 + RefractionIndex^2 - 1) - -#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) - - XMVECTOR G = XMVectorMultiplyAdd(RefractionIndex, RefractionIndex, g_XMNegativeOne.v); - G = XMVectorMultiplyAdd(CosIncidentAngle, CosIncidentAngle, G); - G = XMVectorAbs(G); - G = XMVectorSqrt(G); - - XMVECTOR S = XMVectorAdd(G, CosIncidentAngle); - XMVECTOR D = XMVectorSubtract(G, CosIncidentAngle); - - XMVECTOR V0 = XMVectorMultiply(D, D); - XMVECTOR V1 = XMVectorMultiply(S, S); - V1 = XMVectorReciprocal(V1); - V0 = XMVectorMultiply(g_XMOneHalf.v, V0); - V0 = XMVectorMultiply(V0, V1); - - XMVECTOR V2 = XMVectorMultiplyAdd(CosIncidentAngle, S, g_XMNegativeOne.v); - XMVECTOR V3 = XMVectorMultiplyAdd(CosIncidentAngle, D, g_XMOne.v); - V2 = XMVectorMultiply(V2, V2); - V3 = XMVectorMultiply(V3, V3); - V3 = XMVectorReciprocal(V3); - V2 = XMVectorMultiplyAdd(V2, V3, g_XMOne.v); - - XMVECTOR Result = XMVectorMultiply(V0, V2); - - Result = XMVectorSaturate(Result); - - return Result; - -#elif defined(_XM_SSE_INTRINSICS_) - // G = sqrt(abs((RefractionIndex^2-1) + CosIncidentAngle^2)) - XMVECTOR G = _mm_mul_ps(RefractionIndex,RefractionIndex); - XMVECTOR vTemp = _mm_mul_ps(CosIncidentAngle,CosIncidentAngle); - G = _mm_sub_ps(G,g_XMOne); - vTemp = _mm_add_ps(vTemp,G); - // max((0-vTemp),vTemp) == abs(vTemp) - // The abs is 
needed to deal with refraction and cosine being zero - G = _mm_setzero_ps(); - G = _mm_sub_ps(G,vTemp); - G = _mm_max_ps(G,vTemp); - // Last operation, the sqrt() - G = _mm_sqrt_ps(G); - - // Calc G-C and G+C - XMVECTOR GAddC = _mm_add_ps(G,CosIncidentAngle); - XMVECTOR GSubC = _mm_sub_ps(G,CosIncidentAngle); - // Perform the term (0.5f *(g - c)^2) / (g + c)^2 - XMVECTOR vResult = _mm_mul_ps(GSubC,GSubC); - vTemp = _mm_mul_ps(GAddC,GAddC); - vResult = _mm_mul_ps(vResult,g_XMOneHalf); - vResult = _mm_div_ps(vResult,vTemp); - // Perform the term ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) - GAddC = _mm_mul_ps(GAddC,CosIncidentAngle); - GSubC = _mm_mul_ps(GSubC,CosIncidentAngle); - GAddC = _mm_sub_ps(GAddC,g_XMOne); - GSubC = _mm_add_ps(GSubC,g_XMOne); - GAddC = _mm_mul_ps(GAddC,GAddC); - GSubC = _mm_mul_ps(GSubC,GSubC); - GAddC = _mm_div_ps(GAddC,GSubC); - GAddC = _mm_add_ps(GAddC,g_XMOne); - // Multiply the two term parts - vResult = _mm_mul_ps(vResult,GAddC); - // Clamp to 0.0 - 1.0f - vResult = _mm_max_ps(vResult,g_XMZero); - vResult = _mm_min_ps(vResult,g_XMOne); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XMScalarNearEqual -( - float S1, - float S2, - float Epsilon -) -{ - float Delta = S1 - S2; - return (fabsf(Delta) <= Epsilon); -} - -//------------------------------------------------------------------------------ -// Modulo the range of the given angle such that -XM_PI <= Angle < XM_PI -inline float XMScalarModAngle -( - float Angle -) -{ - // Note: The modulo is performed with unsigned math only to work - // around a precision error on numbers that are close to PI - - // Normalize the range from 0.0f to XM_2PI - Angle = Angle + XM_PI; - // Perform the modulo, unsigned - float fTemp = fabsf(Angle); - fTemp = fTemp - (XM_2PI * (float)((int32_t)(fTemp/XM_2PI))); - // Restore the number to the range of -XM_PI to XM_PI-epsilon - fTemp = fTemp - XM_PI; - // If the modulo'd value was negative, restore negation - if (Angle<0.0f) { - fTemp = -fTemp; - } - return fTemp; -} - -//------------------------------------------------------------------------------ - -inline float XMScalarSin -( - float Value -) -{ - // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. - float quotient = XM_1DIV2PI*Value; - if (Value >= 0.0f) - { - quotient = (float)((int)(quotient + 0.5f)); - } - else - { - quotient = (float)((int)(quotient - 0.5f)); - } - float y = Value - XM_2PI*quotient; - - // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). - if (y > XM_PIDIV2) - { - y = XM_PI - y; - } - else if (y < -XM_PIDIV2) - { - y = -XM_PI - y; - } - - // 11-degree minimax approximation - float y2 = y * y; - return ( ( ( ( (-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f ) * y2 + 0.0083333310f ) * y2 - 0.16666667f ) * y2 + 1.0f ) * y; -} - -//------------------------------------------------------------------------------ - -inline float XMScalarSinEst -( - float Value -) -{ - // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. - float quotient = XM_1DIV2PI*Value; - if (Value >= 0.0f) - { - quotient = (float)((int)(quotient + 0.5f)); - } - else - { - quotient = (float)((int)(quotient - 0.5f)); - } - float y = Value - XM_2PI*quotient; - - // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). 
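- // e.g. Value = 4: the quotient rounds to 1, so y = 4 - 2*pi ~= -2.283; since
- // y < -pi/2, it maps to y = -pi - y ~= -0.858, and sin(-0.858) ~= sin(4).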
- if (y > XM_PIDIV2) - { - y = XM_PI - y; - } - else if (y < -XM_PIDIV2) - { - y = -XM_PI - y; - } - - // 7-degree minimax approximation - float y2 = y * y; - return ( ( ( -0.00018524670f * y2 + 0.0083139502f ) * y2 - 0.16665852f ) * y2 + 1.0f ) * y; -} - -//------------------------------------------------------------------------------ - -inline float XMScalarCos -( - float Value -) -{ - // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. - float quotient = XM_1DIV2PI*Value; - if (Value >= 0.0f) - { - quotient = (float)((int)(quotient + 0.5f)); - } - else - { - quotient = (float)((int)(quotient - 0.5f)); - } - float y = Value - XM_2PI*quotient; - - // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x). - float sign; - if (y > XM_PIDIV2) - { - y = XM_PI - y; - sign = -1.0f; - } - else if (y < -XM_PIDIV2) - { - y = -XM_PI - y; - sign = -1.0f; - } - else - { - sign = +1.0f; - } - - // 10-degree minimax approximation - float y2 = y*y; - float p = ( ( ( ( -2.6051615e-07f * y2 + 2.4760495e-05f ) * y2 - 0.0013888378f ) * y2 + 0.041666638f ) * y2 - 0.5f ) * y2 + 1.0f; - return sign*p; -} - -//------------------------------------------------------------------------------ - -inline float XMScalarCosEst -( - float Value -) -{ - // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. - float quotient = XM_1DIV2PI*Value; - if (Value >= 0.0f) - { - quotient = (float)((int)(quotient + 0.5f)); - } - else - { - quotient = (float)((int)(quotient - 0.5f)); - } - float y = Value - XM_2PI*quotient; - - // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x). - float sign; - if (y > XM_PIDIV2) - { - y = XM_PI - y; - sign = -1.0f; - } - else if (y < -XM_PIDIV2) - { - y = -XM_PI - y; - sign = -1.0f; - } - else - { - sign = +1.0f; - } - - // 6-degree minimax approximation - float y2 = y * y; - float p = ( ( -0.0012712436f * y2 + 0.041493919f ) * y2 - 0.49992746f ) * y2 + 1.0f; - return sign*p; -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline void XMScalarSinCos -( - float* pSin, - float* pCos, - float Value -) -{ - assert(pSin); - assert(pCos); - - // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. - float quotient = XM_1DIV2PI*Value; - if (Value >= 0.0f) - { - quotient = (float)((int)(quotient + 0.5f)); - } - else - { - quotient = (float)((int)(quotient - 0.5f)); - } - float y = Value - XM_2PI*quotient; - - // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). - float sign; - if (y > XM_PIDIV2) - { - y = XM_PI - y; - sign = -1.0f; - } - else if (y < -XM_PIDIV2) - { - y = -XM_PI - y; - sign = -1.0f; - } - else - { - sign = +1.0f; - } - - float y2 = y * y; - - // 11-degree minimax approximation - *pSin = ( ( ( ( (-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f ) * y2 + 0.0083333310f ) * y2 - 0.16666667f ) * y2 + 1.0f ) * y; - - // 10-degree minimax approximation - float p = ( ( ( ( -2.6051615e-07f * y2 + 2.4760495e-05f ) * y2 - 0.0013888378f ) * y2 + 0.041666638f ) * y2 - 0.5f ) * y2 + 1.0f; - *pCos = sign*p; -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline void XMScalarSinCosEst -( - float* pSin, - float* pCos, - float Value -) -{ - assert(pSin); - assert(pCos); - - // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. 
- float quotient = XM_1DIV2PI*Value; - if (Value >= 0.0f) - { - quotient = (float)((int)(quotient + 0.5f)); - } - else - { - quotient = (float)((int)(quotient - 0.5f)); - } - float y = Value - XM_2PI*quotient; - - // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). - float sign; - if (y > XM_PIDIV2) - { - y = XM_PI - y; - sign = -1.0f; - } - else if (y < -XM_PIDIV2) - { - y = -XM_PI - y; - sign = -1.0f; - } - else - { - sign = +1.0f; - } - - float y2 = y * y; - - // 7-degree minimax approximation - *pSin = ( ( ( -0.00018524670f * y2 + 0.0083139502f ) * y2 - 0.16665852f ) * y2 + 1.0f ) * y; - - // 6-degree minimax approximation - float p = ( ( -0.0012712436f * y2 + 0.041493919f ) * y2 - 0.49992746f ) * y2 + 1.0f; - *pCos = sign*p; -} - -//------------------------------------------------------------------------------ - -inline float XMScalarASin -( - float Value -) -{ - // Clamp input to [-1,1]. - bool nonnegative = (Value >= 0.0f); - float x = fabsf(Value); - float omx = 1.0f - x; - if (omx < 0.0f) - { - omx = 0.0f; - } - float root = sqrtf(omx); - - // 7-degree minimax approximation - float result = ( ( ( ( ( ( -0.0012624911f * x + 0.0066700901f ) * x - 0.0170881256f ) * x + 0.0308918810f ) * x - 0.0501743046f ) * x + 0.0889789874f ) * x - 0.2145988016f ) * x + 1.5707963050f; - result *= root; // acos(|x|) - - // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x) - return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2); -} - -//------------------------------------------------------------------------------ - -inline float XMScalarASinEst -( - float Value -) -{ - // Clamp input to [-1,1]. - bool nonnegative = (Value >= 0.0f); - float x = fabsf(Value); - float omx = 1.0f - x; - if (omx < 0.0f) - { - omx = 0.0f; - } - float root = sqrtf(omx); - - // 3-degree minimax approximation - float result = ((-0.0187293f*x+0.0742610f)*x-0.2121144f)*x+1.5707288f; - result *= root; // acos(|x|) - - // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x) - return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2); -} - -//------------------------------------------------------------------------------ - -inline float XMScalarACos -( - float Value -) -{ - // Clamp input to [-1,1]. - bool nonnegative = (Value >= 0.0f); - float x = fabsf(Value); - float omx = 1.0f - x; - if (omx < 0.0f) - { - omx = 0.0f; - } - float root = sqrtf(omx); - - // 7-degree minimax approximation - float result = ( ( ( ( ( ( -0.0012624911f * x + 0.0066700901f ) * x - 0.0170881256f ) * x + 0.0308918810f ) * x - 0.0501743046f ) * x + 0.0889789874f ) * x - 0.2145988016f ) * x + 1.5707963050f; - result *= root; - - // acos(x) = pi - acos(-x) when x < 0 - return (nonnegative ? result : XM_PI - result); -} - -//------------------------------------------------------------------------------ - -inline float XMScalarACosEst -( - float Value -) -{ - // Clamp input to [-1,1]. - bool nonnegative = (Value >= 0.0f); - float x = fabsf(Value); - float omx = 1.0f - x; - if (omx < 0.0f) - { - omx = 0.0f; - } - float root = sqrtf(omx); - - // 3-degree minimax approximation - float result = ( ( -0.0187293f * x + 0.0742610f ) * x - 0.2121144f ) * x + 1.5707288f; - result *= root; - - // acos(x) = pi - acos(-x) when x < 0 - return (nonnegative ? 
result : XM_PI - result); -} - +//------------------------------------------------------------------------------------- +// DirectXMathMisc.inl -- SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +/**************************************************************************** + * + * Quaternion + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionEqual +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) +{ + return XMVector4Equal(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionNotEqual +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) +{ + return XMVector4NotEqual(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionIsNaN +( + FXMVECTOR Q +) +{ + return XMVector4IsNaN(Q); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionIsInfinite +( + FXMVECTOR Q +) +{ + return XMVector4IsInfinite(Q); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMQuaternionIsIdentity +( + FXMVECTOR Q +) +{ + return XMVector4Equal(Q, g_XMIdentityR3.v); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionDot +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) +{ + return XMVector4Dot(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionMultiply +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) +{ + // Returns the product Q2*Q1 (which is the concatenation of a rotation Q1 followed by the rotation Q2) + + // [ (Q2.w * Q1.x) + (Q2.x * Q1.w) + (Q2.y * Q1.z) - (Q2.z * Q1.y), + // (Q2.w * Q1.y) - (Q2.x * Q1.z) + (Q2.y * Q1.w) + (Q2.z * Q1.x), + // (Q2.w * Q1.z) + (Q2.x * Q1.y) - (Q2.y * Q1.x) + (Q2.z * Q1.w), + // (Q2.w * Q1.w) - (Q2.x * Q1.x) - (Q2.y * Q1.y) - (Q2.z * Q1.z) ] + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result = { + (Q2.vector4_f32[3] * Q1.vector4_f32[0]) + (Q2.vector4_f32[0] * Q1.vector4_f32[3]) + (Q2.vector4_f32[1] * Q1.vector4_f32[2]) - (Q2.vector4_f32[2] * Q1.vector4_f32[1]), + (Q2.vector4_f32[3] * Q1.vector4_f32[1]) - (Q2.vector4_f32[0] * Q1.vector4_f32[2]) + (Q2.vector4_f32[1] * Q1.vector4_f32[3]) + (Q2.vector4_f32[2] * Q1.vector4_f32[0]), + (Q2.vector4_f32[3] * Q1.vector4_f32[2]) + (Q2.vector4_f32[0] * Q1.vector4_f32[1]) - (Q2.vector4_f32[1] * Q1.vector4_f32[0]) + (Q2.vector4_f32[2] * Q1.vector4_f32[3]), + (Q2.vector4_f32[3] * 
Q1.vector4_f32[3]) - (Q2.vector4_f32[0] * Q1.vector4_f32[0]) - (Q2.vector4_f32[1] * Q1.vector4_f32[1]) - (Q2.vector4_f32[2] * Q1.vector4_f32[2]) }; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ControlWZYX = { 1.0f,-1.0f, 1.0f,-1.0f}; + static const XMVECTORF32 ControlZWXY = { 1.0f, 1.0f,-1.0f,-1.0f}; + static const XMVECTORF32 ControlYXWZ = {-1.0f, 1.0f, 1.0f,-1.0f}; + + float32x2_t Q2L = vget_low_f32(Q2); + float32x2_t Q2H = vget_high_f32(Q2); + + float32x4_t Q2X = vdupq_lane_f32( Q2L, 0 ); + float32x4_t Q2Y = vdupq_lane_f32( Q2L, 1 ); + float32x4_t Q2Z = vdupq_lane_f32( Q2H, 0 ); + XMVECTOR vResult = vmulq_lane_f32(Q1, Q2H, 1); + + // Mul by Q1WZYX + float32x4_t vTemp = vrev64q_f32(Q1); + vTemp = vcombine_f32( vget_high_f32(vTemp), vget_low_f32(vTemp) ); + Q2X = vmulq_f32(Q2X,vTemp); + vResult = vmlaq_f32( vResult, Q2X, ControlWZYX ); + + // Mul by Q1ZWXY + vTemp = vrev64q_u32(vTemp); + Q2Y = vmulq_f32(Q2Y,vTemp); + vResult = vmlaq_f32(vResult, Q2Y, ControlZWXY); + + // Mul by Q1YXWZ + vTemp = vrev64q_u32(vTemp); + vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp)); + Q2Z = vmulq_f32(Q2Z,vTemp); + vResult = vmlaq_f32(vResult, Q2Z, ControlYXWZ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ControlWZYX = { 1.0f,-1.0f, 1.0f,-1.0f}; + static const XMVECTORF32 ControlZWXY = { 1.0f, 1.0f,-1.0f,-1.0f}; + static const XMVECTORF32 ControlYXWZ = {-1.0f, 1.0f, 1.0f,-1.0f}; + // Copy to SSE registers and use as few as possible for x86 + XMVECTOR Q2X = Q2; + XMVECTOR Q2Y = Q2; + XMVECTOR Q2Z = Q2; + XMVECTOR vResult = Q2; + // Splat with one instruction + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,3,3,3)); + Q2X = XM_PERMUTE_PS(Q2X,_MM_SHUFFLE(0,0,0,0)); + Q2Y = XM_PERMUTE_PS(Q2Y,_MM_SHUFFLE(1,1,1,1)); + Q2Z = XM_PERMUTE_PS(Q2Z,_MM_SHUFFLE(2,2,2,2)); + // Retire Q1 and perform Q1*Q2W + vResult = _mm_mul_ps(vResult,Q1); + XMVECTOR Q1Shuffle = Q1; + // Shuffle the copies of Q1 + Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(0,1,2,3)); + // Mul by Q1WZYX + Q2X = _mm_mul_ps(Q2X,Q1Shuffle); + Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(2,3,0,1)); + // Flip the signs on y and z + Q2X = _mm_mul_ps(Q2X,ControlWZYX); + // Mul by Q1ZWXY + Q2Y = _mm_mul_ps(Q2Y,Q1Shuffle); + Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(0,1,2,3)); + // Flip the signs on z and w + Q2Y = _mm_mul_ps(Q2Y,ControlZWXY); + // Mul by Q1YXWZ + Q2Z = _mm_mul_ps(Q2Z,Q1Shuffle); + vResult = _mm_add_ps(vResult,Q2X); + // Flip the signs on x and w + Q2Z = _mm_mul_ps(Q2Z,ControlYXWZ); + Q2Y = _mm_add_ps(Q2Y,Q2Z); + vResult = _mm_add_ps(vResult,Q2Y); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionLengthSq +( + FXMVECTOR Q +) +{ + return XMVector4LengthSq(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength +( + FXMVECTOR Q +) +{ + return XMVector4ReciprocalLength(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionLength +( + FXMVECTOR Q +) +{ + return XMVector4Length(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst +( + FXMVECTOR Q +) +{ + return XMVector4NormalizeEst(Q); +} + +//------------------------------------------------------------------------------ + 
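+// Usage sketch (illustrative): the Est form trades accuracy for speed, so a
+// quaternion that feeds further math is better renormalized with the exact
+// XMQuaternionNormalize below. For two arbitrary unit quaternions q1 and q2:
+//
+//     XMVECTOR q = XMQuaternionMultiply(q1, q2);  // rotate by q1, then q2
+//     q = XMQuaternionNormalize(q);               // re-unitize before reuse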
+inline XMVECTOR XM_CALLCONV XMQuaternionNormalize +( + FXMVECTOR Q +) +{ + return XMVector4Normalize(Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionConjugate +( + FXMVECTOR Q +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result = { + -Q.vector4_f32[0], + -Q.vector4_f32[1], + -Q.vector4_f32[2], + Q.vector4_f32[3] + }; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 NegativeOne3 = {-1.0f,-1.0f,-1.0f,1.0f}; + return vmulq_f32(Q, NegativeOne3.v ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 NegativeOne3 = {-1.0f,-1.0f,-1.0f,1.0f}; + return _mm_mul_ps(Q,NegativeOne3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionInverse +( + FXMVECTOR Q +) +{ + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR L = XMVector4LengthSq(Q); + XMVECTOR Conjugate = XMQuaternionConjugate(Q); + + XMVECTOR Control = XMVectorLessOrEqual(L, g_XMEpsilon.v); + + XMVECTOR Result = XMVectorDivide(Conjugate, L); + + Result = XMVectorSelect(Result, Zero, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionLn +( + FXMVECTOR Q +) +{ + static const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f}; + + XMVECTOR QW = XMVectorSplatW(Q); + XMVECTOR Q0 = XMVectorSelect(g_XMSelect1110.v, Q, g_XMSelect1110.v); + + XMVECTOR ControlW = XMVectorInBounds(QW, OneMinusEpsilon.v); + + XMVECTOR Theta = XMVectorACos(QW); + XMVECTOR SinTheta = XMVectorSin(Theta); + + XMVECTOR S = XMVectorDivide(Theta,SinTheta); + + XMVECTOR Result = XMVectorMultiply(Q0, S); + Result = XMVectorSelect(Q0, Result, ControlW); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionExp +( + FXMVECTOR Q +) +{ + XMVECTOR Theta = XMVector3Length(Q); + + XMVECTOR SinTheta, CosTheta; + XMVectorSinCos(&SinTheta, &CosTheta, Theta); + + XMVECTOR S = XMVectorDivide(SinTheta, Theta); + + XMVECTOR Result = XMVectorMultiply(Q, S); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorNearEqual(Theta, Zero, g_XMEpsilon.v); + Result = XMVectorSelect(Result, Q, Control); + + Result = XMVectorSelect(CosTheta, Result, g_XMSelect1110.v); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSlerp +( + FXMVECTOR Q0, + FXMVECTOR Q1, + float t +) +{ + XMVECTOR T = XMVectorReplicate(t); + return XMQuaternionSlerpV(Q0, Q1, T); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSlerpV +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR T +) +{ + assert((XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T))); + + // Result = Q0 * sin((1.0 - t) * Omega) / sin(Omega) + Q1 * sin(t * Omega) / sin(Omega) + +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f}; + + XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorLess(CosOmega, Zero); + XMVECTOR Sign = XMVectorSelect(g_XMOne.v, 
g_XMNegativeOne.v, Control); + + CosOmega = XMVectorMultiply(CosOmega, Sign); + + Control = XMVectorLess(CosOmega, OneMinusEpsilon); + + XMVECTOR SinOmega = XMVectorNegativeMultiplySubtract(CosOmega, CosOmega, g_XMOne.v); + SinOmega = XMVectorSqrt(SinOmega); + + XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega); + + XMVECTOR SignMask = XMVectorSplatSignMask(); + XMVECTOR V01 = XMVectorShiftLeft(T, Zero, 2); + SignMask = XMVectorShiftLeft(SignMask, Zero, 3); + V01 = XMVectorXorInt(V01, SignMask); + V01 = XMVectorAdd(g_XMIdentityR0.v, V01); + + XMVECTOR InvSinOmega = XMVectorReciprocal(SinOmega); + + XMVECTOR S0 = XMVectorMultiply(V01, Omega); + S0 = XMVectorSin(S0); + S0 = XMVectorMultiply(S0, InvSinOmega); + + S0 = XMVectorSelect(V01, S0, Control); + + XMVECTOR S1 = XMVectorSplatY(S0); + S0 = XMVectorSplatX(S0); + + S1 = XMVectorMultiply(S1, Sign); + + XMVECTOR Result = XMVectorMultiply(Q0, S0); + Result = XMVectorMultiplyAdd(Q1, S1, Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f}; + static const XMVECTORU32 SignMask2 = {0x80000000,0x00000000,0x00000000,0x00000000}; + + XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1); + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR Control = XMVectorLess(CosOmega, Zero); + XMVECTOR Sign = XMVectorSelect(g_XMOne, g_XMNegativeOne, Control); + + CosOmega = _mm_mul_ps(CosOmega, Sign); + + Control = XMVectorLess(CosOmega, OneMinusEpsilon); + + XMVECTOR SinOmega = _mm_mul_ps(CosOmega,CosOmega); + SinOmega = _mm_sub_ps(g_XMOne,SinOmega); + SinOmega = _mm_sqrt_ps(SinOmega); + + XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega); + + XMVECTOR V01 = XM_PERMUTE_PS(T,_MM_SHUFFLE(2,3,0,1)); + V01 = _mm_and_ps(V01,g_XMMaskXY); + V01 = _mm_xor_ps(V01,SignMask2); + V01 = _mm_add_ps(g_XMIdentityR0, V01); + + XMVECTOR S0 = _mm_mul_ps(V01, Omega); + S0 = XMVectorSin(S0); + S0 = _mm_div_ps(S0, SinOmega); + + S0 = XMVectorSelect(V01, S0, Control); + + XMVECTOR S1 = XMVectorSplatY(S0); + S0 = XMVectorSplatX(S0); + + S1 = _mm_mul_ps(S1, Sign); + XMVECTOR Result = _mm_mul_ps(Q0, S0); + S1 = _mm_mul_ps(S1, Q1); + Result = _mm_add_ps(Result,S1); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSquad +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR Q3, + float t +) +{ + XMVECTOR T = XMVectorReplicate(t); + return XMQuaternionSquadV(Q0, Q1, Q2, Q3, T); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionSquadV +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR Q3, + HXMVECTOR T +) +{ + assert( (XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T)) ); + + XMVECTOR TP = T; + const XMVECTOR Two = XMVectorSplatConstant(2, 0); + + XMVECTOR Q03 = XMQuaternionSlerpV(Q0, Q3, T); + XMVECTOR Q12 = XMQuaternionSlerpV(Q1, Q2, T); + + TP = XMVectorNegativeMultiplySubtract(TP, TP, TP); + TP = XMVectorMultiply(TP, Two); + + XMVECTOR Result = XMQuaternionSlerpV(Q03, Q12, TP); + + return Result; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMQuaternionSquadSetup +( + XMVECTOR* pA, + XMVECTOR* pB, + XMVECTOR* pC, + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR Q3 +) +{ + assert(pA); + assert(pB); + 
assert(pC); + + XMVECTOR LS12 = XMQuaternionLengthSq(XMVectorAdd(Q1, Q2)); + XMVECTOR LD12 = XMQuaternionLengthSq(XMVectorSubtract(Q1, Q2)); + XMVECTOR SQ2 = XMVectorNegate(Q2); + + XMVECTOR Control1 = XMVectorLess(LS12, LD12); + SQ2 = XMVectorSelect(Q2, SQ2, Control1); + + XMVECTOR LS01 = XMQuaternionLengthSq(XMVectorAdd(Q0, Q1)); + XMVECTOR LD01 = XMQuaternionLengthSq(XMVectorSubtract(Q0, Q1)); + XMVECTOR SQ0 = XMVectorNegate(Q0); + + XMVECTOR LS23 = XMQuaternionLengthSq(XMVectorAdd(SQ2, Q3)); + XMVECTOR LD23 = XMQuaternionLengthSq(XMVectorSubtract(SQ2, Q3)); + XMVECTOR SQ3 = XMVectorNegate(Q3); + + XMVECTOR Control0 = XMVectorLess(LS01, LD01); + XMVECTOR Control2 = XMVectorLess(LS23, LD23); + + SQ0 = XMVectorSelect(Q0, SQ0, Control0); + SQ3 = XMVectorSelect(Q3, SQ3, Control2); + + XMVECTOR InvQ1 = XMQuaternionInverse(Q1); + XMVECTOR InvQ2 = XMQuaternionInverse(SQ2); + + XMVECTOR LnQ0 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ0)); + XMVECTOR LnQ2 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ2)); + XMVECTOR LnQ1 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, Q1)); + XMVECTOR LnQ3 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, SQ3)); + + const XMVECTOR NegativeOneQuarter = XMVectorSplatConstant(-1, 2); + + XMVECTOR ExpQ02 = XMVectorMultiply(XMVectorAdd(LnQ0, LnQ2), NegativeOneQuarter); + XMVECTOR ExpQ13 = XMVectorMultiply(XMVectorAdd(LnQ1, LnQ3), NegativeOneQuarter); + ExpQ02 = XMQuaternionExp(ExpQ02); + ExpQ13 = XMQuaternionExp(ExpQ13); + + *pA = XMQuaternionMultiply(Q1, ExpQ02); + *pB = XMQuaternionMultiply(SQ2, ExpQ13); + *pC = SQ2; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentric +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + float f, + float g +) +{ + float s = f + g; + + XMVECTOR Result; + if ((s < 0.00001f) && (s > -0.00001f)) + { + Result = Q0; + } + else + { + XMVECTOR Q01 = XMQuaternionSlerp(Q0, Q1, s); + XMVECTOR Q02 = XMQuaternionSlerp(Q0, Q2, s); + + Result = XMQuaternionSlerp(Q01, Q02, g / s); + } + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV +( + FXMVECTOR Q0, + FXMVECTOR Q1, + FXMVECTOR Q2, + GXMVECTOR F, + HXMVECTOR G +) +{ + assert( (XMVectorGetY(F) == XMVectorGetX(F)) && (XMVectorGetZ(F) == XMVectorGetX(F)) && (XMVectorGetW(F) == XMVectorGetX(F)) ); + assert( (XMVectorGetY(G) == XMVectorGetX(G)) && (XMVectorGetZ(G) == XMVectorGetX(G)) && (XMVectorGetW(G) == XMVectorGetX(G)) ); + + const XMVECTOR Epsilon = XMVectorSplatConstant(1, 16); + + XMVECTOR S = XMVectorAdd(F, G); + + XMVECTOR Result; + if (XMVector4InBounds(S, Epsilon)) + { + Result = Q0; + } + else + { + XMVECTOR Q01 = XMQuaternionSlerpV(Q0, Q1, S); + XMVECTOR Q02 = XMQuaternionSlerpV(Q0, Q2, S); + XMVECTOR GS = XMVectorReciprocal(S); + GS = XMVectorMultiply(G, GS); + + Result = XMQuaternionSlerpV(Q01, Q02, GS); + } + + return Result; +} + +//------------------------------------------------------------------------------ +// Transformation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionIdentity() +{ + return g_XMIdentityR3.v; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYaw +( + float Pitch, + float Yaw, + float Roll +) +{ + 
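// Pitch is rotation about the X axis, Yaw about the Y axis, and Roll about
+    // the Z axis, all in radians; the rotations are applied roll-first, then
+    // pitch, then yaw.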
+    XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f);
+    XMVECTOR Q = XMQuaternionRotationRollPitchYawFromVector(Angles);
+    return Q;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYawFromVector
+(
+    FXMVECTOR Angles // <Pitch, Yaw, Roll, 0>
+)
+{
+    static const XMVECTORF32 Sign = {1.0f, -1.0f, -1.0f, 1.0f};
+
+    XMVECTOR HalfAngles = XMVectorMultiply(Angles, g_XMOneHalf.v);
+
+    XMVECTOR SinAngles, CosAngles;
+    XMVectorSinCos(&SinAngles, &CosAngles, HalfAngles);
+
+    XMVECTOR P0 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(SinAngles, CosAngles);
+    XMVECTOR Y0 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(SinAngles, CosAngles);
+    XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(SinAngles, CosAngles);
+    XMVECTOR P1 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(CosAngles, SinAngles);
+    XMVECTOR Y1 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(CosAngles, SinAngles);
+    XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(CosAngles, SinAngles);
+
+    XMVECTOR Q1 = XMVectorMultiply(P1, Sign.v);
+    XMVECTOR Q0 = XMVectorMultiply(P0, Y0);
+    Q1 = XMVectorMultiply(Q1, Y1);
+    Q0 = XMVectorMultiply(Q0, R0);
+    XMVECTOR Q = XMVectorMultiplyAdd(Q1, R1, Q0);
+
+    return Q;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationNormal
+(
+    FXMVECTOR NormalAxis,
+    float Angle
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    XMVECTOR N = XMVectorSelect(g_XMOne.v, NormalAxis, g_XMSelect1110.v);
+
+    float SinV, CosV;
+    XMScalarSinCos(&SinV, &CosV, 0.5f * Angle);
+
+    XMVECTOR Scale = XMVectorSet( SinV, SinV, SinV, CosV );
+    return XMVectorMultiply(N, Scale);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR N = _mm_and_ps(NormalAxis,g_XMMask3);
+    N = _mm_or_ps(N,g_XMIdentityR3);
+    XMVECTOR Scale = _mm_set_ps1(0.5f * Angle);
+    XMVECTOR vSine;
+    XMVECTOR vCosine;
+    XMVectorSinCos(&vSine,&vCosine,Scale);
+    Scale = _mm_and_ps(vSine,g_XMMask3);
+    vCosine = _mm_and_ps(vCosine,g_XMMaskW);
+    Scale = _mm_or_ps(Scale,vCosine);
+    N = _mm_mul_ps(N,Scale);
+    return N;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationAxis
+(
+    FXMVECTOR Axis,
+    float Angle
+)
+{
+    assert(!XMVector3Equal(Axis, XMVectorZero()));
+    assert(!XMVector3IsInfinite(Axis));
+
+    XMVECTOR Normal = XMVector3Normalize(Axis);
+    XMVECTOR Q = XMQuaternionRotationNormal(Normal, Angle);
+    return Q;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix
+(
+    FXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTORF32 q;
+    float r22 = M.m[2][2];
+    if (r22 <= 0.f) // x^2 + y^2 >= z^2 + w^2
+    {
+        float dif10 = M.m[1][1] - M.m[0][0];
+        float omr22 = 1.f - r22;
+        if (dif10 <= 0.f) // x^2 >= y^2
+        {
+            float fourXSqr = omr22 - dif10;
+            float inv4x = 0.5f / sqrtf(fourXSqr);
+            q.f[0] = fourXSqr*inv4x;
+            q.f[1] = (M.m[0][1] + M.m[1][0])*inv4x;
+            q.f[2] = (M.m[0][2] + M.m[2][0])*inv4x;
+            q.f[3] = (M.m[1][2] - M.m[2][1])*inv4x;
+        }
+        else // y^2 >= x^2
+        {
+            float fourYSqr = omr22 + dif10;
+            float inv4y = 0.5f / sqrtf(fourYSqr);
+            q.f[0] = (M.m[0][1] + M.m[1][0])*inv4y;
+            q.f[1] = fourYSqr*inv4y;
+            q.f[2] = (M.m[1][2] + M.m[2][1])*inv4y;
+            q.f[3] = (M.m[2][0] - M.m[0][2])*inv4y;
+        }
+    }
+    else // z^2 + w^2 >= x^2 + y^2
+    {
+        float sum10 = M.m[1][1] + M.m[0][0];
+        float opr22 = 1.f + r22;
+        if (sum10 <= 0.f) // z^2 >= w^2
+        {
+            float fourZSqr = opr22 - sum10;
+            float inv4z = 0.5f / sqrtf(fourZSqr);
+            q.f[0] = (M.m[0][2] + M.m[2][0])*inv4z;
+            q.f[1] = 
(M.m[1][2] + M.m[2][1])*inv4z; + q.f[2] = fourZSqr*inv4z; + q.f[3] = (M.m[0][1] - M.m[1][0])*inv4z; + } + else // w^2 >= z^2 + { + float fourWSqr = opr22 + sum10; + float inv4w = 0.5f / sqrtf(fourWSqr); + q.f[0] = (M.m[1][2] - M.m[2][1])*inv4w; + q.f[1] = (M.m[2][0] - M.m[0][2])*inv4w; + q.f[2] = (M.m[0][1] - M.m[1][0])*inv4w; + q.f[3] = fourWSqr*inv4w; + } + } + return q.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 XMPMMP = {+1.0f, -1.0f, -1.0f, +1.0f}; + static const XMVECTORF32 XMMPMP = {-1.0f, +1.0f, -1.0f, +1.0f}; + static const XMVECTORF32 XMMMPP = {-1.0f, -1.0f, +1.0f, +1.0f}; + static const XMVECTORU32 Select0110 = { XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0 }; + static const XMVECTORU32 Select0010 = { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 }; + + XMVECTOR r0 = M.r[0]; + XMVECTOR r1 = M.r[1]; + XMVECTOR r2 = M.r[2]; + + XMVECTOR r00 = vdupq_lane_f32(vget_low_f32(r0), 0); + XMVECTOR r11 = vdupq_lane_f32(vget_low_f32(r1), 1); + XMVECTOR r22 = vdupq_lane_f32(vget_high_f32(r2), 0); + + // x^2 >= y^2 equivalent to r11 - r00 <= 0 + XMVECTOR r11mr00 = vsubq_f32(r11, r00); + XMVECTOR x2gey2 = vcleq_f32(r11mr00, g_XMZero); + + // z^2 >= w^2 equivalent to r11 + r00 <= 0 + XMVECTOR r11pr00 = vaddq_f32(r11, r00); + XMVECTOR z2gew2 = vcleq_f32(r11pr00, g_XMZero); + + // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0 + XMVECTOR x2py2gez2pw2 = vcleq_f32(r22, g_XMZero); + + // (4*x^2, 4*y^2, 4*z^2, 4*w^2) + XMVECTOR t0 = vmulq_f32( XMPMMP, r00 ); + XMVECTOR x2y2z2w2 = vmlaq_f32( t0, XMMPMP, r11 ); + x2y2z2w2 = vmlaq_f32( x2y2z2w2, XMMMPP, r22 ); + x2y2z2w2 = vaddq_f32( x2y2z2w2, g_XMOne ); + + // (r01, r02, r12, r11) + t0 = vextq_f32(r0, r0, 1); + XMVECTOR t1 = vextq_f32(r1, r1, 1); + t0 = vcombine_f32( vget_low_f32(t0), vrev64_f32( vget_low_f32( t1 ) ) ); + + // (r10, r20, r21, r10) + t1 = vextq_f32(r2, r2, 3); + XMVECTOR r10 = vdupq_lane_f32( vget_low_f32(r1), 0 ); + t1 = vbslq_f32( Select0110, t1, r10 ); + + // (4*x*y, 4*x*z, 4*y*z, unused) + XMVECTOR xyxzyz = vaddq_f32(t0, t1); + + // (r21, r20, r10, r10) + t0 = vcombine_f32( vrev64_f32( vget_low_f32(r2) ), vget_low_f32(r10) ); + + // (r12, r02, r01, r12) + XMVECTOR t2 = vcombine_f32( vrev64_f32( vget_high_f32(r0) ), vrev64_f32( vget_low_f32(r0) ) ); + XMVECTOR t3 = vdupq_lane_f32( vget_high_f32(r1), 0 ); + t1 = vbslq_f32( Select0110, t2, t3 ); + + // (4*x*w, 4*y*w, 4*z*w, unused) + XMVECTOR xwywzw = vsubq_f32(t0, t1); + xwywzw = vmulq_f32(XMMPMP, xwywzw); + + // (4*x*x, 4*x*y, 4*x*z, 4*x*w) + t0 = vextq_f32( xyxzyz, xyxzyz, 3 ); + t1 = vbslq_f32( Select0110, t0, x2y2z2w2 ); + t2 = vdupq_lane_f32( vget_low_f32(xwywzw), 0 ); + XMVECTOR tensor0 = vbslq_f32( g_XMSelect1110, t1, t2 ); + + // (4*y*x, 4*y*y, 4*y*z, 4*y*w) + t0 = vbslq_f32( g_XMSelect1011, xyxzyz, x2y2z2w2 ); + t1 = vdupq_lane_f32( vget_low_f32(xwywzw), 1 ); + XMVECTOR tensor1 = vbslq_f32( g_XMSelect1110, t0, t1 ); + + // (4*z*x, 4*z*y, 4*z*z, 4*z*w) + t0 = vextq_f32(xyxzyz, xyxzyz, 1); + t1 = vcombine_f32( vget_low_f32(t0), vrev64_f32( vget_high_f32(xwywzw) ) ); + XMVECTOR tensor2 = vbslq_f32( Select0010, x2y2z2w2, t1 ); + + // (4*w*x, 4*w*y, 4*w*z, 4*w*w) + XMVECTOR tensor3 = vbslq_f32( g_XMSelect1110, xwywzw, x2y2z2w2 ); + + // Select the row of the tensor-product matrix that has the largest + // magnitude. + t0 = vbslq_f32( x2gey2, tensor0, tensor1 ); + t1 = vbslq_f32( z2gew2, tensor2, tensor3 ); + t2 = vbslq_f32( x2py2gez2pw2, t0, t1 ); + + // Normalize the row. 
No division by zero is possible because the + // quaternion is unit-length (and the row is a nonzero multiple of + // the quaternion). + t0 = XMVector4Length(t2); + return XMVectorDivide(t2, t0); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 XMPMMP = {+1.0f, -1.0f, -1.0f, +1.0f}; + static const XMVECTORF32 XMMPMP = {-1.0f, +1.0f, -1.0f, +1.0f}; + static const XMVECTORF32 XMMMPP = {-1.0f, -1.0f, +1.0f, +1.0f}; + + XMVECTOR r0 = M.r[0]; // (r00, r01, r02, 0) + XMVECTOR r1 = M.r[1]; // (r10, r11, r12, 0) + XMVECTOR r2 = M.r[2]; // (r20, r21, r22, 0) + + // (r00, r00, r00, r00) + XMVECTOR r00 = XM_PERMUTE_PS(r0, _MM_SHUFFLE(0,0,0,0)); + // (r11, r11, r11, r11) + XMVECTOR r11 = XM_PERMUTE_PS(r1, _MM_SHUFFLE(1,1,1,1)); + // (r22, r22, r22, r22) + XMVECTOR r22 = XM_PERMUTE_PS(r2, _MM_SHUFFLE(2,2,2,2)); + + // x^2 >= y^2 equivalent to r11 - r00 <= 0 + // (r11 - r00, r11 - r00, r11 - r00, r11 - r00) + XMVECTOR r11mr00 = _mm_sub_ps(r11, r00); + XMVECTOR x2gey2 = _mm_cmple_ps(r11mr00, g_XMZero); + + // z^2 >= w^2 equivalent to r11 + r00 <= 0 + // (r11 + r00, r11 + r00, r11 + r00, r11 + r00) + XMVECTOR r11pr00 = _mm_add_ps(r11, r00); + XMVECTOR z2gew2 = _mm_cmple_ps(r11pr00, g_XMZero); + + // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0 + XMVECTOR x2py2gez2pw2 = _mm_cmple_ps(r22, g_XMZero); + + // (+r00, -r00, -r00, +r00) + XMVECTOR t0 = _mm_mul_ps(XMPMMP, r00); + + // (-r11, +r11, -r11, +r11) + XMVECTOR t1 = _mm_mul_ps(XMMPMP, r11); + + // (-r22, -r22, +r22, +r22) + XMVECTOR t2 = _mm_mul_ps(XMMMPP, r22); + + // (4*x^2, 4*y^2, 4*z^2, 4*w^2) + XMVECTOR x2y2z2w2 = _mm_add_ps(t0, t1); + x2y2z2w2 = _mm_add_ps(t2, x2y2z2w2); + x2y2z2w2 = _mm_add_ps(x2y2z2w2, g_XMOne); + + // (r01, r02, r12, r11) + t0 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1,2,2,1)); + // (r10, r10, r20, r21) + t1 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1,0,0,0)); + // (r10, r20, r21, r10) + t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1,3,2,0)); + // (4*x*y, 4*x*z, 4*y*z, unused) + XMVECTOR xyxzyz = _mm_add_ps(t0, t1); + + // (r21, r20, r10, r10) + t0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0,0,0,1)); + // (r12, r12, r02, r01) + t1 = _mm_shuffle_ps(r1, r0, _MM_SHUFFLE(1,2,2,2)); + // (r12, r02, r01, r12) + t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1,3,2,0)); + // (4*x*w, 4*y*w, 4*z*w, unused) + XMVECTOR xwywzw = _mm_sub_ps(t0, t1); + xwywzw = _mm_mul_ps(XMMPMP, xwywzw); + + // (4*x^2, 4*y^2, 4*x*y, unused) + t0 = _mm_shuffle_ps(x2y2z2w2, xyxzyz, _MM_SHUFFLE(0,0,1,0)); + // (4*z^2, 4*w^2, 4*z*w, unused) + t1 = _mm_shuffle_ps(x2y2z2w2, xwywzw, _MM_SHUFFLE(0,2,3,2)); + // (4*x*z, 4*y*z, 4*x*w, 4*y*w) + t2 = _mm_shuffle_ps(xyxzyz, xwywzw, _MM_SHUFFLE(1,0,2,1)); + + // (4*x*x, 4*x*y, 4*x*z, 4*x*w) + XMVECTOR tensor0 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,0,2,0)); + // (4*y*x, 4*y*y, 4*y*z, 4*y*w) + XMVECTOR tensor1 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,1,1,2)); + // (4*z*x, 4*z*y, 4*z*z, 4*z*w) + XMVECTOR tensor2 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2,0,1,0)); + // (4*w*x, 4*w*y, 4*w*z, 4*w*w) + XMVECTOR tensor3 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(1,2,3,2)); + + // Select the row of the tensor-product matrix that has the largest + // magnitude. + t0 = _mm_and_ps(x2gey2, tensor0); + t1 = _mm_andnot_ps(x2gey2, tensor1); + t0 = _mm_or_ps(t0, t1); + t1 = _mm_and_ps(z2gew2, tensor2); + t2 = _mm_andnot_ps(z2gew2, tensor3); + t1 = _mm_or_ps(t1, t2); + t0 = _mm_and_ps(x2py2gez2pw2, t0); + t1 = _mm_andnot_ps(x2py2gez2pw2, t1); + t2 = _mm_or_ps(t0, t1); + + // Normalize the row. 
No division by zero is possible because the + // quaternion is unit-length (and the row is a nonzero multiple of + // the quaternion). + t0 = XMVector4Length(t2); + return _mm_div_ps(t2, t0); +#endif +} + +//------------------------------------------------------------------------------ +// Conversion operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMQuaternionToAxisAngle +( + XMVECTOR* pAxis, + float* pAngle, + FXMVECTOR Q +) +{ + assert(pAxis); + assert(pAngle); + + *pAxis = Q; + + *pAngle = 2.0f * XMScalarACos(XMVectorGetW(Q)); +} + +/**************************************************************************** + * + * Plane + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneEqual +( + FXMVECTOR P1, + FXMVECTOR P2 +) +{ + return XMVector4Equal(P1, P2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneNearEqual +( + FXMVECTOR P1, + FXMVECTOR P2, + FXMVECTOR Epsilon +) +{ + XMVECTOR NP1 = XMPlaneNormalize(P1); + XMVECTOR NP2 = XMPlaneNormalize(P2); + return XMVector4NearEqual(NP1, NP2, Epsilon); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneNotEqual +( + FXMVECTOR P1, + FXMVECTOR P2 +) +{ + return XMVector4NotEqual(P1, P2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneIsNaN +( + FXMVECTOR P +) +{ + return XMVector4IsNaN(P); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMPlaneIsInfinite +( + FXMVECTOR P +) +{ + return XMVector4IsInfinite(P); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneDot +( + FXMVECTOR P, + FXMVECTOR V +) +{ + return XMVector4Dot(P, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneDotCoord +( + FXMVECTOR P, + FXMVECTOR V +) +{ + // Result = P[0] * V[0] + P[1] * V[1] + P[2] * V[2] + P[3] + + XMVECTOR V3 = XMVectorSelect(g_XMOne.v, V, g_XMSelect1110.v); + XMVECTOR Result = XMVector4Dot(P, V3); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneDotNormal +( + FXMVECTOR P, + FXMVECTOR V +) +{ + return XMVector3Dot(P, V); +} + +//------------------------------------------------------------------------------ +// XMPlaneNormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. 
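+// The estimate scales all four components (the w distance term included) by
+// an estimated reciprocal length of the xyz normal; prefer XMPlaneNormalize
+// below when the plane equation must stay accurate.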
+ +inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst +( + FXMVECTOR P +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR Result = XMVector3ReciprocalLengthEst(P); + return XMVectorMultiply(P, Result); + +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( P, P, 0x7f ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, P); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(P,P); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot,vTemp); + // x=Dot.z + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + // Result.x = (x+y)+z + vDot = _mm_add_ss(vDot,vTemp); + // Splat x + vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); + // Get the reciprocal + vDot = _mm_rsqrt_ps(vDot); + // Get the reciprocal + vDot = _mm_mul_ps(vDot,P); + return vDot; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneNormalize +( + FXMVECTOR P +) +{ +#if defined(_XM_NO_INTRINSICS_) + float fLengthSq = sqrtf((P.vector4_f32[0]*P.vector4_f32[0])+(P.vector4_f32[1]*P.vector4_f32[1])+(P.vector4_f32[2]*P.vector4_f32[2])); + // Prevent divide by zero + if (fLengthSq) { + fLengthSq = 1.0f/fLengthSq; + } + { + XMVECTOR vResult = { + P.vector4_f32[0]*fLengthSq, + P.vector4_f32[1]*fLengthSq, + P.vector4_f32[2]*fLengthSq, + P.vector4_f32[3]*fLengthSq + }; + return vResult; + } +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vLength = XMVector3ReciprocalLength(P); + return XMVectorMultiply( P, vLength ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps( P, P, 0x7f ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(P,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vLengthSq); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z only + XMVECTOR vLengthSq = _mm_mul_ps(P,P); + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,1,2,1)); + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(P,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vLengthSq); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneIntersectLine +( + FXMVECTOR P, + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2 +) +{ + XMVECTOR V1 = XMVector3Dot(P, LinePoint1); + XMVECTOR V2 = XMVector3Dot(P, LinePoint2); + XMVECTOR D = XMVectorSubtract(V1, V2); + + XMVECTOR VT = XMPlaneDotCoord(P, LinePoint1); + VT = XMVectorDivide(VT, D); + + XMVECTOR Point = XMVectorSubtract(LinePoint2, LinePoint1); + Point = XMVectorMultiplyAdd(Point, VT, LinePoint1); + + const XMVECTOR Zero = 
XMVectorZero(); + XMVECTOR Control = XMVectorNearEqual(D, Zero, g_XMEpsilon.v); + + return XMVectorSelect(Point, g_XMQNaN.v, Control); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV XMPlaneIntersectPlane +( + XMVECTOR* pLinePoint1, + XMVECTOR* pLinePoint2, + FXMVECTOR P1, + FXMVECTOR P2 +) +{ + assert(pLinePoint1); + assert(pLinePoint2); + + XMVECTOR V1 = XMVector3Cross(P2, P1); + + XMVECTOR LengthSq = XMVector3LengthSq(V1); + + XMVECTOR V2 = XMVector3Cross(P2, V1); + + XMVECTOR P1W = XMVectorSplatW(P1); + XMVECTOR Point = XMVectorMultiply(V2, P1W); + + XMVECTOR V3 = XMVector3Cross(V1, P1); + + XMVECTOR P2W = XMVectorSplatW(P2); + Point = XMVectorMultiplyAdd(V3, P2W, Point); + + XMVECTOR LinePoint1 = XMVectorDivide(Point, LengthSq); + + XMVECTOR LinePoint2 = XMVectorAdd(LinePoint1, V1); + + XMVECTOR Control = XMVectorLessOrEqual(LengthSq, g_XMEpsilon.v); + *pLinePoint1 = XMVectorSelect(LinePoint1,g_XMQNaN.v, Control); + *pLinePoint2 = XMVectorSelect(LinePoint2,g_XMQNaN.v, Control); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneTransform +( + FXMVECTOR P, + FXMMATRIX M +) +{ + XMVECTOR W = XMVectorSplatW(P); + XMVECTOR Z = XMVectorSplatZ(P); + XMVECTOR Y = XMVectorSplatY(P); + XMVECTOR X = XMVectorSplatX(P); + + XMVECTOR Result = XMVectorMultiply(W, M.r[3]); + Result = XMVectorMultiplyAdd(Z, M.r[2], Result); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + return Result; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4* XM_CALLCONV XMPlaneTransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT4* pInputStream, + size_t InputStride, + size_t PlaneCount, + FXMMATRIX M +) +{ + return XMVector4TransformStream(pOutputStream, + OutputStride, + pInputStream, + InputStride, + PlaneCount, + M); +} + +//------------------------------------------------------------------------------ +// Conversion operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneFromPointNormal +( + FXMVECTOR Point, + FXMVECTOR Normal +) +{ + XMVECTOR W = XMVector3Dot(Point, Normal); + W = XMVectorNegate(W); + return XMVectorSelect(W, Normal, g_XMSelect1110.v); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMPlaneFromPoints +( + FXMVECTOR Point1, + FXMVECTOR Point2, + FXMVECTOR Point3 +) +{ + XMVECTOR V21 = XMVectorSubtract(Point1, Point2); + XMVECTOR V31 = XMVectorSubtract(Point1, Point3); + + XMVECTOR N = XMVector3Cross(V21, V31); + N = XMVector3Normalize(N); + + XMVECTOR D = XMPlaneDotNormal(N, Point1); + D = XMVectorNegate(D); + + XMVECTOR Result = XMVectorSelect(D, N, g_XMSelect1110.v); + + return Result; +} + +/**************************************************************************** + * + * Color + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool 
XM_CALLCONV XMColorEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4Equal(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorNotEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4NotEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorGreater +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4Greater(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorGreaterOrEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4GreaterOrEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorLess +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4Less(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorLessOrEqual +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVector4LessOrEqual(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorIsNaN +( + FXMVECTOR C +) +{ + return XMVector4IsNaN(C); +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMColorIsInfinite +( + FXMVECTOR C +) +{ + return XMVector4IsInfinite(C); +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorNegative +( + FXMVECTOR vColor +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + 1.0f - vColor.vector4_f32[0], + 1.0f - vColor.vector4_f32[1], + 1.0f - vColor.vector4_f32[2], + vColor.vector4_f32[3] + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vTemp = veorq_u32(vColor,g_XMNegate3); + return vaddq_f32(vTemp,g_XMOne3); +#elif defined(_XM_SSE_INTRINSICS_) + // Negate only x,y and z. 
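+    // g_XMNegate3 flips the sign bits of x, y, and z, producing (-x,-y,-z,w);
+    // adding g_XMOne3 = (1,1,1,0) below then yields (1-x, 1-y, 1-z, w).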
+ XMVECTOR vTemp = _mm_xor_ps(vColor,g_XMNegate3); + // Add 1,1,1,0 to -x,-y,-z,w + return _mm_add_ps(vTemp,g_XMOne3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorModulate +( + FXMVECTOR C1, + FXMVECTOR C2 +) +{ + return XMVectorMultiply(C1, C2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorAdjustSaturation +( + FXMVECTOR vColor, + float fSaturation +) +{ + // Luminance = 0.2125f * C[0] + 0.7154f * C[1] + 0.0721f * C[2]; + // Result = (C - Luminance) * Saturation + Luminance; + +#if defined(_XM_NO_INTRINSICS_) + const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f}; + + float fLuminance = (vColor.vector4_f32[0]*gvLuminance.f[0])+(vColor.vector4_f32[1]*gvLuminance.f[1])+(vColor.vector4_f32[2]*gvLuminance.f[2]); + XMVECTOR vResult; + vResult.vector4_f32[0] = ((vColor.vector4_f32[0] - fLuminance)*fSaturation)+fLuminance; + vResult.vector4_f32[1] = ((vColor.vector4_f32[1] - fLuminance)*fSaturation)+fLuminance; + vResult.vector4_f32[2] = ((vColor.vector4_f32[2] - fLuminance)*fSaturation)+fLuminance; + vResult.vector4_f32[3] = vColor.vector4_f32[3]; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f}; + XMVECTOR vLuminance = XMVector3Dot( vColor, gvLuminance ); + XMVECTOR vResult = vsubq_f32(vColor, vLuminance); + vResult = vmlaq_n_f32( vLuminance, vResult, fSaturation ); + return vbslq_f32( g_XMSelect1110, vResult, vColor ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f}; + XMVECTOR vLuminance = XMVector3Dot( vColor, gvLuminance ); +// Splat fSaturation + XMVECTOR vSaturation = _mm_set_ps1(fSaturation); +// vResult = ((vColor-vLuminance)*vSaturation)+vLuminance; + XMVECTOR vResult = _mm_sub_ps(vColor,vLuminance); + vResult = _mm_mul_ps(vResult,vSaturation); + vResult = _mm_add_ps(vResult,vLuminance); +// Retain w from the source color + vLuminance = _mm_shuffle_ps(vResult,vColor,_MM_SHUFFLE(3,2,2,2)); // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w + vResult = _mm_shuffle_ps(vResult,vLuminance,_MM_SHUFFLE(3,0,1,0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorAdjustContrast +( + FXMVECTOR vColor, + float fContrast +) +{ + // Result = (vColor - 0.5f) * fContrast + 0.5f; + +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + ((vColor.vector4_f32[0]-0.5f) * fContrast) + 0.5f, + ((vColor.vector4_f32[1]-0.5f) * fContrast) + 0.5f, + ((vColor.vector4_f32[2]-0.5f) * fContrast) + 0.5f, + vColor.vector4_f32[3] // Leave W untouched + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vResult = vsubq_f32(vColor, g_XMOneHalf.v); + vResult = vmlaq_n_f32( g_XMOneHalf.v, vResult, fContrast ); + return vbslq_f32( g_XMSelect1110, vResult, vColor ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vScale = _mm_set_ps1(fContrast); // Splat the scale + XMVECTOR vResult = _mm_sub_ps(vColor,g_XMOneHalf); // Subtract 0.5f from the source (Saving source) + vResult = _mm_mul_ps(vResult,vScale); // Mul by scale + vResult = _mm_add_ps(vResult,g_XMOneHalf); // Add 0.5f +// Retain w from the source color + vScale = _mm_shuffle_ps(vResult,vColor,_MM_SHUFFLE(3,2,2,2)); // x = vResult.z,y = vResult.z,z = 
vColor.z,w=vColor.w + vResult = _mm_shuffle_ps(vResult,vScale,_MM_SHUFFLE(3,0,1,0)); // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToHSL( FXMVECTOR rgb ) +{ + XMVECTOR r = XMVectorSplatX( rgb ); + XMVECTOR g = XMVectorSplatY( rgb ); + XMVECTOR b = XMVectorSplatZ( rgb ); + + XMVECTOR min = XMVectorMin( r, XMVectorMin( g, b ) ); + XMVECTOR max = XMVectorMax( r, XMVectorMax( g, b ) ); + + XMVECTOR l = XMVectorMultiply( XMVectorAdd( min, max ), g_XMOneHalf ); + + XMVECTOR d = XMVectorSubtract( max, min ); + + XMVECTOR la = XMVectorSelect( rgb, l, g_XMSelect1110 ); + + if ( XMVector3Less( d, g_XMEpsilon ) ) + { + // Achromatic, assume H and S of 0 + return XMVectorSelect( la, g_XMZero, g_XMSelect1100 ); + } + else + { + XMVECTOR s, h; + + XMVECTOR d2 = XMVectorAdd( min, max ); + + if ( XMVector3Greater( l, g_XMOneHalf ) ) + { + // d / (2-max-min) + s = XMVectorDivide( d, XMVectorSubtract( g_XMTwo, d2 ) ); + } + else + { + // d / (max+min) + s = XMVectorDivide( d, d2 ); + } + + if ( XMVector3Equal( r, max ) ) + { + // Red is max + h = XMVectorDivide( XMVectorSubtract( g, b ), d ); + } + else if ( XMVector3Equal( g, max ) ) + { + // Green is max + h = XMVectorDivide( XMVectorSubtract( b, r ), d ); + h = XMVectorAdd( h, g_XMTwo ); + } + else + { + // Blue is max + h = XMVectorDivide( XMVectorSubtract( r, g ), d ); + h = XMVectorAdd( h, g_XMFour ); + } + + h = XMVectorDivide( h, g_XMSix ); + + if ( XMVector3Less( h, g_XMZero ) ) + h = XMVectorAdd( h, g_XMOne ); + + XMVECTOR lha = XMVectorSelect( la, h, g_XMSelect1100 ); + return XMVectorSelect( s, lha, g_XMSelect1011 ); + } +} + +//------------------------------------------------------------------------------ + +namespace Internal +{ + +inline XMVECTOR XM_CALLCONV XMColorHue2Clr( FXMVECTOR p, FXMVECTOR q, FXMVECTOR h ) +{ + static const XMVECTORF32 oneSixth = { 1.0f/6.0f, 1.0f/6.0f, 1.0f/6.0f, 1.0f/6.0f }; + static const XMVECTORF32 twoThirds = { 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f }; + + XMVECTOR t = h; + + if ( XMVector3Less( t, g_XMZero ) ) + t = XMVectorAdd( t, g_XMOne ); + + if ( XMVector3Greater( t, g_XMOne ) ) + t = XMVectorSubtract( t, g_XMOne ); + + if ( XMVector3Less( t, oneSixth ) ) + { + // p + (q - p) * 6 * t + XMVECTOR t1 = XMVectorSubtract( q, p ); + XMVECTOR t2 = XMVectorMultiply( g_XMSix, t ); + return XMVectorMultiplyAdd( t1, t2, p ); + } + + if ( XMVector3Less( t, g_XMOneHalf ) ) + return q; + + if ( XMVector3Less( t, twoThirds ) ) + { + // p + (q - p) * 6 * (2/3 - t) + XMVECTOR t1 = XMVectorSubtract( q, p ); + XMVECTOR t2 = XMVectorMultiply( g_XMSix, XMVectorSubtract( twoThirds, t ) ); + return XMVectorMultiplyAdd( t1, t2, p ); + } + + return p; +} + +}; // namespace Internal + +inline XMVECTOR XM_CALLCONV XMColorHSLToRGB( FXMVECTOR hsl ) +{ + static const XMVECTORF32 oneThird = { 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f }; + + XMVECTOR s = XMVectorSplatY( hsl ); + XMVECTOR l = XMVectorSplatZ( hsl ); + + if ( XMVector3NearEqual( s, g_XMZero, g_XMEpsilon ) ) + { + // Achromatic + return XMVectorSelect( hsl, l, g_XMSelect1110 ); + } + else + { + XMVECTOR h = XMVectorSplatX( hsl ); + + XMVECTOR q; + if ( XMVector3Less( l, g_XMOneHalf ) ) + { + q = XMVectorMultiply( l, XMVectorAdd ( g_XMOne, s ) ); + } + else + { + q = XMVectorSubtract( XMVectorAdd( l, s ), XMVectorMultiply( l, s ) ); + } + + XMVECTOR p = XMVectorSubtract( XMVectorMultiply( g_XMTwo, l ), q ); + + 
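// q and p bracket each channel's value: XMColorHue2Clr returns p, q, or a
+        // linear ramp between them depending on the channel's hue offset, with
+        // r, g, and b sampled at h + 1/3, h, and h - 1/3 respectively.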
+        XMVECTOR r = DirectX::Internal::XMColorHue2Clr( p, q, XMVectorAdd( h, oneThird ) );
+        XMVECTOR g = DirectX::Internal::XMColorHue2Clr( p, q, h );
+        XMVECTOR b = DirectX::Internal::XMColorHue2Clr( p, q, XMVectorSubtract( h, oneThird ) );
+
+        XMVECTOR rg = XMVectorSelect( g, r, g_XMSelect1000 );
+        XMVECTOR ba = XMVectorSelect( hsl, b, g_XMSelect1110 );
+
+        return XMVectorSelect( ba, rg, g_XMSelect1100 );
+    }
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMColorRGBToHSV( FXMVECTOR rgb )
+{
+    XMVECTOR r = XMVectorSplatX( rgb );
+    XMVECTOR g = XMVectorSplatY( rgb );
+    XMVECTOR b = XMVectorSplatZ( rgb );
+
+    XMVECTOR min = XMVectorMin( r, XMVectorMin( g, b ) );
+    XMVECTOR v = XMVectorMax( r, XMVectorMax( g, b ) );
+
+    XMVECTOR d = XMVectorSubtract( v, min );
+
+    XMVECTOR s = ( XMVector3NearEqual( v, g_XMZero, g_XMEpsilon ) ) ? g_XMZero : XMVectorDivide( d, v );
+
+    if ( XMVector3Less( d, g_XMEpsilon ) )
+    {
+        // Achromatic, assume H of 0
+        XMVECTOR hv = XMVectorSelect( v, g_XMZero, g_XMSelect1000 );
+        XMVECTOR hva = XMVectorSelect( rgb, hv, g_XMSelect1110 );
+        return XMVectorSelect( s, hva, g_XMSelect1011 );
+    }
+    else
+    {
+        XMVECTOR h;
+
+        if ( XMVector3Equal( r, v ) )
+        {
+            // Red is max
+            h = XMVectorDivide( XMVectorSubtract( g, b ), d );
+
+            if ( XMVector3Less( g, b ) )
+                h = XMVectorAdd( h, g_XMSix );
+        }
+        else if ( XMVector3Equal( g, v ) )
+        {
+            // Green is max
+            h = XMVectorDivide( XMVectorSubtract( b, r ), d );
+            h = XMVectorAdd( h, g_XMTwo );
+        }
+        else
+        {
+            // Blue is max
+            h = XMVectorDivide( XMVectorSubtract( r, g ), d );
+            h = XMVectorAdd( h, g_XMFour );
+        }
+
+        h = XMVectorDivide( h, g_XMSix );
+
+        XMVECTOR hv = XMVectorSelect( v, h, g_XMSelect1000 );
+        XMVECTOR hva = XMVectorSelect( rgb, hv, g_XMSelect1110 );
+        return XMVectorSelect( s, hva, g_XMSelect1011 );
+    }
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMColorHSVToRGB( FXMVECTOR hsv )
+{
+    XMVECTOR h = XMVectorSplatX( hsv );
+    XMVECTOR s = XMVectorSplatY( hsv );
+    XMVECTOR v = XMVectorSplatZ( hsv );
+
+    XMVECTOR h6 = XMVectorMultiply( h, g_XMSix );
+
+    XMVECTOR i = XMVectorFloor( h6 );
+    XMVECTOR f = XMVectorSubtract( h6, i );
+
+    // p = v* (1-s)
+    XMVECTOR p = XMVectorMultiply( v, XMVectorSubtract( g_XMOne, s ) );
+
+    // q = v*(1-f*s)
+    XMVECTOR q = XMVectorMultiply( v, XMVectorSubtract( g_XMOne, XMVectorMultiply( f, s ) ) );
+
+    // t = v*(1 - (1-f)*s)
+    XMVECTOR t = XMVectorMultiply( v, XMVectorSubtract( g_XMOne, XMVectorMultiply( XMVectorSubtract( g_XMOne, f ), s ) ) );
+
+    int ii = static_cast<int>( XMVectorGetX( XMVectorMod( i, g_XMSix ) ) );
+
+    XMVECTOR _rgb;
+
+    switch (ii)
+    {
+    case 0: // rgb = vtp
+        {
+            XMVECTOR vt = XMVectorSelect( t, v, g_XMSelect1000 );
+            _rgb = XMVectorSelect( p, vt, g_XMSelect1100 );
+        }
+        break;
+    case 1: // rgb = qvp
+        {
+            XMVECTOR qv = XMVectorSelect( v, q, g_XMSelect1000 );
+            _rgb = XMVectorSelect( p, qv, g_XMSelect1100 );
+        }
+        break;
+    case 2: // rgb = pvt
+        {
+            XMVECTOR pv = XMVectorSelect( v, p, g_XMSelect1000 );
+            _rgb = XMVectorSelect( t, pv, g_XMSelect1100 );
+        }
+        break;
+    case 3: // rgb = pqv
+        {
+            XMVECTOR pq = XMVectorSelect( q, p, g_XMSelect1000 );
+            _rgb = XMVectorSelect( v, pq, g_XMSelect1100 );
+        }
+        break;
+    case 4: // rgb = tpv
+        {
+            XMVECTOR tp = XMVectorSelect( p, t, g_XMSelect1000 );
+            _rgb = XMVectorSelect( v, tp, g_XMSelect1100 );
+        }
+        break;
+    default: // rgb = vpq
+        {
+            XMVECTOR vp = XMVectorSelect( p, v, g_XMSelect1000 );
+            
_rgb = XMVectorSelect( q, vp, g_XMSelect1100 ); + } + break; + } + + return XMVectorSelect( hsv, _rgb, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToYUV( FXMVECTOR rgb ) +{ + static const XMVECTORF32 Scale0 = { 0.299f, -0.147f, 0.615f, 0.0f }; + static const XMVECTORF32 Scale1 = { 0.587f, -0.289f, -0.515f, 0.0f }; + static const XMVECTORF32 Scale2 = { 0.114f, 0.436f, -0.100f, 0.0f }; + + XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); + XMVECTOR clr = XMVector3Transform( rgb, M ); + + return XMVectorSelect( rgb, clr, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorYUVToRGB( FXMVECTOR yuv ) +{ + static const XMVECTORF32 Scale1 = { 0.0f, -0.395f, 2.032f, 0.0f }; + static const XMVECTORF32 Scale2 = { 1.140f, -0.581f, 0.0f, 0.0f }; + + XMMATRIX M( g_XMOne, Scale1, Scale2, g_XMZero ); + XMVECTOR clr = XMVector3Transform( yuv, M ); + + return XMVectorSelect( yuv, clr, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD( FXMVECTOR rgb ) +{ + static const XMVECTORF32 Scale0 = { 0.2126f, -0.0997f, 0.6150f, 0.0f }; + static const XMVECTORF32 Scale1 = { 0.7152f, -0.3354f, -0.5586f, 0.0f }; + static const XMVECTORF32 Scale2 = { 0.0722f, 0.4351f, -0.0564f, 0.0f }; + + XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); + XMVECTOR clr = XMVector3Transform( rgb, M ); + + return XMVectorSelect( rgb, clr, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorYUVToRGB_HD( FXMVECTOR yuv ) +{ + static const XMVECTORF32 Scale1 = { 0.0f, -0.2153f, 2.1324f, 0.0f }; + static const XMVECTORF32 Scale2 = { 1.2803f, -0.3806f, 0.0f, 0.0f }; + + XMMATRIX M( g_XMOne, Scale1, Scale2, g_XMZero ); + XMVECTOR clr = XMVector3Transform( yuv, M ); + + return XMVectorSelect( yuv, clr, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToXYZ( FXMVECTOR rgb ) +{ + static const XMVECTORF32 Scale0 = { 0.4887180f, 0.1762044f, 0.0000000f, 0.0f }; + static const XMVECTORF32 Scale1 = { 0.3106803f, 0.8129847f, 0.0102048f, 0.0f }; + static const XMVECTORF32 Scale2 = { 0.2006017f, 0.0108109f, 0.9897952f, 0.0f }; + static const XMVECTORF32 Scale = { 1.f/0.17697f, 1.f/0.17697f, 1.f/0.17697f, 0.0f }; + + XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); + XMVECTOR clr = XMVectorMultiply( XMVector3Transform( rgb, M ), Scale ); + + return XMVectorSelect( rgb, clr, g_XMSelect1110 ); +} + +inline XMVECTOR XM_CALLCONV XMColorXYZToRGB( FXMVECTOR xyz ) +{ + static const XMVECTORF32 Scale0 = { 2.3706743f, -0.5138850f, 0.0052982f, 0.0f }; + static const XMVECTORF32 Scale1 = { -0.9000405f, 1.4253036f, -0.0146949f, 0.0f }; + static const XMVECTORF32 Scale2 = { -0.4706338f, 0.0885814f, 1.0093968f, 0.0f }; + static const XMVECTORF32 Scale = { 0.17697f, 0.17697f, 0.17697f, 0.0f }; + + XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); + XMVECTOR clr = XMVector3Transform( XMVectorMultiply( xyz, Scale ), M ); + + return XMVectorSelect( xyz, clr, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorXYZToSRGB( FXMVECTOR xyz ) +{ + static const XMVECTORF32 Scale0 = { 3.2406f, -0.9689f, 0.0557f, 0.0f }; + 
static const XMVECTORF32 Scale1 = { -1.5372f, 1.8758f, -0.2040f, 0.0f }; + static const XMVECTORF32 Scale2 = { -0.4986f, 0.0415f, 1.0570f, 0.0f }; + static const XMVECTORF32 Cutoff = { 0.0031308f, 0.0031308f, 0.0031308f, 0.0f }; + static const XMVECTORF32 Exp = { 1.0f/2.4f, 1.0f/2.4f, 1.0f/2.4f, 1.0f }; + + XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); + XMVECTOR lclr = XMVector3Transform( xyz, M ); + + XMVECTOR sel = XMVectorGreater( lclr, Cutoff ); + + // clr = 12.92 * lclr for lclr <= 0.0031308f + XMVECTOR smallC = XMVectorMultiply( lclr, g_XMsrgbScale ); + + // clr = (1+a)*pow(lclr, 1/2.4) - a for lclr > 0.0031308 (where a = 0.055) + XMVECTOR largeC = XMVectorSubtract( XMVectorMultiply( g_XMsrgbA1, XMVectorPow( lclr, Exp ) ), g_XMsrgbA ); + + XMVECTOR clr = XMVectorSelect( smallC, largeC, sel ); + + return XMVectorSelect( xyz, clr, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorSRGBToXYZ( FXMVECTOR srgb ) +{ + static const XMVECTORF32 Scale0 = { 0.4124f, 0.2126f, 0.0193f, 0.0f }; + static const XMVECTORF32 Scale1 = { 0.3576f, 0.7152f, 0.1192f, 0.0f }; + static const XMVECTORF32 Scale2 = { 0.1805f, 0.0722f, 0.9505f, 0.0f }; + static const XMVECTORF32 Cutoff = { 0.04045f, 0.04045f, 0.04045f, 0.0f }; + static const XMVECTORF32 Exp = { 2.4f, 2.4f, 2.4f, 1.0f }; + + XMVECTOR sel = XMVectorGreater( srgb, Cutoff ); + + // lclr = clr / 12.92 + XMVECTOR smallC = XMVectorDivide( srgb, g_XMsrgbScale ); + + // lclr = pow( (clr + a) / (1+a), 2.4 ) + XMVECTOR largeC = XMVectorPow( XMVectorDivide( XMVectorAdd( srgb, g_XMsrgbA ), g_XMsrgbA1 ), Exp ); + + XMVECTOR lclr = XMVectorSelect( smallC, largeC, sel ); + + XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero ); + XMVECTOR clr = XMVector3Transform( lclr, M ); + + return XMVectorSelect( srgb, clr, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorRGBToSRGB( FXMVECTOR rgb ) +{ + static const XMVECTORF32 Cutoff = { 0.0031308f, 0.0031308f, 0.0031308f, 1.f }; + static const XMVECTORF32 Linear = { 12.92f, 12.92f, 12.92f, 1.f }; + static const XMVECTORF32 Scale = { 1.055f, 1.055f, 1.055f, 1.f }; + static const XMVECTORF32 Bias = { 0.055f, 0.055f, 0.055f, 0.f }; + static const XMVECTORF32 InvGamma = { 1.0f/2.4f, 1.0f/2.4f, 1.0f/2.4f, 1.f }; + + XMVECTOR V = XMVectorSaturate(rgb); + XMVECTOR V0 = XMVectorMultiply( V, Linear ); + XMVECTOR V1 = Scale * XMVectorPow( V, InvGamma ) - Bias; + XMVECTOR select = XMVectorLess( V, Cutoff ); + V = XMVectorSelect( V1, V0, select ); + return XMVectorSelect( rgb, V, g_XMSelect1110 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMColorSRGBToRGB( FXMVECTOR srgb ) +{ + static const XMVECTORF32 Cutoff = { 0.04045f, 0.04045f, 0.04045f, 1.f }; + static const XMVECTORF32 ILinear = { 1.f/12.92f, 1.f/12.92f, 1.f/12.92f, 1.f }; + static const XMVECTORF32 Scale = { 1.f/1.055f, 1.f/1.055f, 1.f/1.055f, 1.f }; + static const XMVECTORF32 Bias = { 0.055f, 0.055f, 0.055f, 0.f }; + static const XMVECTORF32 Gamma = { 2.4f, 2.4f, 2.4f, 1.f }; + + XMVECTOR V = XMVectorSaturate(srgb); + XMVECTOR V0 = XMVectorMultiply( V, ILinear ); + XMVECTOR V1 = XMVectorPow( (V + Bias) * Scale, Gamma ); + XMVECTOR select = XMVectorGreater( V, Cutoff ); + V = XMVectorSelect( V0, V1, select ); + return XMVectorSelect( srgb, V, g_XMSelect1110 ); +} + 
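+//------------------------------------------------------------------------------
+// Editor's note: a minimal usage sketch, not part of the library. It assumes
+// only the conversions defined above; the function name and the 1e-5f
+// tolerance are illustrative choices. The two gamma mappings should invert
+// each other to within float precision.
+inline bool SRGBRoundTripExample()
+{
+    // Arbitrary linear-space color; w carries alpha and passes through untouched.
+    XMVECTOR lin = XMVectorSet( 0.5f, 0.25f, 0.75f, 1.0f );
+
+    XMVECTOR enc = XMColorRGBToSRGB( lin );    // linear -> gamma-encoded
+    XMVECTOR dec = XMColorSRGBToRGB( enc );    // gamma-encoded -> linear
+
+    return XMVector3NearEqual( dec, lin, XMVectorReplicate( 1e-5f ) );
+}
+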
+/**************************************************************************** + * + * Miscellaneous + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline bool XMVerifyCPUSupport() +{ +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + int CPUInfo[4] = { -1 }; + __cpuid(CPUInfo, 0); + +#ifdef __AVX2__ + if (CPUInfo[0] < 7) + return false; +#else + if (CPUInfo[0] < 1) + return false; +#endif + + __cpuid(CPUInfo, 1); + +#ifdef __AVX2__ + // The compiler can emit FMA3 instructions even without explicit intrinsics use + if ((CPUInfo[2] & 0x38081001) != 0x38081001) + return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support +#elif defined(_XM_F16C_INTRINSICS_) + if ((CPUInfo[2] & 0x38080001) != 0x38080001) + return false; // No F16C/AVX/OSXSAVE/SSE4.1/SSE3 support +#elif defined(__AVX__) || defined(_XM_AVX_INTRINSICS_) + if ((CPUInfo[2] & 0x18080001) != 0x18080001) + return false; // No AVX/OSXSAVE/SSE4.1/SSE3 support +#elif defined(_XM_SSE4_INTRINSICS_) + if ((CPUInfo[2] & 0x80001) != 0x80001) + return false; // No SSE3/SSE4.1 support +#elif defined(_XM_SSE3_INTRINSICS_) + if (!(CPUInfo[2] & 0x1)) + return false; // No SSE3 support +#endif + + // The x64 processor model requires SSE2 support, but no harm in checking + if ((CPUInfo[3] & 0x6000000) != 0x6000000) + return false; // No SSE2/SSE support + +#ifdef __AVX2__ + __cpuidex(CPUInfo, 7, 0); + if (!(CPUInfo[1] & 0x20)) + return false; // No AVX2 support +#endif + + return true; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + // ARM-NEON support is required for the Windows on ARM platform + return true; +#else + // No intrinsics path always supported + return true; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMFresnelTerm +( + FXMVECTOR CosIncidentAngle, + FXMVECTOR RefractionIndex +) +{ + assert(!XMVector4IsInfinite(CosIncidentAngle)); + + // Result = 0.5f * (g - c)^2 / (g + c)^2 * ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) where + // c = CosIncidentAngle + // g = sqrt(c^2 + RefractionIndex^2 - 1) + +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR G = XMVectorMultiplyAdd(RefractionIndex, RefractionIndex, g_XMNegativeOne.v); + G = XMVectorMultiplyAdd(CosIncidentAngle, CosIncidentAngle, G); + G = XMVectorAbs(G); + G = XMVectorSqrt(G); + + XMVECTOR S = XMVectorAdd(G, CosIncidentAngle); + XMVECTOR D = XMVectorSubtract(G, CosIncidentAngle); + + XMVECTOR V0 = XMVectorMultiply(D, D); + XMVECTOR V1 = XMVectorMultiply(S, S); + V1 = XMVectorReciprocal(V1); + V0 = XMVectorMultiply(g_XMOneHalf.v, V0); + V0 = XMVectorMultiply(V0, V1); + + XMVECTOR V2 = XMVectorMultiplyAdd(CosIncidentAngle, S, g_XMNegativeOne.v); + XMVECTOR V3 = XMVectorMultiplyAdd(CosIncidentAngle, D, g_XMOne.v); + V2 = XMVectorMultiply(V2, V2); + V3 = XMVectorMultiply(V3, V3); + V3 = XMVectorReciprocal(V3); + V2 = XMVectorMultiplyAdd(V2, V3, g_XMOne.v); + + XMVECTOR Result = XMVectorMultiply(V0, V2); + + Result = XMVectorSaturate(Result); + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + // G = sqrt(abs((RefractionIndex^2-1) + CosIncidentAngle^2)) + XMVECTOR G = _mm_mul_ps(RefractionIndex,RefractionIndex); + XMVECTOR vTemp = _mm_mul_ps(CosIncidentAngle,CosIncidentAngle); + G = _mm_sub_ps(G,g_XMOne); + vTemp = _mm_add_ps(vTemp,G); + // max((0-vTemp),vTemp) == abs(vTemp) + // The abs is 
needed to deal with refraction and cosine being zero + G = _mm_setzero_ps(); + G = _mm_sub_ps(G,vTemp); + G = _mm_max_ps(G,vTemp); + // Last operation, the sqrt() + G = _mm_sqrt_ps(G); + + // Calc G-C and G+C + XMVECTOR GAddC = _mm_add_ps(G,CosIncidentAngle); + XMVECTOR GSubC = _mm_sub_ps(G,CosIncidentAngle); + // Perform the term (0.5f *(g - c)^2) / (g + c)^2 + XMVECTOR vResult = _mm_mul_ps(GSubC,GSubC); + vTemp = _mm_mul_ps(GAddC,GAddC); + vResult = _mm_mul_ps(vResult,g_XMOneHalf); + vResult = _mm_div_ps(vResult,vTemp); + // Perform the term ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) + GAddC = _mm_mul_ps(GAddC,CosIncidentAngle); + GSubC = _mm_mul_ps(GSubC,CosIncidentAngle); + GAddC = _mm_sub_ps(GAddC,g_XMOne); + GSubC = _mm_add_ps(GSubC,g_XMOne); + GAddC = _mm_mul_ps(GAddC,GAddC); + GSubC = _mm_mul_ps(GSubC,GSubC); + GAddC = _mm_div_ps(GAddC,GSubC); + GAddC = _mm_add_ps(GAddC,g_XMOne); + // Multiply the two term parts + vResult = _mm_mul_ps(vResult,GAddC); + // Clamp to 0.0 - 1.0f + vResult = _mm_max_ps(vResult,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XMScalarNearEqual +( + float S1, + float S2, + float Epsilon +) +{ + float Delta = S1 - S2; + return (fabsf(Delta) <= Epsilon); +} + +//------------------------------------------------------------------------------ +// Modulo the range of the given angle such that -XM_PI <= Angle < XM_PI +inline float XMScalarModAngle +( + float Angle +) +{ + // Note: The modulo is performed with unsigned math only to work + // around a precision error on numbers that are close to PI + + // Normalize the range from 0.0f to XM_2PI + Angle = Angle + XM_PI; + // Perform the modulo, unsigned + float fTemp = fabsf(Angle); + fTemp = fTemp - (XM_2PI * (float)((int32_t)(fTemp/XM_2PI))); + // Restore the number to the range of -XM_PI to XM_PI-epsilon + fTemp = fTemp - XM_PI; + // If the modulo'd value was negative, restore negation + if (Angle<0.0f) { + fTemp = -fTemp; + } + return fTemp; +} + +//------------------------------------------------------------------------------ + +inline float XMScalarSin +( + float Value +) +{ + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. + float quotient = XM_1DIV2PI*Value; + if (Value >= 0.0f) + { + quotient = (float)((int)(quotient + 0.5f)); + } + else + { + quotient = (float)((int)(quotient - 0.5f)); + } + float y = Value - XM_2PI*quotient; + + // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). + if (y > XM_PIDIV2) + { + y = XM_PI - y; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + } + + // 11-degree minimax approximation + float y2 = y * y; + return ( ( ( ( (-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f ) * y2 + 0.0083333310f ) * y2 - 0.16666667f ) * y2 + 1.0f ) * y; +} + +//------------------------------------------------------------------------------ + +inline float XMScalarSinEst +( + float Value +) +{ + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. + float quotient = XM_1DIV2PI*Value; + if (Value >= 0.0f) + { + quotient = (float)((int)(quotient + 0.5f)); + } + else + { + quotient = (float)((int)(quotient - 0.5f)); + } + float y = Value - XM_2PI*quotient; + + // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). 
+ if (y > XM_PIDIV2) + { + y = XM_PI - y; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + } + + // 7-degree minimax approximation + float y2 = y * y; + return ( ( ( -0.00018524670f * y2 + 0.0083139502f ) * y2 - 0.16665852f ) * y2 + 1.0f ) * y; +} + +//------------------------------------------------------------------------------ + +inline float XMScalarCos +( + float Value +) +{ + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. + float quotient = XM_1DIV2PI*Value; + if (Value >= 0.0f) + { + quotient = (float)((int)(quotient + 0.5f)); + } + else + { + quotient = (float)((int)(quotient - 0.5f)); + } + float y = Value - XM_2PI*quotient; + + // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x). + float sign; + if (y > XM_PIDIV2) + { + y = XM_PI - y; + sign = -1.0f; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + sign = -1.0f; + } + else + { + sign = +1.0f; + } + + // 10-degree minimax approximation + float y2 = y*y; + float p = ( ( ( ( -2.6051615e-07f * y2 + 2.4760495e-05f ) * y2 - 0.0013888378f ) * y2 + 0.041666638f ) * y2 - 0.5f ) * y2 + 1.0f; + return sign*p; +} + +//------------------------------------------------------------------------------ + +inline float XMScalarCosEst +( + float Value +) +{ + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. + float quotient = XM_1DIV2PI*Value; + if (Value >= 0.0f) + { + quotient = (float)((int)(quotient + 0.5f)); + } + else + { + quotient = (float)((int)(quotient - 0.5f)); + } + float y = Value - XM_2PI*quotient; + + // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x). + float sign; + if (y > XM_PIDIV2) + { + y = XM_PI - y; + sign = -1.0f; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + sign = -1.0f; + } + else + { + sign = +1.0f; + } + + // 6-degree minimax approximation + float y2 = y * y; + float p = ( ( -0.0012712436f * y2 + 0.041493919f ) * y2 - 0.49992746f ) * y2 + 1.0f; + return sign*p; +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XMScalarSinCos +( + float* pSin, + float* pCos, + float Value +) +{ + assert(pSin); + assert(pCos); + + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. + float quotient = XM_1DIV2PI*Value; + if (Value >= 0.0f) + { + quotient = (float)((int)(quotient + 0.5f)); + } + else + { + quotient = (float)((int)(quotient - 0.5f)); + } + float y = Value - XM_2PI*quotient; + + // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). + float sign; + if (y > XM_PIDIV2) + { + y = XM_PI - y; + sign = -1.0f; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + sign = -1.0f; + } + else + { + sign = +1.0f; + } + + float y2 = y * y; + + // 11-degree minimax approximation + *pSin = ( ( ( ( (-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f ) * y2 + 0.0083333310f ) * y2 - 0.16666667f ) * y2 + 1.0f ) * y; + + // 10-degree minimax approximation + float p = ( ( ( ( -2.6051615e-07f * y2 + 2.4760495e-05f ) * y2 - 0.0013888378f ) * y2 + 0.041666638f ) * y2 - 0.5f ) * y2 + 1.0f; + *pCos = sign*p; +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XMScalarSinCosEst +( + float* pSin, + float* pCos, + float Value +) +{ + assert(pSin); + assert(pCos); + + // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder. 
+ float quotient = XM_1DIV2PI*Value; + if (Value >= 0.0f) + { + quotient = (float)((int)(quotient + 0.5f)); + } + else + { + quotient = (float)((int)(quotient - 0.5f)); + } + float y = Value - XM_2PI*quotient; + + // Map y to [-pi/2,pi/2] with sin(y) = sin(Value). + float sign; + if (y > XM_PIDIV2) + { + y = XM_PI - y; + sign = -1.0f; + } + else if (y < -XM_PIDIV2) + { + y = -XM_PI - y; + sign = -1.0f; + } + else + { + sign = +1.0f; + } + + float y2 = y * y; + + // 7-degree minimax approximation + *pSin = ( ( ( -0.00018524670f * y2 + 0.0083139502f ) * y2 - 0.16665852f ) * y2 + 1.0f ) * y; + + // 6-degree minimax approximation + float p = ( ( -0.0012712436f * y2 + 0.041493919f ) * y2 - 0.49992746f ) * y2 + 1.0f; + *pCos = sign*p; +} + +//------------------------------------------------------------------------------ + +inline float XMScalarASin +( + float Value +) +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 7-degree minimax approximation + float result = ( ( ( ( ( ( -0.0012624911f * x + 0.0066700901f ) * x - 0.0170881256f ) * x + 0.0308918810f ) * x - 0.0501743046f ) * x + 0.0889789874f ) * x - 0.2145988016f ) * x + 1.5707963050f; + result *= root; // acos(|x|) + + // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x) + return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarASinEst +( + float Value +) +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 3-degree minimax approximation + float result = ((-0.0187293f*x+0.0742610f)*x-0.2121144f)*x+1.5707288f; + result *= root; // acos(|x|) + + // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x) + return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarACos +( + float Value +) +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 7-degree minimax approximation + float result = ( ( ( ( ( ( -0.0012624911f * x + 0.0066700901f ) * x - 0.0170881256f ) * x + 0.0308918810f ) * x - 0.0501743046f ) * x + 0.0889789874f ) * x - 0.2145988016f ) * x + 1.5707963050f; + result *= root; + + // acos(x) = pi - acos(-x) when x < 0 + return (nonnegative ? result : XM_PI - result); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarACosEst +( + float Value +) +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 3-degree minimax approximation + float result = ( ( -0.0187293f * x + 0.0742610f ) * x - 0.2121144f ) * x + 1.5707288f; + result *= root; + + // acos(x) = pi - acos(-x) when x < 0 + return (nonnegative ? 
result : XM_PI - result); +} + diff --git a/Inc/DirectXMathVector.inl b/Inc/DirectXMathVector.inl index 53a7c4e..d417c7c 100644 --- a/Inc/DirectXMathVector.inl +++ b/Inc/DirectXMathVector.inl @@ -1,14453 +1,14453 @@ -//------------------------------------------------------------------------------------- -// DirectXMathVector.inl -- SIMD C++ Math library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#pragma once - -#if defined(_XM_NO_INTRINSICS_) -#define XMISNAN(x) ((*(uint32_t*)&(x) & 0x7F800000) == 0x7F800000 && (*(uint32_t*)&(x) & 0x7FFFFF) != 0) -#define XMISINF(x) ((*(uint32_t*)&(x) & 0x7FFFFFFF) == 0x7F800000) -#endif - -#if defined(_XM_SSE_INTRINSICS_) - -#define XM3UNPACK3INTO4(l1,l2,l3) \ - XMVECTOR V3 = _mm_shuffle_ps(l2,l3,_MM_SHUFFLE(0,0,3,2));\ - XMVECTOR V2 = _mm_shuffle_ps(l2,l1,_MM_SHUFFLE(3,3,1,0));\ - V2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,1,0,2));\ - XMVECTOR V4 = _mm_castsi128_ps( _mm_srli_si128(_mm_castps_si128(L3),32/8) ); - -#define XM3PACK4INTO3(v2x) \ - v2x = _mm_shuffle_ps(V2,V3,_MM_SHUFFLE(1,0,2,1));\ - V2 = _mm_shuffle_ps(V2,V1,_MM_SHUFFLE(2,2,0,0));\ - V1 = _mm_shuffle_ps(V1,V2,_MM_SHUFFLE(0,2,1,0));\ - V3 = _mm_shuffle_ps(V3,V4,_MM_SHUFFLE(0,0,2,2));\ - V3 = _mm_shuffle_ps(V3,V4,_MM_SHUFFLE(2,1,2,0));\ - -#endif - -/**************************************************************************** - * - * General Vector - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ -// Assignment operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ -// Return a vector with all elements equaling zero -inline XMVECTOR XM_CALLCONV XMVectorZero() -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult = {0.0f,0.0f,0.0f,0.0f}; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_n_f32(0); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_setzero_ps(); -#endif -} - -//------------------------------------------------------------------------------ -// Initialize a vector with four floating point values -inline XMVECTOR XM_CALLCONV XMVectorSet -( - float x, - float y, - float z, - float w -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = {x,y,z,w}; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t V0 = vcreate_f32(((uint64_t)*(const uint32_t *)&x) | ((uint64_t)(*(const uint32_t *)&y) << 32)); - float32x2_t V1 = vcreate_f32(((uint64_t)*(const uint32_t *)&z) | ((uint64_t)(*(const uint32_t *)&w) << 32)); - return vcombine_f32(V0, V1); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_set_ps( w, z, y, x ); -#endif -} - -//------------------------------------------------------------------------------ -// Initialize a vector with four integer values -inline XMVECTOR XM_CALLCONV XMVectorSetInt -( - uint32_t x, - uint32_t y, - uint32_t z, - uint32_t w -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTORU32 vResult = {x,y,z,w}; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t V0 = 
vcreate_u32(((uint64_t)x) | ((uint64_t)y << 32)); - uint32x2_t V1 = vcreate_u32(((uint64_t)z) | ((uint64_t)w << 32)); - return vcombine_u32(V0, V1); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_set_epi32( w, z, y, x ); - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ -// Initialize a vector with a replicated floating point value -inline XMVECTOR XM_CALLCONV XMVectorReplicate -( - float Value -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_f32[0] = - vResult.vector4_f32[1] = - vResult.vector4_f32[2] = - vResult.vector4_f32[3] = Value; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_n_f32( Value ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_set_ps1( Value ); -#endif -} - -//------------------------------------------------------------------------------ -// Initialize a vector with a replicated floating point value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr -( - const float *pValue -) -{ -#if defined(_XM_NO_INTRINSICS_) - float Value = pValue[0]; - XMVECTOR vResult; - vResult.vector4_f32[0] = - vResult.vector4_f32[1] = - vResult.vector4_f32[2] = - vResult.vector4_f32[3] = Value; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_dup_f32( pValue ); -#elif defined(_XM_AVX_INTRINSICS_) - return _mm_broadcast_ss( pValue ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_load_ps1( pValue ); -#endif -} - -//------------------------------------------------------------------------------ -// Initialize a vector with a replicated integer value -inline XMVECTOR XM_CALLCONV XMVectorReplicateInt -( - uint32_t Value -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTORU32 vResult; - vResult.u[0] = - vResult.u[1] = - vResult.u[2] = - vResult.u[3] = Value; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_n_u32( Value ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i vTemp = _mm_set1_epi32( Value ); - return _mm_castsi128_ps(vTemp); -#endif -} - -//------------------------------------------------------------------------------ -// Initialize a vector with a replicated integer value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorReplicateIntPtr -( - const uint32_t *pValue -) -{ -#if defined(_XM_NO_INTRINSICS_) - uint32_t Value = pValue[0]; - XMVECTORU32 vResult; - vResult.u[0] = - vResult.u[1] = - vResult.u[2] = - vResult.u[3] = Value; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_dup_u32(pValue); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_load_ps1(reinterpret_cast(pValue)); -#endif -} - -//------------------------------------------------------------------------------ -// Initialize a vector with all bits set (true mask) -inline XMVECTOR XM_CALLCONV XMVectorTrueInt() -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTORU32 vResult = {0xFFFFFFFFU,0xFFFFFFFFU,0xFFFFFFFFU,0xFFFFFFFFU}; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_n_s32(-1); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_set1_epi32(-1); - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ -// Initialize a vector with all bits clear (false mask) -inline XMVECTOR XM_CALLCONV XMVectorFalseInt() -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult = {0.0f,0.0f,0.0f,0.0f}; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return 
vdupq_n_u32(0); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_setzero_ps(); -#endif -} - -//------------------------------------------------------------------------------ -// Replicate the x component of the vector -inline XMVECTOR XM_CALLCONV XMVectorSplatX -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_f32[0] = - vResult.vector4_f32[1] = - vResult.vector4_f32[2] = - vResult.vector4_f32[3] = V.vector4_f32[0]; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_lane_f32( vget_low_f32( V ), 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - return XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); -#endif -} - -//------------------------------------------------------------------------------ -// Replicate the y component of the vector -inline XMVECTOR XM_CALLCONV XMVectorSplatY -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_f32[0] = - vResult.vector4_f32[1] = - vResult.vector4_f32[2] = - vResult.vector4_f32[3] = V.vector4_f32[1]; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_lane_f32( vget_low_f32( V ), 1 ); -#elif defined(_XM_SSE_INTRINSICS_) - return XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); -#endif -} - -//------------------------------------------------------------------------------ -// Replicate the z component of the vector -inline XMVECTOR XM_CALLCONV XMVectorSplatZ -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_f32[0] = - vResult.vector4_f32[1] = - vResult.vector4_f32[2] = - vResult.vector4_f32[3] = V.vector4_f32[2]; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_lane_f32( vget_high_f32( V ), 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - return XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); -#endif -} - -//------------------------------------------------------------------------------ -// Replicate the w component of the vector -inline XMVECTOR XM_CALLCONV XMVectorSplatW -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_f32[0] = - vResult.vector4_f32[1] = - vResult.vector4_f32[2] = - vResult.vector4_f32[3] = V.vector4_f32[3]; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_lane_f32( vget_high_f32( V ), 1 ); -#elif defined(_XM_SSE_INTRINSICS_) - return XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); -#endif -} - -//------------------------------------------------------------------------------ -// Return a vector of 1.0f,1.0f,1.0f,1.0f -inline XMVECTOR XM_CALLCONV XMVectorSplatOne() -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_f32[0] = - vResult.vector4_f32[1] = - vResult.vector4_f32[2] = - vResult.vector4_f32[3] = 1.0f; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_n_f32(1.0f); -#elif defined(_XM_SSE_INTRINSICS_) - return g_XMOne; -#endif -} - -//------------------------------------------------------------------------------ -// Return a vector of INF,INF,INF,INF -inline XMVECTOR XM_CALLCONV XMVectorSplatInfinity() -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_u32[0] = - vResult.vector4_u32[1] = - vResult.vector4_u32[2] = - vResult.vector4_u32[3] = 0x7F800000; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_n_u32(0x7F800000); -#elif defined(_XM_SSE_INTRINSICS_) - return g_XMInfinity; -#endif -} - -//------------------------------------------------------------------------------ -// Return a vector of 
Q_NAN,Q_NAN,Q_NAN,Q_NAN -inline XMVECTOR XM_CALLCONV XMVectorSplatQNaN() -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_u32[0] = - vResult.vector4_u32[1] = - vResult.vector4_u32[2] = - vResult.vector4_u32[3] = 0x7FC00000; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_n_u32(0x7FC00000); -#elif defined(_XM_SSE_INTRINSICS_) - return g_XMQNaN; -#endif -} - -//------------------------------------------------------------------------------ -// Return a vector of 1.192092896e-7f,1.192092896e-7f,1.192092896e-7f,1.192092896e-7f -inline XMVECTOR XM_CALLCONV XMVectorSplatEpsilon() -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_u32[0] = - vResult.vector4_u32[1] = - vResult.vector4_u32[2] = - vResult.vector4_u32[3] = 0x34000000; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_n_u32(0x34000000); -#elif defined(_XM_SSE_INTRINSICS_) - return g_XMEpsilon; -#endif -} - -//------------------------------------------------------------------------------ -// Return a vector of -0.0f (0x80000000),-0.0f,-0.0f,-0.0f -inline XMVECTOR XM_CALLCONV XMVectorSplatSignMask() -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_u32[0] = - vResult.vector4_u32[1] = - vResult.vector4_u32[2] = - vResult.vector4_u32[3] = 0x80000000U; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vdupq_n_u32(0x80000000U); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_set1_epi32( 0x80000000 ); - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ -// Return a floating point value via an index. This is not a recommended -// function to use due to performance loss. -inline float XM_CALLCONV XMVectorGetByIndex(FXMVECTOR V, size_t i) -{ - assert( i < 4 ); - _Analysis_assume_( i < 4 ); -#if defined(_XM_NO_INTRINSICS_) - return V.vector4_f32[i]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return V.n128_f32[i]; -#elif defined(_XM_SSE_INTRINSICS_) - return V.m128_f32[i]; -#endif -} - -//------------------------------------------------------------------------------ -// Return the X component in an FPU register. -inline float XM_CALLCONV XMVectorGetX(FXMVECTOR V) -{ -#if defined(_XM_NO_INTRINSICS_) - return V.vector4_f32[0]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vgetq_lane_f32(V, 0); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_cvtss_f32(V); -#endif -} - -// Return the Y component in an FPU register. -inline float XM_CALLCONV XMVectorGetY(FXMVECTOR V) -{ -#if defined(_XM_NO_INTRINSICS_) - return V.vector4_f32[1]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vgetq_lane_f32(V, 1); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); - return _mm_cvtss_f32(vTemp); -#endif -} - -// Return the Z component in an FPU register. -inline float XM_CALLCONV XMVectorGetZ(FXMVECTOR V) -{ -#if defined(_XM_NO_INTRINSICS_) - return V.vector4_f32[2]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vgetq_lane_f32(V, 2); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); - return _mm_cvtss_f32(vTemp); -#endif -} - -// Return the W component in an FPU register. 
-inline float XM_CALLCONV XMVectorGetW(FXMVECTOR V) -{ -#if defined(_XM_NO_INTRINSICS_) - return V.vector4_f32[3]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vgetq_lane_f32(V, 3); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); - return _mm_cvtss_f32(vTemp); -#endif -} - -//------------------------------------------------------------------------------ - -// Store a component indexed by i into a 32 bit float location in memory. -_Use_decl_annotations_ -inline void XM_CALLCONV XMVectorGetByIndexPtr(float *f, FXMVECTOR V, size_t i) -{ - assert( f != nullptr ); - assert( i < 4 ); - _Analysis_assume_( i < 4 ); -#if defined(_XM_NO_INTRINSICS_) - *f = V.vector4_f32[i]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - *f = V.n128_f32[i]; -#elif defined(_XM_SSE_INTRINSICS_) - *f = V.m128_f32[i]; -#endif -} - -//------------------------------------------------------------------------------ - -// Store the X component into a 32 bit float location in memory. -_Use_decl_annotations_ -inline void XM_CALLCONV XMVectorGetXPtr(float *x, FXMVECTOR V) -{ - assert( x != nullptr); -#if defined(_XM_NO_INTRINSICS_) - *x = V.vector4_f32[0]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - vst1q_lane_f32(x,V,0); -#elif defined(_XM_SSE_INTRINSICS_) - _mm_store_ss(x,V); -#endif -} - -// Store the Y component into a 32 bit float location in memory. -_Use_decl_annotations_ -inline void XM_CALLCONV XMVectorGetYPtr(float *y, FXMVECTOR V) -{ - assert( y != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - *y = V.vector4_f32[1]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - vst1q_lane_f32(y,V,1); -#elif defined(_XM_SSE4_INTRINSICS_) - *((int*)y) = _mm_extract_ps( V, 1 ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); - _mm_store_ss(y,vResult); -#endif -} - -// Store the Z component into a 32 bit float location in memory. -_Use_decl_annotations_ -inline void XM_CALLCONV XMVectorGetZPtr(float *z, FXMVECTOR V) -{ - assert( z != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - *z = V.vector4_f32[2]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - vst1q_lane_f32(z,V,2); -#elif defined(_XM_SSE4_INTRINSICS_) - *((int*)z) = _mm_extract_ps( V, 2 ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); - _mm_store_ss(z,vResult); -#endif -} - -// Store the W component into a 32 bit float location in memory. -_Use_decl_annotations_ -inline void XM_CALLCONV XMVectorGetWPtr(float *w, FXMVECTOR V) -{ - assert( w != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - *w = V.vector4_f32[3]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - vst1q_lane_f32(w,V,3); -#elif defined(_XM_SSE4_INTRINSICS_) - *((int*)w) = _mm_extract_ps( V, 3 ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); - _mm_store_ss(w,vResult); -#endif -} - -//------------------------------------------------------------------------------ - -// Return an integer value via an index. This is not a recommended -// function to use due to performance loss. -inline uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i) -{ - assert( i < 4 ); - _Analysis_assume_( i < 4 ); -#if defined(_XM_NO_INTRINSICS_) - return V.vector4_u32[i]; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return V.n128_u32[i]; -#elif defined(_XM_SSE_INTRINSICS_) - return V.m128_u32[i]; -#endif -} - -//------------------------------------------------------------------------------ - -// Return the X component in an integer register. 
-inline uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    return V.vector4_u32[0];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vgetq_lane_u32(V, 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_castps_si128(V)));
-#endif
-}
-
-// Return the Y component in an integer register.
-inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    return V.vector4_u32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vgetq_lane_u32(V, 1);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128i V1 = _mm_castps_si128( V );
-    return static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(1,1,1,1));
-    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
-#endif
-}
-
-// Return the Z component in an integer register.
-inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    return V.vector4_u32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vgetq_lane_u32(V, 2);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128i V1 = _mm_castps_si128( V );
-    return static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(2,2,2,2));
-    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
-#endif
-}
-
-// Return the W component in an integer register.
-inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    return V.vector4_u32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vgetq_lane_u32(V, 3);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128i V1 = _mm_castps_si128( V );
-    return static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(3,3,3,3));
-    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-// Store a component indexed by i into a 32 bit integer location in memory.
-_Use_decl_annotations_
-inline void XM_CALLCONV XMVectorGetIntByIndexPtr(uint32_t *x, FXMVECTOR V, size_t i)
-{
-    assert( x != nullptr );
-    assert( i < 4 );
-    _Analysis_assume_( i < 4 );
-#if defined(_XM_NO_INTRINSICS_)
-    *x = V.vector4_u32[i];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    *x = V.n128_u32[i];
-#elif defined(_XM_SSE_INTRINSICS_)
-    *x = V.m128_u32[i];
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-// Store the X component into a 32 bit integer location in memory.
-_Use_decl_annotations_
-inline void XM_CALLCONV XMVectorGetIntXPtr(uint32_t *x, FXMVECTOR V)
-{
-    assert( x != nullptr );
-#if defined(_XM_NO_INTRINSICS_)
-    *x = V.vector4_u32[0];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_u32(x,*reinterpret_cast<const uint32x4_t*>(&V),0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_ss(reinterpret_cast<float*>(x),V);
-#endif
-}
-
-// Store the Y component into a 32 bit integer location in memory.
-_Use_decl_annotations_
-inline void XM_CALLCONV XMVectorGetIntYPtr(uint32_t *y, FXMVECTOR V)
-{
-    assert( y != nullptr );
-#if defined(_XM_NO_INTRINSICS_)
-    *y = V.vector4_u32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_u32(y,*reinterpret_cast<const uint32x4_t*>(&V),1);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128i V1 = _mm_castps_si128( V );
-    *y = static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-    _mm_store_ss(reinterpret_cast<float*>(y),vResult);
-#endif
-}
-
-// Store the Z component into a 32 bit integer location in memory.
-_Use_decl_annotations_
-inline void XM_CALLCONV XMVectorGetIntZPtr(uint32_t *z, FXMVECTOR V)
-{
-    assert( z != nullptr );
-#if defined(_XM_NO_INTRINSICS_)
-    *z = V.vector4_u32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_u32(z,*reinterpret_cast<const uint32x4_t*>(&V),2);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128i V1 = _mm_castps_si128( V );
-    *z = static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-    _mm_store_ss(reinterpret_cast<float*>(z),vResult);
-#endif
-}
-
-// Store the W component into a 32 bit integer location in memory.
-_Use_decl_annotations_
-inline void XM_CALLCONV XMVectorGetIntWPtr(uint32_t *w, FXMVECTOR V)
-{
-    assert( w != nullptr );
-#if defined(_XM_NO_INTRINSICS_)
-    *w = V.vector4_u32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_u32(w,*reinterpret_cast<const uint32x4_t*>(&V),3);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128i V1 = _mm_castps_si128( V );
-    *w = static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
-    _mm_store_ss(reinterpret_cast<float*>(w),vResult);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-// Set a single indexed floating point component
-inline XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V, float f, size_t i)
-{
-    assert( i < 4 );
-    _Analysis_assume_( i < 4 );
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR U;
-    U = V;
-    U.vector4_f32[i] = f;
-    return U;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMVECTOR U = V;
-    U.n128_f32[i] = f;
-    return U;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR U = V;
-    U.m128_f32[i] = f;
-    return U;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-// Sets the X component of a vector to a passed floating point value
-inline XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR U;
-    U.vector4_f32[0] = x;
-    U.vector4_f32[1] = V.vector4_f32[1];
-    U.vector4_f32[2] = V.vector4_f32[2];
-    U.vector4_f32[3] = V.vector4_f32[3];
-    return U;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vsetq_lane_f32(x,V,0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = _mm_set_ss(x);
-    vResult = _mm_move_ss(V,vResult);
-    return vResult;
-#endif
-}
-
-// Sets the Y component of a vector to a passed floating point value
-inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR U;
-    U.vector4_f32[0] = V.vector4_f32[0];
-    U.vector4_f32[1] = y;
-    U.vector4_f32[2] = V.vector4_f32[2];
-    U.vector4_f32[3] = V.vector4_f32[3];
-    return U;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vsetq_lane_f32(y,V,1);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vResult = _mm_set_ss(y);
-    vResult = _mm_insert_ps( V, vResult, 0x10 );
-    return vResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Swap
y and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); - // Convert input to vector - XMVECTOR vTemp = _mm_set_ss(y); - // Replace the x component - vResult = _mm_move_ss(vResult,vTemp); - // Swap y and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1)); - return vResult; -#endif -} -// Sets the Z component of a vector to a passed floating point value -inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_f32[0] = V.vector4_f32[0]; - U.vector4_f32[1] = V.vector4_f32[1]; - U.vector4_f32[2] = z; - U.vector4_f32[3] = V.vector4_f32[3]; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vsetq_lane_f32(z,V,2); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vResult = _mm_set_ss(z); - vResult = _mm_insert_ps( V, vResult, 0x20 ); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - // Swap z and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); - // Convert input to vector - XMVECTOR vTemp = _mm_set_ss(z); - // Replace the x component - vResult = _mm_move_ss(vResult,vTemp); - // Swap z and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); - return vResult; -#endif -} - -// Sets the W component of a vector to a passed floating point value -inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_f32[0] = V.vector4_f32[0]; - U.vector4_f32[1] = V.vector4_f32[1]; - U.vector4_f32[2] = V.vector4_f32[2]; - U.vector4_f32[3] = w; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vsetq_lane_f32(w,V,3); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vResult = _mm_set_ss(w); - vResult = _mm_insert_ps( V, vResult, 0x30 ); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - // Swap w and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); - // Convert input to vector - XMVECTOR vTemp = _mm_set_ss(w); - // Replace the x component - vResult = _mm_move_ss(vResult,vTemp); - // Swap w and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -// Sets a component of a vector to a floating point value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorSetByIndexPtr(FXMVECTOR V, const float *f, size_t i) -{ - assert( f != nullptr ); - assert( i < 4 ); - _Analysis_assume_( i < 4 ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U = V; - U.vector4_f32[i] = *f; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR U = V; - U.n128_f32[i] = *f; - return U; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR U = V; - U.m128_f32[i] = *f; - return U; -#endif -} - -//------------------------------------------------------------------------------ - -// Sets the X component of a vector to a floating point value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorSetXPtr(FXMVECTOR V, const float *x) -{ - assert( x != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_f32[0] = *x; - U.vector4_f32[1] = V.vector4_f32[1]; - U.vector4_f32[2] = V.vector4_f32[2]; - U.vector4_f32[3] = V.vector4_f32[3]; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_lane_f32(x,V,0); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = _mm_load_ss(x); - vResult = _mm_move_ss(V,vResult); - return vResult; -#endif -} - -// Sets the Y component of a vector to a floating point value passed by pointer 
-_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorSetYPtr(FXMVECTOR V, const float *y) -{ - assert( y != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_f32[0] = V.vector4_f32[0]; - U.vector4_f32[1] = *y; - U.vector4_f32[2] = V.vector4_f32[2]; - U.vector4_f32[3] = V.vector4_f32[3]; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_lane_f32(y,V,1); -#elif defined(_XM_SSE_INTRINSICS_) - // Swap y and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); - // Convert input to vector - XMVECTOR vTemp = _mm_load_ss(y); - // Replace the x component - vResult = _mm_move_ss(vResult,vTemp); - // Swap y and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1)); - return vResult; -#endif -} - -// Sets the Z component of a vector to a floating point value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorSetZPtr(FXMVECTOR V, const float *z) -{ - assert( z != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_f32[0] = V.vector4_f32[0]; - U.vector4_f32[1] = V.vector4_f32[1]; - U.vector4_f32[2] = *z; - U.vector4_f32[3] = V.vector4_f32[3]; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_lane_f32(z,V,2); -#elif defined(_XM_SSE_INTRINSICS_) - // Swap z and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); - // Convert input to vector - XMVECTOR vTemp = _mm_load_ss(z); - // Replace the x component - vResult = _mm_move_ss(vResult,vTemp); - // Swap z and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); - return vResult; -#endif -} - -// Sets the W component of a vector to a floating point value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorSetWPtr(FXMVECTOR V, const float *w) -{ - assert( w != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_f32[0] = V.vector4_f32[0]; - U.vector4_f32[1] = V.vector4_f32[1]; - U.vector4_f32[2] = V.vector4_f32[2]; - U.vector4_f32[3] = *w; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_lane_f32(w,V,3); -#elif defined(_XM_SSE_INTRINSICS_) - // Swap w and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); - // Convert input to vector - XMVECTOR vTemp = _mm_load_ss(w); - // Replace the x component - vResult = _mm_move_ss(vResult,vTemp); - // Swap w and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -// Sets a component of a vector to an integer passed by value -inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i) -{ - assert( i < 4 ); - _Analysis_assume_( i < 4 ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U = V; - U.vector4_u32[i] = x; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTORU32 tmp; - tmp.v = V; - tmp.u[i] = x; - return tmp; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTORU32 tmp; - tmp.v = V; - tmp.u[i] = x; - return tmp; -#endif -} - -//------------------------------------------------------------------------------ - -// Sets the X component of a vector to an integer passed by value -inline XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_u32[0] = x; - U.vector4_u32[1] = V.vector4_u32[1]; - U.vector4_u32[2] = V.vector4_u32[2]; - U.vector4_u32[3] = V.vector4_u32[3]; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vsetq_lane_u32(x,V,0); -#elif 
defined(_XM_SSE_INTRINSICS_) - __m128i vTemp = _mm_cvtsi32_si128(x); - XMVECTOR vResult = _mm_move_ss(V,_mm_castsi128_ps(vTemp)); - return vResult; -#endif -} - -// Sets the Y component of a vector to an integer passed by value -inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_u32[0] = V.vector4_u32[0]; - U.vector4_u32[1] = y; - U.vector4_u32[2] = V.vector4_u32[2]; - U.vector4_u32[3] = V.vector4_u32[3]; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vsetq_lane_u32(y,V,1); -#elif defined(_XM_SSE4_INTRINSICS_) - __m128i vResult = _mm_castps_si128( V ); - vResult = _mm_insert_epi32( vResult, static_cast(y), 1 ); - return _mm_castsi128_ps( vResult ); -#elif defined(_XM_SSE_INTRINSICS_) - // Swap y and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); - // Convert input to vector - __m128i vTemp = _mm_cvtsi32_si128(y); - // Replace the x component - vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp)); - // Swap y and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1)); - return vResult; -#endif -} - -// Sets the Z component of a vector to an integer passed by value -inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_u32[0] = V.vector4_u32[0]; - U.vector4_u32[1] = V.vector4_u32[1]; - U.vector4_u32[2] = z; - U.vector4_u32[3] = V.vector4_u32[3]; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vsetq_lane_u32(z,V,2); -#elif defined(_XM_SSE4_INTRINSICS_) - __m128i vResult = _mm_castps_si128( V ); - vResult = _mm_insert_epi32( vResult, static_cast(z), 2 ); - return _mm_castsi128_ps( vResult ); -#elif defined(_XM_SSE_INTRINSICS_) - // Swap z and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); - // Convert input to vector - __m128i vTemp = _mm_cvtsi32_si128(z); - // Replace the x component - vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp)); - // Swap z and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); - return vResult; -#endif -} - -// Sets the W component of a vector to an integer passed by value -inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_u32[0] = V.vector4_u32[0]; - U.vector4_u32[1] = V.vector4_u32[1]; - U.vector4_u32[2] = V.vector4_u32[2]; - U.vector4_u32[3] = w; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vsetq_lane_u32(w,V,3); -#elif defined(_XM_SSE4_INTRINSICS_) - __m128i vResult = _mm_castps_si128( V ); - vResult = _mm_insert_epi32( vResult, static_cast(w), 3 ); - return _mm_castsi128_ps( vResult ); -#elif defined(_XM_SSE_INTRINSICS_) - // Swap w and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); - // Convert input to vector - __m128i vTemp = _mm_cvtsi32_si128(w); - // Replace the x component - vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp)); - // Swap w and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -// Sets a component of a vector to an integer value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(FXMVECTOR V, const uint32_t *x, size_t i) -{ - assert( x != nullptr ); - assert( i < 4 ); - _Analysis_assume_( i < 4 ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U = V; - U.vector4_u32[i] = *x; - return U; -#elif 
defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTORU32 tmp; - tmp.v = V; - tmp.u[i] = *x; - return tmp; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTORU32 tmp; - tmp.v = V; - tmp.u[i] = *x; - return tmp; -#endif -} - -//------------------------------------------------------------------------------ - -// Sets the X component of a vector to an integer value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(FXMVECTOR V, const uint32_t *x) -{ - assert( x != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_u32[0] = *x; - U.vector4_u32[1] = V.vector4_u32[1]; - U.vector4_u32[2] = V.vector4_u32[2]; - U.vector4_u32[3] = V.vector4_u32[3]; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_lane_u32(x,*reinterpret_cast(&V),0); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_load_ss(reinterpret_cast(x)); - XMVECTOR vResult = _mm_move_ss(V,vTemp); - return vResult; -#endif -} - -// Sets the Y component of a vector to an integer value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorSetIntYPtr(FXMVECTOR V, const uint32_t *y) -{ - assert( y != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_u32[0] = V.vector4_u32[0]; - U.vector4_u32[1] = *y; - U.vector4_u32[2] = V.vector4_u32[2]; - U.vector4_u32[3] = V.vector4_u32[3]; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_lane_u32(y,*reinterpret_cast(&V),1); -#elif defined(_XM_SSE_INTRINSICS_) - // Swap y and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); - // Convert input to vector - XMVECTOR vTemp = _mm_load_ss(reinterpret_cast(y)); - // Replace the x component - vResult = _mm_move_ss(vResult,vTemp); - // Swap y and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1)); - return vResult; -#endif -} - -// Sets the Z component of a vector to an integer value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorSetIntZPtr(FXMVECTOR V, const uint32_t *z) -{ - assert( z != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_u32[0] = V.vector4_u32[0]; - U.vector4_u32[1] = V.vector4_u32[1]; - U.vector4_u32[2] = *z; - U.vector4_u32[3] = V.vector4_u32[3]; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_lane_u32(z,*reinterpret_cast(&V),2); -#elif defined(_XM_SSE_INTRINSICS_) - // Swap z and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); - // Convert input to vector - XMVECTOR vTemp = _mm_load_ss(reinterpret_cast(z)); - // Replace the x component - vResult = _mm_move_ss(vResult,vTemp); - // Swap z and x again - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); - return vResult; -#endif -} - -// Sets the W component of a vector to an integer value passed by pointer -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorSetIntWPtr(FXMVECTOR V, const uint32_t *w) -{ - assert( w != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR U; - U.vector4_u32[0] = V.vector4_u32[0]; - U.vector4_u32[1] = V.vector4_u32[1]; - U.vector4_u32[2] = V.vector4_u32[2]; - U.vector4_u32[3] = *w; - return U; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vld1q_lane_u32(w,*reinterpret_cast(&V),3); -#elif defined(_XM_SSE_INTRINSICS_) - // Swap w and x - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); - // Convert input to vector - XMVECTOR vTemp = _mm_load_ss(reinterpret_cast(w)); - // Replace the x component - vResult = _mm_move_ss(vResult,vTemp); - // Swap w and x again - vResult = 
XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorSwizzle -( - FXMVECTOR V, - uint32_t E0, - uint32_t E1, - uint32_t E2, - uint32_t E3 -) -{ - assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) ); - _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) ); -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result = { V.vector4_f32[E0], - V.vector4_f32[E1], - V.vector4_f32[E2], - V.vector4_f32[E3] }; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const uint32_t ControlElement[ 4 ] = - { - 0x03020100, // XM_SWIZZLE_X - 0x07060504, // XM_SWIZZLE_Y - 0x0B0A0908, // XM_SWIZZLE_Z - 0x0F0E0D0C, // XM_SWIZZLE_W - }; - - int8x8x2_t tbl; - tbl.val[0] = vget_low_f32(V); - tbl.val[1] = vget_high_f32(V); - - uint32x2_t idx = vcreate_u32( ((uint64_t)ControlElement[E0]) | (((uint64_t)ControlElement[E1]) << 32) ); - const uint8x8_t rL = vtbl2_u8( tbl, idx ); - - idx = vcreate_u32( ((uint64_t)ControlElement[E2]) | (((uint64_t)ControlElement[E3]) << 32) ); - const uint8x8_t rH = vtbl2_u8( tbl, idx ); - - return vcombine_f32( rL, rH ); -#elif defined(_XM_AVX_INTRINSICS_) - unsigned int elem[4] = { E0, E1, E2, E3 }; - __m128i vControl = _mm_loadu_si128( reinterpret_cast<const __m128i*>(&elem[0]) ); - return _mm_permutevar_ps( V, vControl ); -#else - const uint32_t *aPtr = (const uint32_t* )(&V); - - XMVECTOR Result; - uint32_t *pWork = (uint32_t*)(&Result); - - pWork[0] = aPtr[E0]; - pWork[1] = aPtr[E1]; - pWork[2] = aPtr[E2]; - pWork[3] = aPtr[E3]; - - return Result; -#endif -} - -//------------------------------------------------------------------------------ -inline XMVECTOR XM_CALLCONV XMVectorPermute -( - FXMVECTOR V1, - FXMVECTOR V2, - uint32_t PermuteX, - uint32_t PermuteY, - uint32_t PermuteZ, - uint32_t PermuteW -) -{ - assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 ); - _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 ); - -#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - static const uint32_t ControlElement[ 8 ] = - { - 0x03020100, // XM_PERMUTE_0X - 0x07060504, // XM_PERMUTE_0Y - 0x0B0A0908, // XM_PERMUTE_0Z - 0x0F0E0D0C, // XM_PERMUTE_0W - 0x13121110, // XM_PERMUTE_1X - 0x17161514, // XM_PERMUTE_1Y - 0x1B1A1918, // XM_PERMUTE_1Z - 0x1F1E1D1C, // XM_PERMUTE_1W - }; - - int8x8x4_t tbl; - tbl.val[0] = vget_low_f32(V1); - tbl.val[1] = vget_high_f32(V1); - tbl.val[2] = vget_low_f32(V2); - tbl.val[3] = vget_high_f32(V2); - - uint32x2_t idx = vcreate_u32( ((uint64_t)ControlElement[PermuteX]) | (((uint64_t)ControlElement[PermuteY]) << 32) ); - const uint8x8_t rL = vtbl4_u8( tbl, idx ); - - idx = vcreate_u32( ((uint64_t)ControlElement[PermuteZ]) | (((uint64_t)ControlElement[PermuteW]) << 32) ); - const uint8x8_t rH = vtbl4_u8( tbl, idx ); - - return vcombine_f32( rL, rH ); -#elif defined(_XM_AVX_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - static const XMVECTORU32 three = { 3, 3, 3, 3 }; - - __declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW }; - __m128i vControl = _mm_load_si128( reinterpret_cast<const __m128i*>(&elem[0]) ); - - __m128i vSelect = _mm_cmpgt_epi32( vControl, three ); - vControl = _mm_castps_si128( _mm_and_ps( _mm_castsi128_ps( vControl ), three ) ); - - __m128 shuffled1 = _mm_permutevar_ps( V1, vControl ); - __m128 shuffled2 = _mm_permutevar_ps( V2, vControl ); - - __m128 masked1 = _mm_andnot_ps( _mm_castsi128_ps( vSelect ), 
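/*
 [Editor's note - illustrative addition, not part of the original source.]
 In this AVX path each permute index 0-7 encodes a source and a lane: indices
 0-3 read V1 and 4-7 read V2, with the lane given by (index & 3). Both inputs
 are permuted by the masked lane index, and the (index > 3) mask decides which
 shuffled result survives the merge. For example, a hypothetical request
 { 0, 5, 2, 7 } produces { V1.x, V2.y, V1.z, V2.w }.
*/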
shuffled1 ); - __m128 masked2 = _mm_and_ps( _mm_castsi128_ps( vSelect ), shuffled2 ); - - return _mm_or_ps( masked1, masked2 ); -#else - - const uint32_t *aPtr[2]; - aPtr[0] = (const uint32_t* )(&V1); - aPtr[1] = (const uint32_t* )(&V2); - - XMVECTOR Result; - uint32_t *pWork = (uint32_t*)(&Result); - - const uint32_t i0 = PermuteX & 3; - const uint32_t vi0 = PermuteX >> 2; - pWork[0] = aPtr[vi0][i0]; - - const uint32_t i1 = PermuteY & 3; - const uint32_t vi1 = PermuteY >> 2; - pWork[1] = aPtr[vi1][i1]; - - const uint32_t i2 = PermuteZ & 3; - const uint32_t vi2 = PermuteZ >> 2; - pWork[2] = aPtr[vi2][i2]; - - const uint32_t i3 = PermuteW & 3; - const uint32_t vi3 = PermuteW >> 2; - pWork[3] = aPtr[vi3][i3]; - - return Result; -#endif -} - -//------------------------------------------------------------------------------ -// Define a control vector to be used in XMVectorSelect -// operations. The four integers specified in XMVectorSelectControl -// serve as indices to select between components in two vectors. -// The first index controls selection for the first component of -// the vectors involved in a select operation, the second index -// controls selection for the second component etc. A value of -// zero for an index causes the corresponding component from the first -// vector to be selected whereas a one causes the component from the -// second vector to be selected instead. - -inline XMVECTOR XM_CALLCONV XMVectorSelectControl -( - uint32_t VectorIndex0, - uint32_t VectorIndex1, - uint32_t VectorIndex2, - uint32_t VectorIndex3 -) -{ -#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - // x=Index0,y=Index1,z=Index2,w=Index3 - __m128i vTemp = _mm_set_epi32(VectorIndex3,VectorIndex2,VectorIndex1,VectorIndex0); - // Any non-zero entries become 0xFFFFFFFF else 0 - vTemp = _mm_cmpgt_epi32(vTemp,g_XMZero); - return _mm_castsi128_ps(vTemp); -#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - int32x2_t V0 = vcreate_s32(((uint64_t)VectorIndex0) | ((uint64_t)VectorIndex1 << 32)); - int32x2_t V1 = vcreate_s32(((uint64_t)VectorIndex2) | ((uint64_t)VectorIndex3 << 32)); - int32x4_t vTemp = vcombine_s32(V0, V1); - // Any non-zero entries become 0xFFFFFFFF else 0 - return vcgtq_s32(vTemp,g_XMZero); -#else - XMVECTOR ControlVector; - const uint32_t ControlElement[] = - { - XM_SELECT_0, - XM_SELECT_1 - }; - - assert(VectorIndex0 < 2); - assert(VectorIndex1 < 2); - assert(VectorIndex2 < 2); - assert(VectorIndex3 < 2); - _Analysis_assume_(VectorIndex0 < 2); - _Analysis_assume_(VectorIndex1 < 2); - _Analysis_assume_(VectorIndex2 < 2); - _Analysis_assume_(VectorIndex3 < 2); - - ControlVector.vector4_u32[0] = ControlElement[VectorIndex0]; - ControlVector.vector4_u32[1] = ControlElement[VectorIndex1]; - ControlVector.vector4_u32[2] = ControlElement[VectorIndex2]; - ControlVector.vector4_u32[3] = ControlElement[VectorIndex3]; - - return ControlVector; - -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorSelect -( - FXMVECTOR V1, - FXMVECTOR V2, - FXMVECTOR Control -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_u32[0] = (V1.vector4_u32[0] & ~Control.vector4_u32[0]) | (V2.vector4_u32[0] & Control.vector4_u32[0]); - Result.vector4_u32[1] = (V1.vector4_u32[1] & ~Control.vector4_u32[1]) | (V2.vector4_u32[1] & Control.vector4_u32[1]); - Result.vector4_u32[2] = (V1.vector4_u32[2] & ~Control.vector4_u32[2]) | (V2.vector4_u32[2] & Control.vector4_u32[2]); - 
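/*
 [Editor's note - illustrative addition, not part of the original source.]
 A sketch of the XMVectorSelectControl contract documented above, using only
 public DirectXMath calls and hypothetical inputs v1, v2:

     XMVECTOR c = XMVectorSelectControl(0, 1, 0, 1);
     XMVECTOR r = XMVectorSelect(v1, v2, c);
     // r = { v1.x, v2.y, v1.z, v2.w }
*/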
Result.vector4_u32[3] = (V1.vector4_u32[3] & ~Control.vector4_u32[3]) | (V2.vector4_u32[3] & Control.vector4_u32[3]); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vbslq_f32( Control, V2, V1 ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp1 = _mm_andnot_ps(Control,V1); - XMVECTOR vTemp2 = _mm_and_ps(V2,Control); - return _mm_or_ps(vTemp1,vTemp2); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorMergeXY -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_u32[0] = V1.vector4_u32[0]; - Result.vector4_u32[1] = V2.vector4_u32[0]; - Result.vector4_u32[2] = V1.vector4_u32[1]; - Result.vector4_u32[3] = V2.vector4_u32[1]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vzipq_f32( V1, V2 ).val[0]; -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_unpacklo_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorMergeZW -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_u32[0] = V1.vector4_u32[2]; - Result.vector4_u32[1] = V2.vector4_u32[2]; - Result.vector4_u32[2] = V1.vector4_u32[3]; - Result.vector4_u32[3] = V2.vector4_u32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vzipq_f32( V1, V2 ).val[1]; -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_unpackhi_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) -{ - assert( Elements < 4 ); - _Analysis_assume_( Elements < 4 ); - return XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3)); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) -{ - assert( Elements < 4 ); - _Analysis_assume_( Elements < 4 ); - return XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) -{ - assert( Elements < 4 ); - _Analysis_assume_( Elements < 4 ); - return XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 ); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, uint32_t VSLeftRotateElements, - uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3) -{ - XMVECTOR Control = XMVectorSelectControl(Select0&1, Select1&1, Select2&1, Select3&1); - return XMVectorSelect( VD, XMVectorRotateLeft(VS, VSLeftRotateElements), Control ); -} - -//------------------------------------------------------------------------------ -// Comparison operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 
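/*
 [Editor's note - illustrative addition, not part of the original source.]
 The comparison functions in this section return per-lane masks (all ones or
 all zeros) rather than bools, so their results feed directly into bitwise
 selection; e.g. a branchless per-lane maximum for hypothetical a, b:

     XMVECTOR mask = XMVectorGreater(a, b);
     XMVECTOR m    = XMVectorSelect(b, a, mask); // a where a > b, else b
*/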
0xFFFFFFFF : 0; - Control.vector4_u32[1] = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[2] = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[3] = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vceqq_f32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_cmpeq_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorEqualR -( - uint32_t* pCR, - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - assert( pCR != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - uint32_t ux = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; - uint32_t uy = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; - uint32_t uz = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; - uint32_t uw = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0; - uint32_t CR = 0; - if (ux&uy&uz&uw) - { - // All elements are equal - CR = XM_CRMASK_CR6TRUE; - } - else if (!(ux|uy|uz|uw)) - { - // All elements are not equal - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - - XMVECTOR Control; - Control.vector4_u32[0] = ux; - Control.vector4_u32[1] = uy; - Control.vector4_u32[2] = uz; - Control.vector4_u32[3] = uw; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - uint32_t CR = 0; - if ( r == 0xFFFFFFFFU ) - { - // All elements are equal - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - // All elements are not equal - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); - uint32_t CR = 0; - int iTest = _mm_movemask_ps(vTemp); - if (iTest==0xf) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - // All elements are not equal - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - return vTemp; -#endif -} - -//------------------------------------------------------------------------------ -// Treat the components of the vectors as unsigned integers and -// compare individual bits between the two. This is useful for -// comparing control vectors and result vectors returned from -// other comparison operations. - -inline XMVECTOR XM_CALLCONV XMVectorEqualInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = (V1.vector4_u32[0] == V2.vector4_u32[0]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[1] = (V1.vector4_u32[1] == V2.vector4_u32[1]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[2] = (V1.vector4_u32[2] == V2.vector4_u32[2]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[3] = (V1.vector4_u32[3] == V2.vector4_u32[3]) ? 
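/*
 [Editor's note - illustrative addition, not part of the original source.]
 The ...R variants additionally report a CR6-style summary through pCR, which
 DirectXMath's XMComparisonAllTrue / XMComparisonAnyFalse helpers decode:

     uint32_t cr;
     XMVECTOR mask = XMVectorEqualR(&cr, a, b); // hypothetical a, b
     if (XMComparisonAllTrue(cr))
     {
         // all four lanes compared equal
     }
*/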
0xFFFFFFFF : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vceqq_u32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) ); - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorEqualIntR -( - uint32_t* pCR, - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - assert( pCR != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control = XMVectorEqualInt(V1, V2); - - *pCR = 0; - if (XMVector4EqualInt(Control, XMVectorTrueInt())) - { - // All elements are equal - *pCR |= XM_CRMASK_CR6TRUE; - } - else if (XMVector4EqualInt(Control, XMVectorFalseInt())) - { - // All elements are not equal - *pCR |= XM_CRMASK_CR6FALSE; - } - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_u32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - uint32_t CR = 0; - if ( r == 0xFFFFFFFFU ) - { - // All elements are equal - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - // All elements are not equal - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) ); - int iTemp = _mm_movemask_ps(_mm_castsi128_ps(V)); - uint32_t CR = 0; - if (iTemp==0x0F) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTemp) - { - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorNearEqual -( - FXMVECTOR V1, - FXMVECTOR V2, - FXMVECTOR Epsilon -) -{ -#if defined(_XM_NO_INTRINSICS_) - - float fDeltax = V1.vector4_f32[0]-V2.vector4_f32[0]; - float fDeltay = V1.vector4_f32[1]-V2.vector4_f32[1]; - float fDeltaz = V1.vector4_f32[2]-V2.vector4_f32[2]; - float fDeltaw = V1.vector4_f32[3]-V2.vector4_f32[3]; - - fDeltax = fabsf(fDeltax); - fDeltay = fabsf(fDeltay); - fDeltaz = fabsf(fDeltaz); - fDeltaw = fabsf(fDeltaw); - - XMVECTOR Control; - Control.vector4_u32[0] = (fDeltax <= Epsilon.vector4_f32[0]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[1] = (fDeltay <= Epsilon.vector4_f32[1]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[2] = (fDeltaz <= Epsilon.vector4_f32[2]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[3] = (fDeltaw <= Epsilon.vector4_f32[3]) ? 0xFFFFFFFFU : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR vDelta = vsubq_f32(V1,V2); - return vacleq_f32( vDelta, Epsilon ); -#elif defined(_XM_SSE_INTRINSICS_) - // Get the difference - XMVECTOR vDelta = _mm_sub_ps(V1,V2); - // Get the absolute value of the difference - XMVECTOR vTemp = _mm_setzero_ps(); - vTemp = _mm_sub_ps(vTemp,vDelta); - vTemp = _mm_max_ps(vTemp,vDelta); - vTemp = _mm_cmple_ps(vTemp,Epsilon); - return vTemp; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorNotEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = (V1.vector4_f32[0] != V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[1] = (V1.vector4_f32[1] != V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[2] = (V1.vector4_f32[2] != V2.vector4_f32[2]) ? 
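/*
 [Editor's note - illustrative addition, not part of the original source.]
 XMVectorNearEqual above forms |V1 - V2| without an absolute-value
 instruction by taking max(0 - delta, delta). A typical per-lane tolerance
 test, with hypothetical inputs a, b:

     XMVECTOR close = XMVectorNearEqual(a, b, XMVectorReplicate(1e-4f));
*/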
0xFFFFFFFF : 0; - Control.vector4_u32[3] = (V1.vector4_f32[3] != V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vmvnq_u32(vceqq_f32(V1, V2)); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_cmpneq_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorNotEqualInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = (V1.vector4_u32[0] != V2.vector4_u32[0]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[1] = (V1.vector4_u32[1] != V2.vector4_u32[1]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[2] = (V1.vector4_u32[2] != V2.vector4_u32[2]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[3] = (V1.vector4_u32[3] != V2.vector4_u32[3]) ? 0xFFFFFFFFU : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vmvnq_u32(vceqq_u32(V1, V2)); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) ); - return _mm_xor_ps(_mm_castsi128_ps(V),g_XMNegOneMask); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorGreater -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[1] = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[2] = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[3] = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vcgtq_f32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_cmpgt_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorGreaterR -( - uint32_t* pCR, - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - assert( pCR != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - - uint32_t ux = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; - uint32_t uy = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; - uint32_t uz = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; - uint32_t uw = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 
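/*
 [Editor's note - illustrative addition, not part of the original source.]
 SSE2 has no integer not-equal compare, so XMVectorNotEqualInt above
 synthesizes one: compare for equality, then invert by XORing with the
 all-ones constant g_XMNegOneMask - the vector equivalent of ~x:

     __m128i eq  = _mm_cmpeq_epi32(_mm_castps_si128(a), _mm_castps_si128(b));
     XMVECTOR ne = _mm_xor_ps(_mm_castsi128_ps(eq), g_XMNegOneMask);
*/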
0xFFFFFFFFU : 0; - uint32_t CR = 0; - if (ux&uy&uz&uw) - { - // All elements are greater - CR = XM_CRMASK_CR6TRUE; - } - else if (!(ux|uy|uz|uw)) - { - // All elements are not greater - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - - XMVECTOR Control; - Control.vector4_u32[0] = ux; - Control.vector4_u32[1] = uy; - Control.vector4_u32[2] = uz; - Control.vector4_u32[3] = uw; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcgtq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - uint32_t CR = 0; - if ( r == 0xFFFFFFFFU ) - { - // All elements are greater - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - // All elements are not greater - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); - uint32_t CR = 0; - int iTest = _mm_movemask_ps(vTemp); - if (iTest==0xf) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - // All elements are not greater - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - return vTemp; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[1] = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[2] = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[3] = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vcgeq_f32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_cmpge_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqualR -( - uint32_t* pCR, - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - assert( pCR != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - - uint32_t ux = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; - uint32_t uy = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; - uint32_t uz = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; - uint32_t uw = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 
0xFFFFFFFFU : 0; - uint32_t CR = 0; - if (ux&uy&uz&uw) - { - // All elements are greater or equal - CR = XM_CRMASK_CR6TRUE; - } - else if (!(ux|uy|uz|uw)) - { - // All elements are not greater or equal - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - - XMVECTOR Control; - Control.vector4_u32[0] = ux; - Control.vector4_u32[1] = uy; - Control.vector4_u32[2] = uz; - Control.vector4_u32[3] = uw; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcgeq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - uint32_t CR = 0; - if ( r == 0xFFFFFFFFU ) - { - // All elements are greater or equal - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - // All elements are not greater or equal - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); - uint32_t CR = 0; - int iTest = _mm_movemask_ps(vTemp); - if (iTest==0xf) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - // All elements are not greater or equal - CR = XM_CRMASK_CR6FALSE; - } - *pCR = CR; - return vTemp; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorLess -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[1] = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[2] = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[3] = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vcltq_f32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_cmplt_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorLessOrEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = (V1.vector4_f32[0] <= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[1] = (V1.vector4_f32[1] <= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[2] = (V1.vector4_f32[2] <= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[3] = (V1.vector4_f32[3] <= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vcleq_f32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_cmple_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorInBounds -( - FXMVECTOR V, - FXMVECTOR Bounds -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[1] = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[2] = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFF : 0; - Control.vector4_u32[3] = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 
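/*
 [Editor's note - illustrative addition, not part of the original source.]
 Worked example of the bounds test computed here: with V = (1, -3, 2, 0) and
 Bounds = (2, 2, 2, 2), only lane 1 fails (-3 < -2), so the result mask is
 (0xFFFFFFFF, 0, 0xFFFFFFFF, 0xFFFFFFFF).
*/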
0xFFFFFFFF : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Test if less than or equal - XMVECTOR vTemp1 = vcleq_f32(V,Bounds); - // Negate the bounds - XMVECTOR vTemp2 = vnegq_f32(Bounds); - // Test if greater or equal (Reversed) - vTemp2 = vcleq_f32(vTemp2,V); - // Blend answers - vTemp1 = vandq_u32(vTemp1,vTemp2); - return vTemp1; -#elif defined(_XM_SSE_INTRINSICS_) - // Test if less than or equal - XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); - // Negate the bounds - XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); - // Test if greater or equal (Reversed) - vTemp2 = _mm_cmple_ps(vTemp2,V); - // Blend answers - vTemp1 = _mm_and_ps(vTemp1,vTemp2); - return vTemp1; -#endif -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV XMVectorInBoundsR -( - uint32_t* pCR, - FXMVECTOR V, - FXMVECTOR Bounds -) -{ - assert( pCR != nullptr ); -#if defined(_XM_NO_INTRINSICS_) - - uint32_t ux = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFFU : 0; - uint32_t uy = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFFU : 0; - uint32_t uz = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFFU : 0; - uint32_t uw = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFFU : 0; - - uint32_t CR = 0; - if (ux&uy&uz&uw) - { - // All elements are in bounds - CR = XM_CRMASK_CR6BOUNDS; - } - *pCR = CR; - - XMVECTOR Control; - Control.vector4_u32[0] = ux; - Control.vector4_u32[1] = uy; - Control.vector4_u32[2] = uz; - Control.vector4_u32[3] = uw; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Test if less than or equal - XMVECTOR vTemp1 = vcleq_f32(V,Bounds); - // Negate the bounds - XMVECTOR vTemp2 = vnegq_f32(Bounds); - // Test if greater or equal (Reversed) - vTemp2 = vcleq_f32(vTemp2,V); - // Blend answers - vTemp1 = vandq_u32(vTemp1,vTemp2); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - uint32_t CR = 0; - if ( r == 0xFFFFFFFFU ) - { - // All elements are in bounds - CR = XM_CRMASK_CR6BOUNDS; - } - *pCR = CR; - return vTemp1; -#elif defined(_XM_SSE_INTRINSICS_) - // Test if less than or equal - XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); - // Negate the bounds - XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); - // Test if greater or equal (Reversed) - vTemp2 = _mm_cmple_ps(vTemp2,V); - // Blend answers - vTemp1 = _mm_and_ps(vTemp1,vTemp2); - - uint32_t CR = 0; - if (_mm_movemask_ps(vTemp1)==0xf) { - // All elements are in bounds - CR = XM_CRMASK_CR6BOUNDS; - } - *pCR = CR; - return vTemp1; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorIsNaN -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = XMISNAN(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[1] = XMISNAN(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[2] = XMISNAN(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[3] = XMISNAN(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Test against itself. 
NaN is always not equal - uint32x4_t vTempNan = vceqq_f32( V, V ); - // Flip results - return vmvnq_u32( vTempNan ); -#elif defined(_XM_SSE_INTRINSICS_) - // Test against itself. NaN is always not equal - return _mm_cmpneq_ps(V,V); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorIsInfinite -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Control; - Control.vector4_u32[0] = XMISINF(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[1] = XMISINF(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[2] = XMISINF(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0; - Control.vector4_u32[3] = XMISINF(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0; - return Control; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Mask off the sign bit - uint32x4_t vTemp = vandq_u32(V,g_XMAbsMask); - // Compare to infinity - vTemp = vceqq_f32(vTemp,g_XMInfinity); - // If any are infinity, the signs are true. - return vTemp; -#elif defined(_XM_SSE_INTRINSICS_) - // Mask off the sign bit - __m128 vTemp = _mm_and_ps(V,g_XMAbsMask); - // Compare to infinity - vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); - // If any are infinity, the signs are true. - return vTemp; -#endif -} - -//------------------------------------------------------------------------------ -// Rounding and clamping operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorMin -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0]; - Result.vector4_f32[1] = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1]; - Result.vector4_f32[2] = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2]; - Result.vector4_f32[3] = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vminq_f32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_min_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorMax -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0]; - Result.vector4_f32[1] = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1]; - Result.vector4_f32[2] = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2]; - Result.vector4_f32[3] = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 
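/*
 [Editor's note - illustrative addition, not part of the original source.]
 The NaN test above exploits IEEE-754: NaN compares unequal to everything,
 itself included, so cmpneq(V, V) is all ones exactly in NaN lanes - e.g.
 XMVectorIsNaN(XMVectorSplatQNaN()) yields a fully set mask. The infinity
 test clears the sign bit first so that +INF and -INF both match
 g_XMInfinity.
*/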
V1.vector4_f32[3] : V2.vector4_f32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vmaxq_f32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_max_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -#ifdef _XM_NO_ROUNDF_ - -namespace Internal -{ - inline float round_to_nearest( float x ) - { - float i = floorf(x); - x -= i; - if(x < 0.5f) - return i; - if(x > 0.5f) - return i + 1.f; - - float int_part; - modff( i / 2.f, &int_part ); - if ( (2.f*int_part) == i ) - { - return i; - } - - return i + 1.f; - } -}; - -#endif - -#if !defined(_XM_NO_INTRINSICS_) -#pragma float_control(push) -#pragma float_control(precise, on) -#endif - -inline XMVECTOR XM_CALLCONV XMVectorRound -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - -#ifdef _XM_NO_ROUNDF_ - XMVECTOR Result; - Result.vector4_f32[0] = Internal::round_to_nearest( V.vector4_f32[0] ); - Result.vector4_f32[1] = Internal::round_to_nearest( V.vector4_f32[1] ); - Result.vector4_f32[2] = Internal::round_to_nearest( V.vector4_f32[2] ); - Result.vector4_f32[3] = Internal::round_to_nearest( V.vector4_f32[3] ); - return Result; -#else - XMVECTOR Result; - Result.vector4_f32[0] = roundf( V.vector4_f32[0] ); - Result.vector4_f32[1] = roundf( V.vector4_f32[1] ); - Result.vector4_f32[2] = roundf( V.vector4_f32[2] ); - Result.vector4_f32[3] = roundf( V.vector4_f32[3] ); - return Result; -#endif - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t sign = vandq_u32( V, g_XMNegativeZero ); - uint32x4_t sMagic = vorrq_u32( g_XMNoFraction, sign ); - float32x4_t R1 = vaddq_f32( V, sMagic ); - R1 = vsubq_f32( R1, sMagic ); - float32x4_t R2 = vabsq_f32( V ); - uint32x4_t mask = vcleq_f32( R2, g_XMNoFraction ); - XMVECTOR vResult = vbslq_f32( mask, R1, V ); - return vResult; -#elif defined(_XM_SSE4_INTRINSICS_) - return _mm_round_ps( V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128 sign = _mm_and_ps( V, g_XMNegativeZero ); - __m128 sMagic = _mm_or_ps( g_XMNoFraction, sign ); - __m128 R1 = _mm_add_ps( V, sMagic ); - R1 = _mm_sub_ps( R1, sMagic ); - __m128 R2 = _mm_and_ps( V, g_XMAbsMask ); - __m128 mask = _mm_cmple_ps( R2, g_XMNoFraction ); - R2 = _mm_andnot_ps(mask,V); - R1 = _mm_and_ps(R1,mask); - XMVECTOR vResult = _mm_xor_ps(R1, R2); - return vResult; -#endif -} - -#if !defined(_XM_NO_INTRINSICS_) -#pragma float_control(pop) -#endif - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorTruncate -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - uint32_t i; - - // Avoid C4701 - Result.vector4_f32[0] = 0.0f; - - for (i = 0; i < 4; i++) - { - if (XMISNAN(V.vector4_f32[i])) - { - Result.vector4_u32[i] = 0x7FC00000; - } - else if (fabsf(V.vector4_f32[i]) < 8388608.0f) - { - Result.vector4_f32[i] = (float)((int32_t)V.vector4_f32[i]); - } - else - { - Result.vector4_f32[i] = V.vector4_f32[i]; - } - } - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t vTest = vabsq_f32( V ); - vTest = vcltq_f32( vTest, g_XMNoFraction ); - - int32x4_t vInt = vcvtq_s32_f32( V ); - XMVECTOR vResult = vcvtq_f32_s32( vInt ); - - // All numbers less than 8388608 will use the round to int - // All others, use the ORIGINAL value - return vbslq_f32( vTest, vResult, V ); -#elif defined(_XM_SSE4_INTRINSICS_) - return _mm_round_ps( V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ); -#elif defined(_XM_SSE_INTRINSICS_) - // To handle NAN, 
INF and numbers greater than 8388608, use masking - // Get the abs value - __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask); - // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF - vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction); - // Convert to int and back to float for rounding with truncation - __m128i vInt = _mm_cvttps_epi32(V); - // Convert back to floats - XMVECTOR vResult = _mm_cvtepi32_ps(vInt); - // All numbers less than 8388608 will use the round to int - vResult = _mm_and_ps(vResult,_mm_castsi128_ps(vTest)); - // All others, use the ORIGINAL value - vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V)); - vResult = _mm_or_ps(vResult,_mm_castsi128_ps(vTest)); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorFloor -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = floorf( V.vector4_f32[0] ); - Result.vector4_f32[1] = floorf( V.vector4_f32[1] ); - Result.vector4_f32[2] = floorf( V.vector4_f32[2] ); - Result.vector4_f32[3] = floorf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t vTest = vabsq_f32( V ); - vTest = vcltq_f32( vTest, g_XMNoFraction ); - // Truncate - int32x4_t vInt = vcvtq_s32_f32( V ); - XMVECTOR vResult = vcvtq_f32_s32( vInt ); - XMVECTOR vLarger = vcgtq_f32( vResult, V ); - // 0 -> 0, 0xffffffff -> -1.0f - vLarger = vcvtq_f32_s32( vLarger ); - vResult = vaddq_f32( vResult, vLarger ); - // All numbers less than 8388608 will use the round to int - // All others, use the ORIGINAL value - return vbslq_f32( vTest, vResult, V ); -#elif defined(_XM_SSE4_INTRINSICS_) - return _mm_floor_ps( V ); -#elif defined(_XM_SSE_INTRINSICS_) - // To handle NAN, INF and numbers greater than 8388608, use masking - __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask); - vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction); - // Truncate - __m128i vInt = _mm_cvttps_epi32(V); - XMVECTOR vResult = _mm_cvtepi32_ps(vInt); - __m128 vLarger = _mm_cmpgt_ps( vResult, V ); - // 0 -> 0, 0xffffffff -> -1.0f - vLarger = _mm_cvtepi32_ps( _mm_castps_si128( vLarger ) ); - vResult = _mm_add_ps( vResult, vLarger ); - // All numbers less than 8388608 will use the round to int - vResult = _mm_and_ps(vResult,_mm_castsi128_ps(vTest)); - // All others, use the ORIGINAL value - vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V)); - vResult = _mm_or_ps(vResult,_mm_castsi128_ps(vTest)); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorCeiling -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = ceilf( V.vector4_f32[0] ); - Result.vector4_f32[1] = ceilf( V.vector4_f32[1] ); - Result.vector4_f32[2] = ceilf( V.vector4_f32[2] ); - Result.vector4_f32[3] = ceilf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t vTest = vabsq_f32( V ); - vTest = vcltq_f32( vTest, g_XMNoFraction ); - // Truncate - int32x4_t vInt = vcvtq_s32_f32( V ); - XMVECTOR vResult = vcvtq_f32_s32( vInt ); - XMVECTOR vSmaller = vcltq_f32( vResult, V ); - // 0 -> 0, 0xffffffff -> -1.0f - vSmaller = vcvtq_f32_s32( vSmaller ); - vResult = vsubq_f32( vResult, vSmaller ); - // All numbers less than 8388608 will use the round to int - // All others, use the ORIGINAL value - return vbslq_f32( vTest, vResult, V ); -#elif 
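/*
 [Editor's note - illustrative addition, not part of the original source.]
 g_XMNoFraction is 8388608.0f = 2^23, the smallest positive float magnitude
 from which every representable value is already an integer. That is why
 Truncate, Floor, and Ceiling above round small lanes through an int
 conversion but keep the original bits for lanes at or beyond 2^23
 (including INF and NaN, per the source comments). XMVectorRound's non-SSE4
 path uses the same fact: adding then subtracting a sign-adjusted 2^23
 forces rounding at integer granularity, e.g. 2.5f + 8388608.0f evaluates
 to 8388610.0f under the default round-to-nearest-even mode, and
 subtracting gives 2.0f.
*/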
defined(_XM_SSE4_INTRINSICS_) - return _mm_ceil_ps( V ); -#elif defined(_XM_SSE_INTRINSICS_) - // To handle NAN, INF and numbers greater than 8388608, use masking - __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask); - vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction); - // Truncate - __m128i vInt = _mm_cvttps_epi32(V); - XMVECTOR vResult = _mm_cvtepi32_ps(vInt); - __m128 vSmaller = _mm_cmplt_ps( vResult, V ); - // 0 -> 0, 0xffffffff -> -1.0f - vSmaller = _mm_cvtepi32_ps( _mm_castps_si128( vSmaller ) ); - vResult = _mm_sub_ps( vResult, vSmaller ); - // All numbers less than 8388608 will use the round to int - vResult = _mm_and_ps(vResult,_mm_castsi128_ps(vTest)); - // All others, use the ORIGINAL value - vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V)); - vResult = _mm_or_ps(vResult,_mm_castsi128_ps(vTest)); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorClamp -( - FXMVECTOR V, - FXMVECTOR Min, - FXMVECTOR Max -) -{ - assert(XMVector4LessOrEqual(Min, Max)); - -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result = XMVectorMax(Min, V); - Result = XMVectorMin(Max, Result); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR vResult; - vResult = vmaxq_f32(Min,V); - vResult = vminq_f32(vResult,Max); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult; - vResult = _mm_max_ps(Min,V); - vResult = _mm_min_ps(vResult,Max); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorSaturate -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - const XMVECTOR Zero = XMVectorZero(); - - return XMVectorClamp(V, Zero, g_XMOne.v); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Set <0 to 0 - XMVECTOR vResult = vmaxq_f32(V, vdupq_n_f32(0) ); - // Set>1 to 1 - return vminq_f32(vResult, vdupq_n_f32(1.0f) ); -#elif defined(_XM_SSE_INTRINSICS_) - // Set <0 to 0 - XMVECTOR vResult = _mm_max_ps(V,g_XMZero); - // Set>1 to 1 - return _mm_min_ps(vResult,g_XMOne); -#endif -} - -//------------------------------------------------------------------------------ -// Bitwise logical operations -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorAndInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_u32[0] = V1.vector4_u32[0] & V2.vector4_u32[0]; - Result.vector4_u32[1] = V1.vector4_u32[1] & V2.vector4_u32[1]; - Result.vector4_u32[2] = V1.vector4_u32[2] & V2.vector4_u32[2]; - Result.vector4_u32[3] = V1.vector4_u32[3] & V2.vector4_u32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vandq_u32(V1,V2); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_and_ps(V1,V2); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorAndCInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_u32[0] = V1.vector4_u32[0] & ~V2.vector4_u32[0]; - Result.vector4_u32[1] = V1.vector4_u32[1] & ~V2.vector4_u32[1]; - Result.vector4_u32[2] = V1.vector4_u32[2] & ~V2.vector4_u32[2]; - Result.vector4_u32[3] = V1.vector4_u32[3] & ~V2.vector4_u32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vbicq_u32(V1,V2); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_andnot_si128( 
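/*
 [Editor's note - illustrative addition, not part of the original source.]
 Note the argument order here: _mm_andnot_si128(a, b) computes (~a) & b, so
 V1 & ~V2 is spelled andnot(V2, V1) - the complement applies to the first
 operand only.
*/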
_mm_castps_si128(V2), _mm_castps_si128(V1) ); - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorOrInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_u32[0] = V1.vector4_u32[0] | V2.vector4_u32[0]; - Result.vector4_u32[1] = V1.vector4_u32[1] | V2.vector4_u32[1]; - Result.vector4_u32[2] = V1.vector4_u32[2] | V2.vector4_u32[2]; - Result.vector4_u32[3] = V1.vector4_u32[3] | V2.vector4_u32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vorrq_u32(V1,V2); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_or_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) ); - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorNorInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_u32[0] = ~(V1.vector4_u32[0] | V2.vector4_u32[0]); - Result.vector4_u32[1] = ~(V1.vector4_u32[1] | V2.vector4_u32[1]); - Result.vector4_u32[2] = ~(V1.vector4_u32[2] | V2.vector4_u32[2]); - Result.vector4_u32[3] = ~(V1.vector4_u32[3] | V2.vector4_u32[3]); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t Result = vorrq_u32(V1,V2); - return vbicq_u32(g_XMNegOneMask, Result); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i Result; - Result = _mm_or_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) ); - Result = _mm_andnot_si128( Result,g_XMNegOneMask); - return _mm_castsi128_ps(Result); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorXorInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_u32[0] = V1.vector4_u32[0] ^ V2.vector4_u32[0]; - Result.vector4_u32[1] = V1.vector4_u32[1] ^ V2.vector4_u32[1]; - Result.vector4_u32[2] = V1.vector4_u32[2] ^ V2.vector4_u32[2]; - Result.vector4_u32[3] = V1.vector4_u32[3] ^ V2.vector4_u32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return veorq_u32(V1,V2); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i V = _mm_xor_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) ); - return _mm_castsi128_ps(V); -#endif -} - -//------------------------------------------------------------------------------ -// Computation operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorNegate -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = -V.vector4_f32[0]; - Result.vector4_f32[1] = -V.vector4_f32[1]; - Result.vector4_f32[2] = -V.vector4_f32[2]; - Result.vector4_f32[3] = -V.vector4_f32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vnegq_f32(V); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR Z; - - Z = _mm_setzero_ps(); - - return _mm_sub_ps( Z, V ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorAdd -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = V1.vector4_f32[0] + V2.vector4_f32[0]; - Result.vector4_f32[1] = V1.vector4_f32[1] + V2.vector4_f32[1]; - Result.vector4_f32[2] = 
V1.vector4_f32[2] + V2.vector4_f32[2]; - Result.vector4_f32[3] = V1.vector4_f32[3] + V2.vector4_f32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vaddq_f32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_add_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorSum -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = - Result.vector4_f32[1] = - Result.vector4_f32[2] = - Result.vector4_f32[3] = V.vector4_f32[0] + V.vector4_f32[1] + V.vector4_f32[2] + V.vector4_f32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t v1 = vget_low_f32(V); - float32x2_t v2 = vget_high_f32(V); - v1 = vadd_f32(v1, v2); - v1 = vpadd_f32(v1, v1); - return vcombine_f32(v1, v1); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vTemp = _mm_hadd_ps(V, V); - return _mm_hadd_ps(vTemp,vTemp); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 3, 0, 1)); - XMVECTOR vTemp2 = _mm_add_ps(V, vTemp); - vTemp = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 0, 3, 2)); - return _mm_add_ps(vTemp, vTemp2); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorAddAngles -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - const XMVECTOR Zero = XMVectorZero(); - - // Add the given angles together. If the range of V1 is such - // that -Pi <= V1 < Pi and the range of V2 is such that - // -2Pi <= V2 <= 2Pi, then the range of the resulting angle - // will be -Pi <= Result < Pi. - XMVECTOR Result = XMVectorAdd(V1, V2); - - XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v); - XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask); - - Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v); - Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask); - - Result = XMVectorAdd(Result, Offset); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Adjust the angles - XMVECTOR vResult = vaddq_f32(V1,V2); - // Less than Pi? - uint32x4_t vOffset = vcltq_f32(vResult,g_XMNegativePi); - vOffset = vandq_u32(vOffset,g_XMTwoPi); - // Add 2Pi to all entries less than -Pi - vResult = vaddq_f32(vResult,vOffset); - // Greater than or equal to Pi? - vOffset = vcgeq_f32(vResult,g_XMPi); - vOffset = vandq_u32(vOffset,g_XMTwoPi); - // Sub 2Pi to all entries greater than Pi - vResult = vsubq_f32(vResult,vOffset); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - // Adjust the angles - XMVECTOR vResult = _mm_add_ps(V1,V2); - // Less than Pi? - XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi); - vOffset = _mm_and_ps(vOffset,g_XMTwoPi); - // Add 2Pi to all entries less than -Pi - vResult = _mm_add_ps(vResult,vOffset); - // Greater than or equal to Pi? 
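/*
 [Editor's note - illustrative addition, not part of the original source.]
 Worked example of the wrap performed here: lanes V1 = 3.0f and V2 = 1.0f
 sum to 4.0f, which is >= XM_PI, so 2*pi is subtracted and the lane becomes
 about -2.2832f, back inside the documented [-pi, pi) range.
*/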
- vOffset = _mm_cmpge_ps(vResult,g_XMPi); - vOffset = _mm_and_ps(vOffset,g_XMTwoPi); - // Sub 2Pi to all entries greater than Pi - vResult = _mm_sub_ps(vResult,vOffset); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorSubtract -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = V1.vector4_f32[0] - V2.vector4_f32[0]; - Result.vector4_f32[1] = V1.vector4_f32[1] - V2.vector4_f32[1]; - Result.vector4_f32[2] = V1.vector4_f32[2] - V2.vector4_f32[2]; - Result.vector4_f32[3] = V1.vector4_f32[3] - V2.vector4_f32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vsubq_f32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_sub_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorSubtractAngles -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - const XMVECTOR Zero = XMVectorZero(); - - // Subtract the given angles. If the range of V1 is such - // that -Pi <= V1 < Pi and the range of V2 is such that - // -2Pi <= V2 <= 2Pi, then the range of the resulting angle - // will be -Pi <= Result < Pi. - XMVECTOR Result = XMVectorSubtract(V1, V2); - - XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v); - XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask); - - Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v); - Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask); - - Result = XMVectorAdd(Result, Offset); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Adjust the angles - XMVECTOR vResult = vsubq_f32(V1,V2); - // Less than Pi? - uint32x4_t vOffset = vcltq_f32(vResult,g_XMNegativePi); - vOffset = vandq_u32(vOffset,g_XMTwoPi); - // Add 2Pi to all entries less than -Pi - vResult = vaddq_f32(vResult,vOffset); - // Greater than or equal to Pi? - vOffset = vcgeq_f32(vResult,g_XMPi); - vOffset = vandq_u32(vOffset,g_XMTwoPi); - // Sub 2Pi to all entries greater than Pi - vResult = vsubq_f32(vResult,vOffset); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - // Adjust the angles - XMVECTOR vResult = _mm_sub_ps(V1,V2); - // Less than Pi? - XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi); - vOffset = _mm_and_ps(vOffset,g_XMTwoPi); - // Add 2Pi to all entries less than -Pi - vResult = _mm_add_ps(vResult,vOffset); - // Greater than or equal to Pi? 
- vOffset = _mm_cmpge_ps(vResult,g_XMPi); - vOffset = _mm_and_ps(vOffset,g_XMTwoPi); - // Sub 2Pi to all entries greater than Pi - vResult = _mm_sub_ps(vResult,vOffset); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorMultiply -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = V1.vector4_f32[0] * V2.vector4_f32[0]; - Result.vector4_f32[1] = V1.vector4_f32[1] * V2.vector4_f32[1]; - Result.vector4_f32[2] = V1.vector4_f32[2] * V2.vector4_f32[2]; - Result.vector4_f32[3] = V1.vector4_f32[3] * V2.vector4_f32[3]; - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vmulq_f32( V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_mul_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd -( - FXMVECTOR V1, - FXMVECTOR V2, - FXMVECTOR V3 -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = V1.vector4_f32[0] * V2.vector4_f32[0] + V3.vector4_f32[0]; - Result.vector4_f32[1] = V1.vector4_f32[1] * V2.vector4_f32[1] + V3.vector4_f32[1]; - Result.vector4_f32[2] = V1.vector4_f32[2] * V2.vector4_f32[2] + V3.vector4_f32[2]; - Result.vector4_f32[3] = V1.vector4_f32[3] * V2.vector4_f32[3] + V3.vector4_f32[3]; - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vmlaq_f32( V3, V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = _mm_mul_ps( V1, V2 ); - return _mm_add_ps(vResult, V3 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorDivide -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = V1.vector4_f32[0] / V2.vector4_f32[0]; - Result.vector4_f32[1] = V1.vector4_f32[1] / V2.vector4_f32[1]; - Result.vector4_f32[2] = V1.vector4_f32[2] / V2.vector4_f32[2]; - Result.vector4_f32[3] = V1.vector4_f32[3] / V2.vector4_f32[3]; - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // 2 iterations of Newton-Raphson refinement of reciprocal - float32x4_t Reciprocal = vrecpeq_f32(V2); - float32x4_t S = vrecpsq_f32( Reciprocal, V2 ); - Reciprocal = vmulq_f32( S, Reciprocal ); - S = vrecpsq_f32( Reciprocal, V2 ); - Reciprocal = vmulq_f32( S, Reciprocal ); - return vmulq_f32( V1, Reciprocal ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_div_ps( V1, V2 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract -( - FXMVECTOR V1, - FXMVECTOR V2, - FXMVECTOR V3 -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = V3.vector4_f32[0] - (V1.vector4_f32[0] * V2.vector4_f32[0]); - Result.vector4_f32[1] = V3.vector4_f32[1] - (V1.vector4_f32[1] * V2.vector4_f32[1]); - Result.vector4_f32[2] = V3.vector4_f32[2] - (V1.vector4_f32[2] * V2.vector4_f32[2]); - Result.vector4_f32[3] = V3.vector4_f32[3] - (V1.vector4_f32[3] * V2.vector4_f32[3]); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vmlsq_f32( V3, V1, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR R = _mm_mul_ps( V1, V2 ); - return _mm_sub_ps( V3, R ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorScale -( - FXMVECTOR V, - float 
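/*
 [Editor's note - illustrative addition, not part of the original source.]
 In XMVectorDivide's NEON path above, vrecpsq_f32(x, d) returns (2 - x*d),
 so Reciprocal = vrecps(Reciprocal, V2) * Reciprocal is one Newton-Raphson
 step for 1/V2. Each step roughly doubles the correct bits of the ~8-bit
 vrecpe estimate, hence the two iterations before the final multiply by V1.
*/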
ScaleFactor -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = V.vector4_f32[0] * ScaleFactor; - Result.vector4_f32[1] = V.vector4_f32[1] * ScaleFactor; - Result.vector4_f32[2] = V.vector4_f32[2] * ScaleFactor; - Result.vector4_f32[3] = V.vector4_f32[3] * ScaleFactor; - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vmulq_n_f32( V, ScaleFactor ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = _mm_set_ps1(ScaleFactor); - return _mm_mul_ps(vResult,V); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorReciprocalEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = 1.f / V.vector4_f32[0]; - Result.vector4_f32[1] = 1.f / V.vector4_f32[1]; - Result.vector4_f32[2] = 1.f / V.vector4_f32[2]; - Result.vector4_f32[3] = 1.f / V.vector4_f32[3]; - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vrecpeq_f32(V); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_rcp_ps(V); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorReciprocal -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = 1.f / V.vector4_f32[0]; - Result.vector4_f32[1] = 1.f / V.vector4_f32[1]; - Result.vector4_f32[2] = 1.f / V.vector4_f32[2]; - Result.vector4_f32[3] = 1.f / V.vector4_f32[3]; - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // 2 iterations of Newton-Raphson refinement - float32x4_t Reciprocal = vrecpeq_f32(V); - float32x4_t S = vrecpsq_f32( Reciprocal, V ); - Reciprocal = vmulq_f32( S, Reciprocal ); - S = vrecpsq_f32( Reciprocal, V ); - return vmulq_f32( S, Reciprocal ); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_div_ps(g_XMOne,V); -#endif -} - -//------------------------------------------------------------------------------ -// Return an estimated square root -inline XMVECTOR XM_CALLCONV XMVectorSqrtEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = sqrtf( V.vector4_f32[0] ); - Result.vector4_f32[1] = sqrtf( V.vector4_f32[1] ); - Result.vector4_f32[2] = sqrtf( V.vector4_f32[2] ); - Result.vector4_f32[3] = sqrtf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // 1 iteration of Newton-Raphson refinement of sqrt - float32x4_t S0 = vrsqrteq_f32(V); - float32x4_t P0 = vmulq_f32( V, S0 ); - float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); - float32x4_t S1 = vmulq_f32( S0, R0 ); - - XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); - XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0) ); - XMVECTOR Result = vmulq_f32( V, S1 ); - XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); - return XMVectorSelect(V, Result, Select); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_sqrt_ps(V); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorSqrt -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = sqrtf( V.vector4_f32[0] ); - Result.vector4_f32[1] = sqrtf( V.vector4_f32[1] ); - Result.vector4_f32[2] = sqrtf( V.vector4_f32[2] ); - Result.vector4_f32[3] = sqrtf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // 3 iterations of Newton-Raphson refinement of sqrt - float32x4_t S0 = vrsqrteq_f32(V); - float32x4_t P0 = 
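/*
 [Editor's note - illustrative addition, not part of the original source.]
 The refinement loops here rely on vrsqrtsq_f32(p, s) = (3 - p*s) / 2 with
 p = V*s, giving s' = s * (3 - V*s*s) / 2: the Newton-Raphson step for
 1/sqrt(V). The final vmulq_f32(V, Sn) converts the reciprocal square root
 into sqrt(V), and the explicit zero/infinity selects restore sqrt(0) = 0
 and sqrt(INF) = INF, which the multiply alone would turn into NaN (0 * INF).
*/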
vmulq_f32( V, S0 ); - float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); - float32x4_t S1 = vmulq_f32( S0, R0 ); - float32x4_t P1 = vmulq_f32( V, S1 ); - float32x4_t R1 = vrsqrtsq_f32( P1, S1 ); - float32x4_t S2 = vmulq_f32( S1, R1 ); - float32x4_t P2 = vmulq_f32( V, S2 ); - float32x4_t R2 = vrsqrtsq_f32( P2, S2 ); - float32x4_t S3 = vmulq_f32( S2, R2 ); - - XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); - XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0) ); - XMVECTOR Result = vmulq_f32( V, S3 ); - XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); - return XMVectorSelect(V, Result, Select); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_sqrt_ps(V); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = 1.f / sqrtf( V.vector4_f32[0] ); - Result.vector4_f32[1] = 1.f / sqrtf( V.vector4_f32[1] ); - Result.vector4_f32[2] = 1.f / sqrtf( V.vector4_f32[2] ); - Result.vector4_f32[3] = 1.f / sqrtf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vrsqrteq_f32(V); -#elif defined(_XM_SSE_INTRINSICS_) - return _mm_rsqrt_ps(V); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = 1.f / sqrtf( V.vector4_f32[0] ); - Result.vector4_f32[1] = 1.f / sqrtf( V.vector4_f32[1] ); - Result.vector4_f32[2] = 1.f / sqrtf( V.vector4_f32[2] ); - Result.vector4_f32[3] = 1.f / sqrtf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // 2 iterations of Newton-Raphson refinement of reciprocal - float32x4_t S0 = vrsqrteq_f32(V); - - float32x4_t P0 = vmulq_f32( V, S0 ); - float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); - - float32x4_t S1 = vmulq_f32( S0, R0 ); - float32x4_t P1 = vmulq_f32( V, S1 ); - float32x4_t R1 = vrsqrtsq_f32( P1, S1 ); - - return vmulq_f32( S1, R1 ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = _mm_sqrt_ps(V); - vResult = _mm_div_ps(g_XMOne,vResult); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorExp2 -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = powf(2.0f, V.vector4_f32[0]); - Result.vector4_f32[1] = powf(2.0f, V.vector4_f32[1]); - Result.vector4_f32[2] = powf(2.0f, V.vector4_f32[2]); - Result.vector4_f32[3] = powf(2.0f, V.vector4_f32[3]); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - int32x4_t itrunc = vcvtq_s32_f32(V); - float32x4_t ftrunc = vcvtq_f32_s32(itrunc); - float32x4_t y = vsubq_f32(V, ftrunc); - - float32x4_t poly = vmlaq_f32( g_XMExpEst6, g_XMExpEst7, y ); - poly = vmlaq_f32( g_XMExpEst5, poly, y ); - poly = vmlaq_f32( g_XMExpEst4, poly, y ); - poly = vmlaq_f32( g_XMExpEst3, poly, y ); - poly = vmlaq_f32( g_XMExpEst2, poly, y ); - poly = vmlaq_f32( g_XMExpEst1, poly, y ); - poly = vmlaq_f32( g_XMOne, poly, y ); - - int32x4_t biased = vaddq_s32(itrunc, g_XMExponentBias); - biased = vshlq_n_s32(biased, 23); - float32x4_t result0 = XMVectorDivide(biased, poly); - - biased = vaddq_s32(itrunc, g_XM253); - biased = vshlq_n_s32(biased, 23); - float32x4_t result1 = XMVectorDivide(biased, poly); - result1 = vmulq_f32(g_XMMinNormal.v, 
result1); - - // Use selection to handle the cases - // if (V is NaN) -> QNaN; - // else if (V sign bit set) - // if (V > -150) - // if (V.exponent < -126) -> result1 - // else -> result0 - // else -> +0 - // else - // if (V < 128) -> result0 - // else -> +inf - - int32x4_t comp = vcltq_s32( V, g_XMBin128); - float32x4_t result2 = vbslq_f32( comp, result0, g_XMInfinity ); - - comp = vcltq_s32(itrunc, g_XMSubnormalExponent); - float32x4_t result3 = vbslq_f32( comp, result1, result0 ); - - comp = vcltq_s32(V, g_XMBinNeg150); - float32x4_t result4 = vbslq_f32( comp, result3, g_XMZero ); - - int32x4_t sign = vandq_s32(V, g_XMNegativeZero); - comp = vceqq_s32(sign, g_XMNegativeZero); - float32x4_t result5 = vbslq_f32( comp, result4, result2 ); - - int32x4_t t0 = vandq_s32(V, g_XMQNaNTest); - int32x4_t t1 = vandq_s32(V, g_XMInfinity); - t0 = vceqq_s32(t0, g_XMZero); - t1 = vceqq_s32(t1, g_XMInfinity); - int32x4_t isNaN = vbicq_s32( t1,t0); - - float32x4_t vResult = vbslq_f32( isNaN, g_XMQNaN, result5 ); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - __m128i itrunc = _mm_cvttps_epi32(V); - __m128 ftrunc = _mm_cvtepi32_ps(itrunc); - __m128 y = _mm_sub_ps(V, ftrunc); - __m128 poly = _mm_mul_ps(g_XMExpEst7, y); - poly = _mm_add_ps(g_XMExpEst6, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMExpEst5, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMExpEst4, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMExpEst3, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMExpEst2, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMExpEst1, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMOne, poly); - - __m128i biased = _mm_add_epi32(itrunc, g_XMExponentBias); - biased = _mm_slli_epi32(biased, 23); - __m128 result0 = _mm_div_ps(_mm_castsi128_ps(biased), poly); - - biased = _mm_add_epi32(itrunc, g_XM253); - biased = _mm_slli_epi32(biased, 23); - __m128 result1 = _mm_div_ps(_mm_castsi128_ps(biased), poly); - result1 = _mm_mul_ps(g_XMMinNormal.v, result1); - - // Use selection to handle the cases - // if (V is NaN) -> QNaN; - // else if (V sign bit set) - // if (V > -150) - // if (V.exponent < -126) -> result1 - // else -> result0 - // else -> +0 - // else - // if (V < 128) -> result0 - // else -> +inf - - __m128i comp = _mm_cmplt_epi32( _mm_castps_si128(V), g_XMBin128); - __m128i select0 = _mm_and_si128(comp, _mm_castps_si128(result0)); - __m128i select1 = _mm_andnot_si128(comp, g_XMInfinity); - __m128i result2 = _mm_or_si128(select0, select1); - - comp = _mm_cmplt_epi32(itrunc, g_XMSubnormalExponent); - select1 = _mm_and_si128(comp, _mm_castps_si128(result1)); - select0 = _mm_andnot_si128(comp, _mm_castps_si128(result0)); - __m128i result3 = _mm_or_si128(select0, select1); - - comp = _mm_cmplt_epi32(_mm_castps_si128(V), g_XMBinNeg150); - select0 = _mm_and_si128(comp, result3); - select1 = _mm_andnot_si128(comp, g_XMZero); - __m128i result4 = _mm_or_si128(select0, select1); - - __m128i sign = _mm_and_si128(_mm_castps_si128(V), g_XMNegativeZero); - comp = _mm_cmpeq_epi32(sign, g_XMNegativeZero); - select0 = _mm_and_si128(comp, result4); - select1 = _mm_andnot_si128(comp, result2); - __m128i result5 = _mm_or_si128(select0, select1); - - __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); - __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); - t0 = _mm_cmpeq_epi32(t0, g_XMZero); - t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); - __m128i isNaN = _mm_andnot_si128(t0, t1); - - select0 = _mm_and_si128(isNaN, g_XMQNaN); - 
select1 = _mm_andnot_si128(isNaN, result5); - __m128i vResult = _mm_or_si128(select0, select1); - - return _mm_castsi128_ps(vResult); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorExpE -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = expf(V.vector4_f32[0]); - Result.vector4_f32[1] = expf(V.vector4_f32[1]); - Result.vector4_f32[2] = expf(V.vector4_f32[2]); - Result.vector4_f32[3] = expf(V.vector4_f32[3]); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // expE(V) = exp2(vin*log2(e)) - float32x4_t Ve = vmulq_f32(g_XMLgE, V); - - int32x4_t itrunc = vcvtq_s32_f32(Ve); - float32x4_t ftrunc = vcvtq_f32_s32(itrunc); - float32x4_t y = vsubq_f32(Ve, ftrunc); - - - float32x4_t poly = vmlaq_f32( g_XMExpEst6, g_XMExpEst7, y ); - poly = vmlaq_f32( g_XMExpEst5, poly, y ); - poly = vmlaq_f32( g_XMExpEst4, poly, y ); - poly = vmlaq_f32( g_XMExpEst3, poly, y ); - poly = vmlaq_f32( g_XMExpEst2, poly, y ); - poly = vmlaq_f32( g_XMExpEst1, poly, y ); - poly = vmlaq_f32( g_XMOne, poly, y ); - - int32x4_t biased = vaddq_s32(itrunc, g_XMExponentBias); - biased = vshlq_n_s32(biased, 23); - float32x4_t result0 = XMVectorDivide(biased, poly); - - biased = vaddq_s32(itrunc, g_XM253); - biased = vshlq_n_s32(biased, 23); - float32x4_t result1 = XMVectorDivide(biased, poly); - result1 = vmulq_f32(g_XMMinNormal.v, result1); - - // Use selection to handle the cases - // if (V is NaN) -> QNaN; - // else if (V sign bit set) - // if (V > -150) - // if (V.exponent < -126) -> result1 - // else -> result0 - // else -> +0 - // else - // if (V < 128) -> result0 - // else -> +inf - - int32x4_t comp = vcltq_s32( Ve, g_XMBin128); - float32x4_t result2 = vbslq_f32( comp, result0, g_XMInfinity ); - - comp = vcltq_s32(itrunc, g_XMSubnormalExponent); - float32x4_t result3 = vbslq_f32( comp, result1, result0 ); - - comp = vcltq_s32(Ve, g_XMBinNeg150); - float32x4_t result4 = vbslq_f32( comp, result3, g_XMZero ); - - int32x4_t sign = vandq_s32(Ve, g_XMNegativeZero); - comp = vceqq_s32(sign, g_XMNegativeZero); - float32x4_t result5 = vbslq_f32( comp, result4, result2 ); - - int32x4_t t0 = vandq_s32(Ve, g_XMQNaNTest); - int32x4_t t1 = vandq_s32(Ve, g_XMInfinity); - t0 = vceqq_s32(t0, g_XMZero); - t1 = vceqq_s32(t1, g_XMInfinity); - int32x4_t isNaN = vbicq_s32( t1,t0); - - float32x4_t vResult = vbslq_f32( isNaN, g_XMQNaN, result5 ); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - // expE(V) = exp2(vin*log2(e)) - __m128 Ve = _mm_mul_ps(g_XMLgE, V); - - __m128i itrunc = _mm_cvttps_epi32(Ve); - __m128 ftrunc = _mm_cvtepi32_ps(itrunc); - __m128 y = _mm_sub_ps(Ve, ftrunc); - __m128 poly = _mm_mul_ps(g_XMExpEst7, y); - poly = _mm_add_ps(g_XMExpEst6, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMExpEst5, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMExpEst4, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMExpEst3, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMExpEst2, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMExpEst1, poly); - poly = _mm_mul_ps(poly, y); - poly = _mm_add_ps(g_XMOne, poly); - - __m128i biased = _mm_add_epi32(itrunc, g_XMExponentBias); - biased = _mm_slli_epi32(biased, 23); - __m128 result0 = _mm_div_ps(_mm_castsi128_ps(biased), poly); - - biased = _mm_add_epi32(itrunc, g_XM253); - biased = _mm_slli_epi32(biased, 23); - __m128 result1 = _mm_div_ps(_mm_castsi128_ps(biased), poly); - result1 = 
_mm_mul_ps(g_XMMinNormal.v, result1); - - // Use selection to handle the cases - // if (V is NaN) -> QNaN; - // else if (V sign bit set) - // if (V > -150) - // if (V.exponent < -126) -> result1 - // else -> result0 - // else -> +0 - // else - // if (V < 128) -> result0 - // else -> +inf - - __m128i comp = _mm_cmplt_epi32( _mm_castps_si128(Ve), g_XMBin128); - __m128i select0 = _mm_and_si128(comp, _mm_castps_si128(result0)); - __m128i select1 = _mm_andnot_si128(comp, g_XMInfinity); - __m128i result2 = _mm_or_si128(select0, select1); - - comp = _mm_cmplt_epi32(itrunc, g_XMSubnormalExponent); - select1 = _mm_and_si128(comp, _mm_castps_si128(result1)); - select0 = _mm_andnot_si128(comp, _mm_castps_si128(result0)); - __m128i result3 = _mm_or_si128(select0, select1); - - comp = _mm_cmplt_epi32(_mm_castps_si128(Ve), g_XMBinNeg150); - select0 = _mm_and_si128(comp, result3); - select1 = _mm_andnot_si128(comp, g_XMZero); - __m128i result4 = _mm_or_si128(select0, select1); - - __m128i sign = _mm_and_si128(_mm_castps_si128(Ve), g_XMNegativeZero); - comp = _mm_cmpeq_epi32(sign, g_XMNegativeZero); - select0 = _mm_and_si128(comp, result4); - select1 = _mm_andnot_si128(comp, result2); - __m128i result5 = _mm_or_si128(select0, select1); - - __m128i t0 = _mm_and_si128(_mm_castps_si128(Ve), g_XMQNaNTest); - __m128i t1 = _mm_and_si128(_mm_castps_si128(Ve), g_XMInfinity); - t0 = _mm_cmpeq_epi32(t0, g_XMZero); - t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); - __m128i isNaN = _mm_andnot_si128(t0, t1); - - select0 = _mm_and_si128(isNaN, g_XMQNaN); - select1 = _mm_andnot_si128(isNaN, result5); - __m128i vResult = _mm_or_si128(select0, select1); - - return _mm_castsi128_ps(vResult); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorExp -( - FXMVECTOR V -) -{ - return XMVectorExp2(V); -} - -//------------------------------------------------------------------------------ - -#if defined(_XM_SSE_INTRINSICS_) - -namespace Internal -{ - inline __m128i multi_sll_epi32(__m128i value, __m128i count) - { - __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0,0,0,0)); - __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0,0,0,0)); - c = _mm_and_si128(c, g_XMMaskX); - __m128i r0 = _mm_sll_epi32(v, c); - - v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1,1,1,1)); - c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1,1,1,1)); - c = _mm_and_si128(c, g_XMMaskX); - __m128i r1 = _mm_sll_epi32(v, c); - - v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2,2,2,2)); - c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2,2,2,2)); - c = _mm_and_si128(c, g_XMMaskX); - __m128i r2 = _mm_sll_epi32(v, c); - - v = _mm_shuffle_epi32(value, _MM_SHUFFLE(3,3,3,3)); - c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3,3,3,3)); - c = _mm_and_si128(c, g_XMMaskX); - __m128i r3 = _mm_sll_epi32(v, c); - - // (r0,r0,r1,r1) - __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1), _MM_SHUFFLE(0,0,0,0)); - // (r2,r2,r3,r3) - __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), _mm_castsi128_ps(r3), _MM_SHUFFLE(0,0,0,0)); - // (r0,r1,r2,r3) - __m128 result = _mm_shuffle_ps(r01, r23, _MM_SHUFFLE(2,0,2,0)); - return _mm_castps_si128(result); - } - - inline __m128i multi_srl_epi32(__m128i value, __m128i count) - { - __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0,0,0,0)); - __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0,0,0,0)); - c = _mm_and_si128(c, g_XMMaskX); - __m128i r0 = _mm_srl_epi32(v, c); - - v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1,1,1,1)); - c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1,1,1,1)); - 
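-        // Note (clarifying comment): _mm_srl_epi32 takes its shift count from
-        // the low 64 bits of the count register rather than per lane, so each
-        // lane is broadcast with _mm_shuffle_epi32 and masked down to the X
-        // component before the whole-vector shift, then the four scalar
-        // results are reassembled below.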
c = _mm_and_si128(c, g_XMMaskX); - __m128i r1 = _mm_srl_epi32(v, c); - - v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2,2,2,2)); - c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2,2,2,2)); - c = _mm_and_si128(c, g_XMMaskX); - __m128i r2 = _mm_srl_epi32(v, c); - - v = _mm_shuffle_epi32(value, _MM_SHUFFLE(3,3,3,3)); - c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3,3,3,3)); - c = _mm_and_si128(c, g_XMMaskX); - __m128i r3 = _mm_srl_epi32(v, c); - - // (r0,r0,r1,r1) - __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1), _MM_SHUFFLE(0,0,0,0)); - // (r2,r2,r3,r3) - __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), _mm_castsi128_ps(r3), _MM_SHUFFLE(0,0,0,0)); - // (r0,r1,r2,r3) - __m128 result = _mm_shuffle_ps(r01, r23, _MM_SHUFFLE(2,0,2,0)); - return _mm_castps_si128(result); - } - - inline __m128i GetLeadingBit(const __m128i value) - { - static const XMVECTORI32 g_XM0000FFFF = {0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF}; - static const XMVECTORI32 g_XM000000FF = {0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF}; - static const XMVECTORI32 g_XM0000000F = {0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F}; - static const XMVECTORI32 g_XM00000003 = {0x00000003, 0x00000003, 0x00000003, 0x00000003}; - - __m128i v = value, r, c, b, s; - - c = _mm_cmpgt_epi32(v, g_XM0000FFFF); // c = (v > 0xFFFF) - b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) - r = _mm_slli_epi32(b, 4); // r = (b << 4) - v = multi_srl_epi32(v, r); // v = (v >> r) - - c = _mm_cmpgt_epi32(v, g_XM000000FF); // c = (v > 0xFF) - b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) - s = _mm_slli_epi32(b, 3); // s = (b << 3) - v = multi_srl_epi32(v, s); // v = (v >> s) - r = _mm_or_si128(r, s); // r = (r | s) - - c = _mm_cmpgt_epi32(v, g_XM0000000F); // c = (v > 0xF) - b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) - s = _mm_slli_epi32(b, 2); // s = (b << 2) - v = multi_srl_epi32(v, s); // v = (v >> s) - r = _mm_or_si128(r, s); // r = (r | s) - - c = _mm_cmpgt_epi32(v, g_XM00000003); // c = (v > 0x3) - b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) - s = _mm_slli_epi32(b, 1); // s = (b << 1) - v = multi_srl_epi32(v, s); // v = (v >> s) - r = _mm_or_si128(r, s); // r = (r | s) - - s = _mm_srli_epi32(v, 1); - r = _mm_or_si128(r, s); - return r; - } -} // namespace Internal - -#endif // _XM_SSE_INTRINSICS_ - -#if defined(_XM_ARM_NEON_INTRINSICS_) - -namespace Internal -{ - inline int32x4_t GetLeadingBit(const int32x4_t value) - { - static const XMVECTORI32 g_XM0000FFFF = {0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF}; - static const XMVECTORI32 g_XM000000FF = {0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF}; - static const XMVECTORI32 g_XM0000000F = {0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F}; - static const XMVECTORI32 g_XM00000003 = {0x00000003, 0x00000003, 0x00000003, 0x00000003}; - - int32x4_t v = value, r, c, b, s; - - c = vcgtq_s32(v, g_XM0000FFFF); // c = (v > 0xFFFF) - b = vshrq_n_u32(c, 31); // b = (c ? 1 : 0) - r = vshlq_n_s32(b, 4); // r = (b << 4) - r = vnegq_s32( r ); - v = vshlq_u32( v, r ); // v = (v >> r) - - c = vcgtq_s32(v, g_XM000000FF); // c = (v > 0xFF) - b = vshrq_n_u32(c, 31); // b = (c ? 1 : 0) - s = vshlq_n_s32(b, 3); // s = (b << 3) - s = vnegq_s32( s ); - v = vshlq_u32(v, s); // v = (v >> s) - r = vorrq_s32(r, s); // r = (r | s) - - c = vcgtq_s32(v, g_XM0000000F); // c = (v > 0xF) - b = vshrq_n_u32(c, 31); // b = (c ? 
1 : 0) - s = vshlq_n_s32(b, 2); // s = (b << 2) - s = vnegq_s32( s ); - v = vshlq_u32(v, s); // v = (v >> s) - r = vorrq_s32(r, s); // r = (r | s) - - c = vcgtq_s32(v, g_XM00000003); // c = (v > 0x3) - b = vshrq_n_u32(c, 31); // b = (c ? 1 : 0) - s = vshlq_n_s32(b, 1); // s = (b << 1) - s = vnegq_s32( s ); - v = vshlq_u32(v, s); // v = (v >> s) - r = vorrq_s32(r, s); // r = (r | s) - - s = vshrq_n_u32(v, 1); - r = vorrq_s32(r, s); - return r; - } - -} // namespace Internal - -#endif - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorLog2 -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - const float fScale = 1.4426950f; // (1.0f / logf(2.0f)); - - XMVECTOR Result; - Result.vector4_f32[0] = logf(V.vector4_f32[0])*fScale; - Result.vector4_f32[1] = logf(V.vector4_f32[1])*fScale; - Result.vector4_f32[2] = logf(V.vector4_f32[2])*fScale; - Result.vector4_f32[3] = logf(V.vector4_f32[3])*fScale; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - int32x4_t rawBiased = vandq_s32(V, g_XMInfinity); - int32x4_t trailing = vandq_s32(V, g_XMQNaNTest); - int32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased); - - // Compute exponent and significand for normals. - int32x4_t biased = vshrq_n_u32(rawBiased, 23); - int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); - int32x4_t trailingNor = trailing; - - // Compute exponent and significand for subnormals. - int32x4_t leading = Internal::GetLeadingBit(trailing); - int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading); - int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift); - int32x4_t trailingSub = vshlq_u32(trailing, shift); - trailingSub = vandq_s32(trailingSub, g_XMQNaNTest); - int32x4_t e = vbslq_f32( isExponentZero, exponentSub, exponentNor ); - int32x4_t t = vbslq_f32( isExponentZero, trailingSub, trailingNor ); - - // Compute the approximation. 
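-    // ORing the exponent bits of 1.0f onto the trailing significand bits t
-    // builds the float 1.m in [1,2); subtracting 1.0f leaves y = m in [0,1).
-    // log2(V) then decomposes as e + log2(1+y), where log2(1+y) is
-    // approximated below as y*P(y), evaluated by Horner's rule over the
-    // g_XMLogEst7..g_XMLogEst0 coefficients.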
- int32x4_t tmp = vorrq_s32(g_XMOne, t); - float32x4_t y = vsubq_f32(tmp, g_XMOne); - - float32x4_t log2 = vmlaq_f32( g_XMLogEst6, g_XMLogEst7, y ); - log2 = vmlaq_f32( g_XMLogEst5, log2, y ); - log2 = vmlaq_f32( g_XMLogEst4, log2, y ); - log2 = vmlaq_f32( g_XMLogEst3, log2, y ); - log2 = vmlaq_f32( g_XMLogEst2, log2, y ); - log2 = vmlaq_f32( g_XMLogEst1, log2, y ); - log2 = vmlaq_f32( g_XMLogEst0, log2, y ); - log2 = vmlaq_f32( vcvtq_f32_s32(e), log2, y ); - - // if (x is NaN) -> QNaN - // else if (V is positive) - // if (V is infinite) -> +inf - // else -> log2(V) - // else - // if (V is zero) -> -inf - // else -> -QNaN - - int32x4_t isInfinite = vandq_s32((V), g_XMAbsMask); - isInfinite = vceqq_s32(isInfinite, g_XMInfinity); - - int32x4_t isGreaterZero = vcgtq_s32((V), g_XMZero); - int32x4_t isNotFinite = vcgtq_s32((V), g_XMInfinity); - int32x4_t isPositive = vbicq_s32( isGreaterZero,isNotFinite); - - int32x4_t isZero = vandq_s32((V), g_XMAbsMask); - isZero = vceqq_s32(isZero, g_XMZero); - - int32x4_t t0 = vandq_s32((V), g_XMQNaNTest); - int32x4_t t1 = vandq_s32((V), g_XMInfinity); - t0 = vceqq_s32(t0, g_XMZero); - t1 = vceqq_s32(t1, g_XMInfinity); - int32x4_t isNaN = vbicq_s32( t1,t0); - - float32x4_t result = vbslq_f32( isInfinite, g_XMInfinity, log2 ); - tmp = vbslq_f32( isZero, g_XMNegInfinity, g_XMNegQNaN ); - result = vbslq_f32(isPositive, result, tmp); - result = vbslq_f32(isNaN, g_XMQNaN, result ); - return result; -#elif defined(_XM_SSE_INTRINSICS_) - __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); - __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); - __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased); - - // Compute exponent and significand for normals. - __m128i biased = _mm_srli_epi32(rawBiased, 23); - __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias); - __m128i trailingNor = trailing; - - // Compute exponent and significand for subnormals. - __m128i leading = Internal::GetLeadingBit(trailing); - __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading); - __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift); - __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift); - trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest); - - __m128i select0 = _mm_and_si128(isExponentZero, exponentSub); - __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor); - __m128i e = _mm_or_si128(select0, select1); - - select0 = _mm_and_si128(isExponentZero, trailingSub); - select1 = _mm_andnot_si128(isExponentZero, trailingNor); - __m128i t = _mm_or_si128(select0, select1); - - // Compute the approximation. 
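-    // Same scheme as the NEON path above: rebuild 1.m in [1,2), take
-    // y = m in [0,1), approximate log2(1+y) ~= y*P(y), and add the
-    // exponent e back in at the end.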
- __m128i tmp = _mm_or_si128(g_XMOne, t); - __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne); - - __m128 log2 = _mm_mul_ps(g_XMLogEst7, y); - log2 = _mm_add_ps(g_XMLogEst6, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst5, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst4, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst3, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst2, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst1, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst0, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(log2, _mm_cvtepi32_ps(e)); - - // if (x is NaN) -> QNaN - // else if (V is positive) - // if (V is infinite) -> +inf - // else -> log2(V) - // else - // if (V is zero) -> -inf - // else -> -QNaN - - __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); - isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity); - - __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero); - __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity); - __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero); - - __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); - isZero = _mm_cmpeq_epi32(isZero, g_XMZero); - - __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); - __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); - t0 = _mm_cmpeq_epi32(t0, g_XMZero); - t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); - __m128i isNaN = _mm_andnot_si128(t0, t1); - - select0 = _mm_and_si128(isInfinite, g_XMInfinity); - select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2)); - __m128i result = _mm_or_si128(select0, select1); - - select0 = _mm_and_si128(isZero, g_XMNegInfinity); - select1 = _mm_andnot_si128(isZero, g_XMNegQNaN); - tmp = _mm_or_si128(select0, select1); - - select0 = _mm_and_si128(isPositive, result); - select1 = _mm_andnot_si128(isPositive, tmp); - result = _mm_or_si128(select0, select1); - - select0 = _mm_and_si128(isNaN, g_XMQNaN); - select1 = _mm_andnot_si128(isNaN, result); - result = _mm_or_si128(select0, select1); - - return _mm_castsi128_ps(result); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorLogE -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = logf(V.vector4_f32[0]); - Result.vector4_f32[1] = logf(V.vector4_f32[1]); - Result.vector4_f32[2] = logf(V.vector4_f32[2]); - Result.vector4_f32[3] = logf(V.vector4_f32[3]); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - int32x4_t rawBiased = vandq_s32(V, g_XMInfinity); - int32x4_t trailing = vandq_s32(V, g_XMQNaNTest); - int32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased); - - // Compute exponent and significand for normals. - int32x4_t biased = vshrq_n_u32(rawBiased, 23); - int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); - int32x4_t trailingNor = trailing; - - // Compute exponent and significand for subnormals. 
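-    // A subnormal has a zero exponent field, so its magnitude lives entirely
-    // in the trailing bits: the position of the leading set bit fixes the
-    // effective exponent (g_XMSubnormalExponent - shift), and the left shift
-    // renormalizes the significand back into the trailing-bit field.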
- int32x4_t leading = Internal::GetLeadingBit(trailing); - int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading); - int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift); - int32x4_t trailingSub = vshlq_u32(trailing, shift); - trailingSub = vandq_s32(trailingSub, g_XMQNaNTest); - int32x4_t e = vbslq_f32( isExponentZero, exponentSub, exponentNor ); - int32x4_t t = vbslq_f32( isExponentZero, trailingSub, trailingNor ); - - // Compute the approximation. - int32x4_t tmp = vorrq_s32(g_XMOne, t); - float32x4_t y = vsubq_f32(tmp, g_XMOne); - - float32x4_t log2 = vmlaq_f32( g_XMLogEst6, g_XMLogEst7, y ); - log2 = vmlaq_f32( g_XMLogEst5, log2, y ); - log2 = vmlaq_f32( g_XMLogEst4, log2, y ); - log2 = vmlaq_f32( g_XMLogEst3, log2, y ); - log2 = vmlaq_f32( g_XMLogEst2, log2, y ); - log2 = vmlaq_f32( g_XMLogEst1, log2, y ); - log2 = vmlaq_f32( g_XMLogEst0, log2, y ); - log2 = vmlaq_f32( vcvtq_f32_s32(e), log2, y ); - - log2 = vmulq_f32(g_XMInvLgE, log2); - - // if (x is NaN) -> QNaN - // else if (V is positive) - // if (V is infinite) -> +inf - // else -> log2(V) - // else - // if (V is zero) -> -inf - // else -> -QNaN - - int32x4_t isInfinite = vandq_s32((V), g_XMAbsMask); - isInfinite = vceqq_s32(isInfinite, g_XMInfinity); - - int32x4_t isGreaterZero = vcgtq_s32((V), g_XMZero); - int32x4_t isNotFinite = vcgtq_s32((V), g_XMInfinity); - int32x4_t isPositive = vbicq_s32( isGreaterZero,isNotFinite); - - int32x4_t isZero = vandq_s32((V), g_XMAbsMask); - isZero = vceqq_s32(isZero, g_XMZero); - - int32x4_t t0 = vandq_s32((V), g_XMQNaNTest); - int32x4_t t1 = vandq_s32((V), g_XMInfinity); - t0 = vceqq_s32(t0, g_XMZero); - t1 = vceqq_s32(t1, g_XMInfinity); - int32x4_t isNaN = vbicq_s32( t1,t0); - - float32x4_t result = vbslq_f32( isInfinite, g_XMInfinity, log2 ); - tmp = vbslq_f32( isZero, g_XMNegInfinity, g_XMNegQNaN ); - result = vbslq_f32(isPositive, result, tmp); - result = vbslq_f32(isNaN, g_XMQNaN, result ); - return result; -#elif defined(_XM_SSE_INTRINSICS_) - __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); - __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); - __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased); - - // Compute exponent and significand for normals. - __m128i biased = _mm_srli_epi32(rawBiased, 23); - __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias); - __m128i trailingNor = trailing; - - // Compute exponent and significand for subnormals. - __m128i leading = Internal::GetLeadingBit(trailing); - __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading); - __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift); - __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift); - trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest); - - __m128i select0 = _mm_and_si128(isExponentZero, exponentSub); - __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor); - __m128i e = _mm_or_si128(select0, select1); - - select0 = _mm_and_si128(isExponentZero, trailingSub); - select1 = _mm_andnot_si128(isExponentZero, trailingNor); - __m128i t = _mm_or_si128(select0, select1); - - // Compute the approximation. 
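-    // As in XMVectorLog2: rebuild 1.m, evaluate log2(1+y) ~= y*P(y), and add
-    // e; the extra multiply by g_XMInvLgE (= ln 2) afterwards converts the
-    // base-2 logarithm into a natural logarithm.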
- __m128i tmp = _mm_or_si128(g_XMOne, t); - __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne); - - __m128 log2 = _mm_mul_ps(g_XMLogEst7, y); - log2 = _mm_add_ps(g_XMLogEst6, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst5, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst4, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst3, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst2, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst1, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(g_XMLogEst0, log2); - log2 = _mm_mul_ps(log2, y); - log2 = _mm_add_ps(log2, _mm_cvtepi32_ps(e)); - - log2 = _mm_mul_ps(g_XMInvLgE, log2); - - // if (x is NaN) -> QNaN - // else if (V is positive) - // if (V is infinite) -> +inf - // else -> log2(V) - // else - // if (V is zero) -> -inf - // else -> -QNaN - - __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); - isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity); - - __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero); - __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity); - __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero); - - __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); - isZero = _mm_cmpeq_epi32(isZero, g_XMZero); - - __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); - __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); - t0 = _mm_cmpeq_epi32(t0, g_XMZero); - t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); - __m128i isNaN = _mm_andnot_si128(t0, t1); - - select0 = _mm_and_si128(isInfinite, g_XMInfinity); - select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2)); - __m128i result = _mm_or_si128(select0, select1); - - select0 = _mm_and_si128(isZero, g_XMNegInfinity); - select1 = _mm_andnot_si128(isZero, g_XMNegQNaN); - tmp = _mm_or_si128(select0, select1); - - select0 = _mm_and_si128(isPositive, result); - select1 = _mm_andnot_si128(isPositive, tmp); - result = _mm_or_si128(select0, select1); - - select0 = _mm_and_si128(isNaN, g_XMQNaN); - select1 = _mm_andnot_si128(isNaN, result); - result = _mm_or_si128(select0, select1); - - return _mm_castsi128_ps(result); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorLog -( - FXMVECTOR V -) -{ - return XMVectorLog2(V); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorPow -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = powf(V1.vector4_f32[0], V2.vector4_f32[0]); - Result.vector4_f32[1] = powf(V1.vector4_f32[1], V2.vector4_f32[1]); - Result.vector4_f32[2] = powf(V1.vector4_f32[2], V2.vector4_f32[2]); - Result.vector4_f32[3] = powf(V1.vector4_f32[3], V2.vector4_f32[3]); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTORF32 vResult = { - powf(vgetq_lane_f32(V1, 0), vgetq_lane_f32(V2, 0)), - powf(vgetq_lane_f32(V1, 1), vgetq_lane_f32(V2, 1)), - powf(vgetq_lane_f32(V1, 2), vgetq_lane_f32(V2, 2)), - powf(vgetq_lane_f32(V1, 3), vgetq_lane_f32(V2, 3)) - }; - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - __declspec(align(16)) float a[4]; - __declspec(align(16)) float b[4]; - _mm_store_ps( a, V1 ); - _mm_store_ps( b, V2 ); - XMVECTOR vResult = _mm_setr_ps( - powf(a[0],b[0]), - powf(a[1],b[1]), - powf(a[2],b[2]), - powf(a[3],b[3])); - return vResult; -#endif 
-} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorAbs -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult; - vResult.vector4_f32[0] = fabsf(V.vector4_f32[0]); - vResult.vector4_f32[1] = fabsf(V.vector4_f32[1]); - vResult.vector4_f32[2] = fabsf(V.vector4_f32[2]); - vResult.vector4_f32[3] = fabsf(V.vector4_f32[3]); - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - return vabsq_f32( V ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = _mm_setzero_ps(); - vResult = _mm_sub_ps(vResult,V); - vResult = _mm_max_ps(vResult,V); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorMod -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - // V1 % V2 = V1 - V2 * truncate(V1 / V2) - -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Quotient = XMVectorDivide(V1, V2); - Quotient = XMVectorTruncate(Quotient); - XMVECTOR Result = XMVectorNegativeMultiplySubtract(V2, Quotient, V1); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR vResult = XMVectorDivide(V1, V2); - vResult = XMVectorTruncate(vResult); - return vmlsq_f32( V1, vResult, V2 ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = _mm_div_ps(V1, V2); - vResult = XMVectorTruncate(vResult); - vResult = _mm_mul_ps(vResult,V2); - vResult = _mm_sub_ps(V1,vResult); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorModAngles -( - FXMVECTOR Angles -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR V; - XMVECTOR Result; - - // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI - V = XMVectorMultiply(Angles, g_XMReciprocalTwoPi.v); - V = XMVectorRound(V); - Result = XMVectorNegativeMultiplySubtract(g_XMTwoPi.v, V, Angles); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI - XMVECTOR vResult = vmulq_f32(Angles,g_XMReciprocalTwoPi); - // Use the inline function due to complexity for rounding - vResult = XMVectorRound(vResult); - return vmlsq_f32( Angles, vResult, g_XMTwoPi ); -#elif defined(_XM_SSE_INTRINSICS_) - // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI - XMVECTOR vResult = _mm_mul_ps(Angles,g_XMReciprocalTwoPi); - // Use the inline function due to complexity for rounding - vResult = XMVectorRound(vResult); - vResult = _mm_mul_ps(vResult,g_XMTwoPi); - vResult = _mm_sub_ps(Angles,vResult); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorSin -( - FXMVECTOR V -) -{ - // 11-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = sinf( V.vector4_f32[0] ); - Result.vector4_f32[1] = sinf( V.vector4_f32[1] ); - Result.vector4_f32[2] = sinf( V.vector4_f32[2] ); - Result.vector4_f32[3] = sinf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Force the value within the bounds of pi - XMVECTOR x = XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with sin(y) = sin(x). 
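-    // Lanes with |x| <= pi/2 keep x; lanes outside are reflected to pi - x
-    // (-pi - x for negative x), which preserves the sine: sin(pi - x) = sin(x).
-    // For example, x = 3 reduces to pi - 3 ~= 0.1416 with the same sine.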
- uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); - uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - float32x4_t absx = vabsq_f32( x ); - float32x4_t rflx = vsubq_f32(c, x); - uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); - x = vbslq_f32( comp, x, rflx ); - - float32x4_t x2 = vmulq_f32(x, x); - - // Compute polynomial approximation - const XMVECTOR SC1 = g_XMSinCoefficients1; - const XMVECTOR SC0 = g_XMSinCoefficients0; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); - XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0); - - vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); - Result = vmlaq_f32(vConstants, Result, x2); - - vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); - Result = vmlaq_f32(vConstants, Result, x2); - - vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); - Result = vmlaq_f32(vConstants, Result, x2); - - Result = vmlaq_f32(g_XMOne, Result, x2); - Result = vmulq_f32(Result, x); - return Result; -#elif defined(_XM_SSE_INTRINSICS_) - // Force the value within the bounds of pi - XMVECTOR x = XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with sin(y) = sin(x). - __m128 sign = _mm_and_ps(x, g_XMNegativeZero); - __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - __m128 absx = _mm_andnot_ps(sign, x); // |x| - __m128 rflx = _mm_sub_ps(c, x); - __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); - __m128 select0 = _mm_and_ps(comp, x); - __m128 select1 = _mm_andnot_ps(comp, rflx); - x = _mm_or_ps(select0, select1); - - __m128 x2 = _mm_mul_ps(x, x); - - // Compute polynomial approximation - const XMVECTOR SC1 = g_XMSinCoefficients1; - XMVECTOR vConstants = XM_PERMUTE_PS( SC1, _MM_SHUFFLE(0, 0, 0, 0) ); - __m128 Result = _mm_mul_ps(vConstants, x2); - - const XMVECTOR SC0 = g_XMSinCoefficients0; - vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(3, 3, 3, 3) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(1, 1, 1, 1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(0, 0, 0, 0) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - Result = _mm_add_ps(Result, g_XMOne); - Result = _mm_mul_ps(Result, x); - return Result; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorCos -( - FXMVECTOR V -) -{ - // 10-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = cosf( V.vector4_f32[0] ); - Result.vector4_f32[1] = cosf( V.vector4_f32[1] ); - Result.vector4_f32[2] = cosf( V.vector4_f32[2] ); - Result.vector4_f32[3] = cosf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Map V to x in [-pi,pi]. - XMVECTOR x = XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
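-    // Same reflection as XMVectorSin, but cos(pi - x) = -cos(x), so reflected
-    // lanes also record sign = -1 to flip the polynomial result at the end.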
- uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); - uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - float32x4_t absx = vabsq_f32( x ); - float32x4_t rflx = vsubq_f32(c, x); - uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); - x = vbslq_f32( comp, x, rflx ); - sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); - - float32x4_t x2 = vmulq_f32(x, x); - - // Compute polynomial approximation - const XMVECTOR CC1 = g_XMCosCoefficients1; - const XMVECTOR CC0 = g_XMCosCoefficients0; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); - XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0 ); - - vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); - Result = vmlaq_f32(vConstants, Result, x2); - - vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); - Result = vmlaq_f32(vConstants, Result, x2); - - vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); - Result = vmlaq_f32(vConstants, Result, x2); - - Result = vmlaq_f32(g_XMOne, Result, x2); - Result = vmulq_f32(Result, sign); - return Result; -#elif defined(_XM_SSE_INTRINSICS_) - // Map V to x in [-pi,pi]. - XMVECTOR x = XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). - XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); - __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - __m128 absx = _mm_andnot_ps(sign, x); // |x| - __m128 rflx = _mm_sub_ps(c, x); - __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); - __m128 select0 = _mm_and_ps(comp, x); - __m128 select1 = _mm_andnot_ps(comp, rflx); - x = _mm_or_ps(select0, select1); - select0 = _mm_and_ps(comp, g_XMOne); - select1 = _mm_andnot_ps(comp, g_XMNegativeOne); - sign = _mm_or_ps(select0, select1); - - __m128 x2 = _mm_mul_ps(x, x); - - // Compute polynomial approximation - const XMVECTOR CC1 = g_XMCosCoefficients1; - XMVECTOR vConstants = XM_PERMUTE_PS( CC1, _MM_SHUFFLE(0, 0, 0, 0) ); - __m128 Result = _mm_mul_ps(vConstants, x2); - - const XMVECTOR CC0 = g_XMCosCoefficients0; - vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(3, 3, 3, 3) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(1, 1, 1, 1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(0, 0, 0, 0) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - Result = _mm_add_ps(Result, g_XMOne); - Result = _mm_mul_ps(Result, sign); - return Result; -#endif -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline void XM_CALLCONV XMVectorSinCos -( - XMVECTOR* pSin, - XMVECTOR* pCos, - FXMVECTOR V -) -{ - assert(pSin != nullptr); - assert(pCos != nullptr); - - // 11/10-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Sin; - Sin.vector4_f32[0] = sinf( V.vector4_f32[0] ); - Sin.vector4_f32[1] = sinf( V.vector4_f32[1] ); - Sin.vector4_f32[2] = sinf( V.vector4_f32[2] ); - Sin.vector4_f32[3] = sinf( V.vector4_f32[3] ); - - XMVECTOR Cos; - Cos.vector4_f32[0] = cosf( V.vector4_f32[0] ); - Cos.vector4_f32[1] = cosf( V.vector4_f32[1] ); - Cos.vector4_f32[2] = cosf( V.vector4_f32[2] ); - Cos.vector4_f32[3] = cosf( V.vector4_f32[3] ); - - *pSin = Sin; - *pCos = Cos; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Force the value within the bounds of pi - XMVECTOR x 
= XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). - uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); - uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - float32x4_t absx = vabsq_f32( x ); - float32x4_t rflx = vsubq_f32(c, x); - uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); - x = vbslq_f32( comp, x, rflx ); - sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); - - float32x4_t x2 = vmulq_f32(x, x); - - // Compute polynomial approximation for sine - const XMVECTOR SC1 = g_XMSinCoefficients1; - const XMVECTOR SC0 = g_XMSinCoefficients0; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); - XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0); - - vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); - Result = vmlaq_f32(vConstants, Result, x2); - - vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); - Result = vmlaq_f32(vConstants, Result, x2); - - vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); - Result = vmlaq_f32(vConstants, Result, x2); - - Result = vmlaq_f32(g_XMOne, Result, x2); - *pSin = vmulq_f32(Result, x); - - // Compute polynomial approximation for cosine - const XMVECTOR CC1 = g_XMCosCoefficients1; - const XMVECTOR CC0 = g_XMCosCoefficients0; - vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); - Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0); - - vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); - Result = vmlaq_f32(vConstants, Result, x2); - - vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); - Result = vmlaq_f32(vConstants, Result, x2); - - vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); - Result = vmlaq_f32(vConstants, Result, x2); - - Result = vmlaq_f32(g_XMOne, Result, x2); - *pCos = vmulq_f32(Result, sign); -#elif defined(_XM_SSE_INTRINSICS_) - // Force the value within the bounds of pi - XMVECTOR x = XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). 
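-    // One shared reduction serves both outputs: sine uses the reflected x
-    // directly (the reflection leaves sine unchanged), while the cosine
-    // result is multiplied by the recorded sign, since cos(pi - x) = -cos(x).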
- XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); - __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - __m128 absx = _mm_andnot_ps(sign, x); // |x| - __m128 rflx = _mm_sub_ps(c, x); - __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); - __m128 select0 = _mm_and_ps(comp, x); - __m128 select1 = _mm_andnot_ps(comp, rflx); - x = _mm_or_ps(select0, select1); - select0 = _mm_and_ps(comp, g_XMOne); - select1 = _mm_andnot_ps(comp, g_XMNegativeOne); - sign = _mm_or_ps(select0, select1); - - __m128 x2 = _mm_mul_ps(x, x); - - // Compute polynomial approximation of sine - const XMVECTOR SC1 = g_XMSinCoefficients1; - XMVECTOR vConstants = XM_PERMUTE_PS( SC1, _MM_SHUFFLE(0, 0, 0, 0) ); - __m128 Result = _mm_mul_ps(vConstants, x2); - - const XMVECTOR SC0 = g_XMSinCoefficients0; - vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(3, 3, 3, 3) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(1, 1, 1, 1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(0, 0, 0, 0) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - Result = _mm_add_ps(Result, g_XMOne); - Result = _mm_mul_ps(Result, x); - *pSin = Result; - - // Compute polynomial approximation of cosine - const XMVECTOR CC1 = g_XMCosCoefficients1; - vConstants = XM_PERMUTE_PS( CC1, _MM_SHUFFLE(0, 0, 0, 0) ); - Result = _mm_mul_ps(vConstants, x2); - - const XMVECTOR CC0 = g_XMCosCoefficients0; - vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(3, 3, 3, 3) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(1, 1, 1, 1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(0, 0, 0, 0) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - Result = _mm_add_ps(Result, g_XMOne); - Result = _mm_mul_ps(Result, sign); - *pCos = Result; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorTan -( - FXMVECTOR V -) -{ - // Cody and Waite algorithm to compute tangent. 
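-    //
-    // Reduction: k = round(V * 2/pi) (VA below), then r = (V - k*C0) - k*C1
-    // (VC below), where C0 = TanConstants.x ~= pi/2 and C1 = TanConstants.y
-    // is a tiny tail term. Performing the subtraction in two steps lets the
-    // large cancellation happen exactly before the small correction is
-    // applied. The parity of k (VB & Mask) then picks between the rational
-    // forms: tan(V) = N/D for even k, and tan(V) = -D/N (the cotangent
-    // half-period) for odd k.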
- -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = tanf( V.vector4_f32[0] ); - Result.vector4_f32[1] = tanf( V.vector4_f32[1] ); - Result.vector4_f32[2] = tanf( V.vector4_f32[2] ); - Result.vector4_f32[3] = tanf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) - - static const XMVECTORF32 TanCoefficients0 = {1.0f, -4.667168334e-1f, 2.566383229e-2f, -3.118153191e-4f}; - static const XMVECTORF32 TanCoefficients1 = {4.981943399e-7f, -1.333835001e-1f, 3.424887824e-3f, -1.786170734e-5f}; - static const XMVECTORF32 TanConstants = {1.570796371f, 6.077100628e-11f, 0.000244140625f, 0.63661977228f /*2 / Pi*/ }; - static const XMVECTORU32 Mask = {0x1, 0x1, 0x1, 0x1}; - - XMVECTOR TwoDivPi = XMVectorSplatW(TanConstants.v); - - XMVECTOR Zero = XMVectorZero(); - - XMVECTOR C0 = XMVectorSplatX(TanConstants.v); - XMVECTOR C1 = XMVectorSplatY(TanConstants.v); - XMVECTOR Epsilon = XMVectorSplatZ(TanConstants.v); - - XMVECTOR VA = XMVectorMultiply(V, TwoDivPi); - - VA = XMVectorRound(VA); - - XMVECTOR VC = XMVectorNegativeMultiplySubtract(VA, C0, V); - - XMVECTOR VB = XMVectorAbs(VA); - - VC = XMVectorNegativeMultiplySubtract(VA, C1, VC); - -#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - VB = vcvtq_u32_f32( VB ); -#elif defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - reinterpret_cast<__m128i *>(&VB)[0] = _mm_cvttps_epi32(VB); -#else - for (size_t i = 0; i < 4; i++) - { - VB.vector4_u32[i] = (uint32_t)VB.vector4_f32[i]; - } -#endif - - XMVECTOR VC2 = XMVectorMultiply(VC, VC); - - XMVECTOR T7 = XMVectorSplatW(TanCoefficients1.v); - XMVECTOR T6 = XMVectorSplatZ(TanCoefficients1.v); - XMVECTOR T4 = XMVectorSplatX(TanCoefficients1.v); - XMVECTOR T3 = XMVectorSplatW(TanCoefficients0.v); - XMVECTOR T5 = XMVectorSplatY(TanCoefficients1.v); - XMVECTOR T2 = XMVectorSplatZ(TanCoefficients0.v); - XMVECTOR T1 = XMVectorSplatY(TanCoefficients0.v); - XMVECTOR T0 = XMVectorSplatX(TanCoefficients0.v); - - XMVECTOR VBIsEven = XMVectorAndInt(VB, Mask.v); - VBIsEven = XMVectorEqualInt(VBIsEven, Zero); - - XMVECTOR N = XMVectorMultiplyAdd(VC2, T7, T6); - XMVECTOR D = XMVectorMultiplyAdd(VC2, T4, T3); - N = XMVectorMultiplyAdd(VC2, N, T5); - D = XMVectorMultiplyAdd(VC2, D, T2); - N = XMVectorMultiply(VC2, N); - D = XMVectorMultiplyAdd(VC2, D, T1); - N = XMVectorMultiplyAdd(VC, N, VC); - XMVECTOR VCNearZero = XMVectorInBounds(VC, Epsilon); - D = XMVectorMultiplyAdd(VC2, D, T0); - - N = XMVectorSelect(N, VC, VCNearZero); - D = XMVectorSelect(D, g_XMOne.v, VCNearZero); - - XMVECTOR R0 = XMVectorNegate(N); - XMVECTOR R1 = XMVectorDivide(N,D); - R0 = XMVectorDivide(D,R0); - - XMVECTOR VIsZero = XMVectorEqual(V, Zero); - - XMVECTOR Result = XMVectorSelect(R0, R1, VBIsEven); - - Result = XMVectorSelect(Result, Zero, VIsZero); - - return Result; - -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorSinH -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = sinhf( V.vector4_f32[0] ); - Result.vector4_f32[1] = sinhf( V.vector4_f32[1] ); - Result.vector4_f32[2] = sinhf( V.vector4_f32[2] ); - Result.vector4_f32[3] = sinhf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) - - XMVECTOR V1 = vmlaq_f32( g_XMNegativeOne.v, V, 
Scale.v ); - XMVECTOR V2 = vmlsq_f32( g_XMNegativeOne.v, V, Scale.v ); - XMVECTOR E1 = XMVectorExp(V1); - XMVECTOR E2 = XMVectorExp(V2); - - return vsubq_f32(E1, E2); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) - - XMVECTOR V1 = _mm_mul_ps(V, Scale); - V1 = _mm_add_ps(V1,g_XMNegativeOne); - XMVECTOR V2 = _mm_mul_ps(V, Scale); - V2 = _mm_sub_ps(g_XMNegativeOne,V2); - XMVECTOR E1 = XMVectorExp(V1); - XMVECTOR E2 = XMVectorExp(V2); - - return _mm_sub_ps(E1, E2); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorCosH -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = coshf( V.vector4_f32[0] ); - Result.vector4_f32[1] = coshf( V.vector4_f32[1] ); - Result.vector4_f32[2] = coshf( V.vector4_f32[2] ); - Result.vector4_f32[3] = coshf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) - - XMVECTOR V1 = vmlaq_f32(g_XMNegativeOne.v, V, Scale.v); - XMVECTOR V2 = vmlsq_f32(g_XMNegativeOne.v, V, Scale.v); - XMVECTOR E1 = XMVectorExp(V1); - XMVECTOR E2 = XMVectorExp(V2); - return vaddq_f32(E1, E2); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) - - XMVECTOR V1 = _mm_mul_ps(V,Scale.v); - V1 = _mm_add_ps(V1,g_XMNegativeOne.v); - XMVECTOR V2 = _mm_mul_ps(V, Scale.v); - V2 = _mm_sub_ps(g_XMNegativeOne.v,V2); - XMVECTOR E1 = XMVectorExp(V1); - XMVECTOR E2 = XMVectorExp(V2); - return _mm_add_ps(E1, E2); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorTanH -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = tanhf( V.vector4_f32[0] ); - Result.vector4_f32[1] = tanhf( V.vector4_f32[1] ); - Result.vector4_f32[2] = tanhf( V.vector4_f32[2] ); - Result.vector4_f32[3] = tanhf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f) - - XMVECTOR E = vmulq_f32(V, Scale.v); - E = XMVectorExp(E); - E = vmlaq_f32( g_XMOneHalf.v, E, g_XMOneHalf.v ); - E = XMVectorReciprocal(E); - return vsubq_f32(g_XMOne.v, E); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f) - - XMVECTOR E = _mm_mul_ps(V, Scale.v); - E = XMVectorExp(E); - E = _mm_mul_ps(E,g_XMOneHalf.v); - E = _mm_add_ps(E,g_XMOneHalf.v); - E = _mm_div_ps(g_XMOne.v,E); - return _mm_sub_ps(g_XMOne.v,E); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorASin -( - FXMVECTOR V -) -{ - // 7-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = asinf( V.vector4_f32[0] ); - Result.vector4_f32[1] = asinf( V.vector4_f32[1] ); - Result.vector4_f32[2] = asinf( V.vector4_f32[2] ); - Result.vector4_f32[3] = asinf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - 
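-    // The kernel below approximates acos on [0,1]:
-    //   acos(|V|) ~= sqrt(1 - |V|) * p(|V|)   (7-degree polynomial p)
-    // Negative lanes are reflected via acos(V) = pi - acos(-V) (the t1
-    // select), and the final pi/2 - t0 converts the acos result into asin.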
uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); - float32x4_t x = vabsq_f32(V); - - // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. - float32x4_t oneMValue = vsubq_f32(g_XMOne, x); - float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); - float32x4_t root = XMVectorSqrt(clampOneMValue); - - // Compute polynomial approximation - const XMVECTOR AC1 = g_XMArcCoefficients1; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); - XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AC1), 1 ); - - vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); - t0 = vmlaq_f32( vConstants, t0, x ); - - vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); - t0 = vmlaq_f32( vConstants, t0, x ); - - const XMVECTOR AC0 = g_XMArcCoefficients0; - vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); - t0 = vmlaq_f32( vConstants, t0, x ); - - vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); - t0 = vmlaq_f32( vConstants, t0, x ); - - vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); - t0 = vmlaq_f32( vConstants, t0, x ); - - vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); - t0 = vmlaq_f32( vConstants, t0, x ); - t0 = vmulq_f32(t0, root); - - float32x4_t t1 = vsubq_f32(g_XMPi, t0); - t0 = vbslq_f32( nonnegative, t0, t1 ); - t0 = vsubq_f32(g_XMHalfPi, t0); - return t0; -#elif defined(_XM_SSE_INTRINSICS_) - __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); - __m128 mvalue = _mm_sub_ps(g_XMZero, V); - __m128 x = _mm_max_ps(V, mvalue); // |V| - - // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. - __m128 oneMValue = _mm_sub_ps(g_XMOne, x); - __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); - __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) - - // Compute polynomial approximation - const XMVECTOR AC1 = g_XMArcCoefficients1; - XMVECTOR vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(3, 3, 3, 3) ); - __m128 t0 = _mm_mul_ps(vConstants, x); - - vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(2, 2, 2, 2) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(1, 1, 1, 1) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(0, 0, 0, 0) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - const XMVECTOR AC0 = g_XMArcCoefficients0; - vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(3, 3, 3, 3) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AC0,_MM_SHUFFLE(2, 2, 2, 2) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(1, 1, 1, 1) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(0, 0, 0, 0) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, root); - - __m128 t1 = _mm_sub_ps(g_XMPi, t0); - t0 = _mm_and_ps(nonnegative, t0); - t1 = _mm_andnot_ps(nonnegative, t1); - t0 = _mm_or_ps(t0, t1); - t0 = _mm_sub_ps(g_XMHalfPi, t0); - return t0; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorACos -( - FXMVECTOR V -) -{ - // 7-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = acosf( V.vector4_f32[0] ); - Result.vector4_f32[1] = acosf( V.vector4_f32[1] ); - Result.vector4_f32[2] = acosf( V.vector4_f32[2] ); - Result.vector4_f32[3] = acosf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - 
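-    // Same kernel as XMVectorASin: acos(|V|) ~= sqrt(1 - |V|) * p(|V|) on
-    // [0,1], with negative lanes reflected via acos(V) = pi - acos(-V); here
-    // no pi/2 adjustment is needed since acos is returned directly.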
uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); - float32x4_t x = vabsq_f32(V); - - // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. - float32x4_t oneMValue = vsubq_f32(g_XMOne, x); - float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); - float32x4_t root = XMVectorSqrt(clampOneMValue); - - // Compute polynomial approximation - const XMVECTOR AC1 = g_XMArcCoefficients1; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); - XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AC1), 1 ); - - vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); - t0 = vmlaq_f32( vConstants, t0, x ); - - vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); - t0 = vmlaq_f32( vConstants, t0, x ); - - const XMVECTOR AC0 = g_XMArcCoefficients0; - vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); - t0 = vmlaq_f32( vConstants, t0, x ); - - vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); - t0 = vmlaq_f32( vConstants, t0, x ); - - vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); - t0 = vmlaq_f32( vConstants, t0, x ); - - vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); - t0 = vmlaq_f32( vConstants, t0, x ); - t0 = vmulq_f32(t0, root); - - float32x4_t t1 = vsubq_f32(g_XMPi, t0); - t0 = vbslq_f32( nonnegative, t0, t1 ); - return t0; -#elif defined(_XM_SSE_INTRINSICS_) - __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); - __m128 mvalue = _mm_sub_ps(g_XMZero, V); - __m128 x = _mm_max_ps(V, mvalue); // |V| - - // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. - __m128 oneMValue = _mm_sub_ps(g_XMOne, x); - __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); - __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) - - // Compute polynomial approximation - const XMVECTOR AC1 = g_XMArcCoefficients1; - XMVECTOR vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(3, 3, 3, 3) ); - __m128 t0 = _mm_mul_ps(vConstants, x); - - vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(2, 2, 2, 2) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(1, 1, 1, 1) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(0, 0, 0, 0) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - const XMVECTOR AC0 = g_XMArcCoefficients0; - vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(3, 3, 3, 3) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(2, 2, 2, 2) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(1, 1, 1, 1) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(0, 0, 0, 0) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, root); - - __m128 t1 = _mm_sub_ps(g_XMPi, t0); - t0 = _mm_and_ps(nonnegative, t0); - t1 = _mm_andnot_ps(nonnegative, t1); - t0 = _mm_or_ps(t0, t1); - return t0; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorATan -( - FXMVECTOR V -) -{ - // 17-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = atanf( V.vector4_f32[0] ); - Result.vector4_f32[1] = atanf( V.vector4_f32[1] ); - Result.vector4_f32[2] = atanf( V.vector4_f32[2] ); - Result.vector4_f32[3] = atanf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t absV = vabsq_f32(V); - float32x4_t invV = 
XMVectorReciprocal(V); - uint32x4_t comp = vcgtq_f32(V, g_XMOne); - uint32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); - comp = vcleq_f32(absV, g_XMOne); - sign = vbslq_f32(comp, g_XMZero, sign); - uint32x4_t x = vbslq_f32(comp, V, invV); - - float32x4_t x2 = vmulq_f32(x, x); - - // Compute polynomial approximation - const XMVECTOR TC1 = g_XMATanCoefficients1; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(TC1), 0); - XMVECTOR Result = vmlaq_lane_f32( vConstants, x2, vget_high_f32(TC1), 1 ); - - vConstants = vdupq_lane_f32(vget_low_f32(TC1), 1); - Result = vmlaq_f32( vConstants, Result, x2 ); - - vConstants = vdupq_lane_f32(vget_low_f32(TC1), 0); - Result = vmlaq_f32( vConstants, Result, x2 ); - - const XMVECTOR TC0 = g_XMATanCoefficients0; - vConstants = vdupq_lane_f32(vget_high_f32(TC0), 1); - Result = vmlaq_f32( vConstants, Result, x2 ); - - vConstants = vdupq_lane_f32(vget_high_f32(TC0), 0); - Result = vmlaq_f32( vConstants, Result, x2 ); - - vConstants = vdupq_lane_f32(vget_low_f32(TC0), 1); - Result = vmlaq_f32( vConstants, Result, x2 ); - - vConstants = vdupq_lane_f32(vget_low_f32(TC0), 0); - Result = vmlaq_f32( vConstants, Result, x2 ); - - Result = vmlaq_f32( g_XMOne, Result, x2 ); - Result = vmulq_f32( Result, x ); - - float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi); - result1 = vsubq_f32(result1, Result); - - comp = vceqq_f32(sign, g_XMZero); - Result = vbslq_f32( comp, Result, result1 ); - return Result; -#elif defined(_XM_SSE_INTRINSICS_) - __m128 absV = XMVectorAbs(V); - __m128 invV = _mm_div_ps(g_XMOne, V); - __m128 comp = _mm_cmpgt_ps(V, g_XMOne); - __m128 select0 = _mm_and_ps(comp, g_XMOne); - __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); - __m128 sign = _mm_or_ps(select0, select1); - comp = _mm_cmple_ps(absV, g_XMOne); - select0 = _mm_and_ps(comp, g_XMZero); - select1 = _mm_andnot_ps(comp, sign); - sign = _mm_or_ps(select0, select1); - select0 = _mm_and_ps(comp, V); - select1 = _mm_andnot_ps(comp, invV); - __m128 x = _mm_or_ps(select0, select1); - - __m128 x2 = _mm_mul_ps(x, x); - - // Compute polynomial approximation - const XMVECTOR TC1 = g_XMATanCoefficients1; - XMVECTOR vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(3, 3, 3, 3) ); - __m128 Result = _mm_mul_ps(vConstants, x2); - - vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(1, 1, 1, 1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(0, 0, 0, 0) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - const XMVECTOR TC0 = g_XMATanCoefficients0; - vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(3, 3, 3, 3) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(1, 1, 1, 1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(0, 0, 0, 0) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - Result = _mm_add_ps(Result, g_XMOne); - Result = _mm_mul_ps(Result, x); - __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); - result1 = _mm_sub_ps(result1, Result); - - comp = _mm_cmpeq_ps(sign, g_XMZero); - select0 = _mm_and_ps(comp, Result); - select1 
= _mm_andnot_ps(comp, result1);
-    Result = _mm_or_ps(select0, select1);
-    return Result;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorATan2
-(
-    FXMVECTOR Y,
-    FXMVECTOR X
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR Result;
-    Result.vector4_f32[0] = atan2f( Y.vector4_f32[0], X.vector4_f32[0] );
-    Result.vector4_f32[1] = atan2f( Y.vector4_f32[1], X.vector4_f32[1] );
-    Result.vector4_f32[2] = atan2f( Y.vector4_f32[2], X.vector4_f32[2] );
-    Result.vector4_f32[3] = atan2f( Y.vector4_f32[3], X.vector4_f32[3] );
-    return Result;
-#else
-
-    // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions:
-
-    //     Y == 0 and X is Negative         -> Pi with the sign of Y
-    //     Y == 0 and X is Positive         -> 0 with the sign of Y
-    //     Y != 0 and X == 0                -> Pi / 2 with the sign of Y
-    //     Y != 0 and X is Negative         -> atan(Y/X) + (Pi with the sign of Y)
-    //     X == -Infinity and Finite Y      -> Pi with the sign of Y
-    //     X == +Infinity and Finite Y      -> 0 with the sign of Y
-    //     Y == Infinity and X is Finite    -> Pi / 2 with the sign of Y
-    //     Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y
-    //     Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y
-
-    static const XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f};
-
-    XMVECTOR Zero = XMVectorZero();
-    XMVECTOR ATanResultValid = XMVectorTrueInt();
-
-    XMVECTOR Pi = XMVectorSplatX(ATan2Constants);
-    XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants);
-    XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants);
-    XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants);
-
-    XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero);
-    XMVECTOR XEqualsZero = XMVectorEqual(X, Zero);
-    XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v);
-    XIsPositive = XMVectorEqualInt(XIsPositive, Zero);
-    XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y);
-    XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X);
-
-    XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v);
-    Pi = XMVectorOrInt(Pi, YSign);
-    PiOverTwo = XMVectorOrInt(PiOverTwo, YSign);
-    PiOverFour = XMVectorOrInt(PiOverFour, YSign);
-    ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign);
-
-    XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive);
-    XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero);
-    XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero);
-    XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive);
-    XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity);
-    XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity);
-    ATanResultValid = XMVectorEqualInt(Result, ATanResultValid);
-
-    XMVECTOR V = XMVectorDivide(Y, X);
-
-    XMVECTOR R0 = XMVectorATan(V);
-
-    R1 = XMVectorSelect( Pi, g_XMNegativeZero, XIsPositive );
-    R2 = XMVectorAdd(R0, R1);
-
-    return XMVectorSelect(Result, R2, ATanResultValid);
-
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorSinEst
-(
-    FXMVECTOR V
-)
-{
-    // 7-degree minimax approximation
-
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR Result;
-    Result.vector4_f32[0] = sinf( V.vector4_f32[0] );
-    Result.vector4_f32[1] = sinf( V.vector4_f32[1] );
-    Result.vector4_f32[2] = sinf( V.vector4_f32[2] );
-    Result.vector4_f32[3] = sinf( V.vector4_f32[3] );
-    return Result;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Force the value within the bounds of pi
-    XMVECTOR x = XMVectorModAngles(V);
-
-    // Map in [-pi/2,pi/2] with 
sin(y) = sin(x). - uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); - uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - float32x4_t absx = vabsq_f32( x ); - float32x4_t rflx = vsubq_f32(c, x); - uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); - x = vbslq_f32( comp, x, rflx ); - - float32x4_t x2 = vmulq_f32(x, x); - - // Compute polynomial approximation - const XMVECTOR SEC = g_XMSinCoefficients1; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); - XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1); - - vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); - Result = vmlaq_f32(vConstants, Result, x2); - - Result = vmlaq_f32(g_XMOne, Result, x2); - Result = vmulq_f32(Result, x); - return Result; -#elif defined(_XM_SSE_INTRINSICS_) - // Force the value within the bounds of pi - XMVECTOR x = XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with sin(y) = sin(x). - __m128 sign = _mm_and_ps(x, g_XMNegativeZero); - __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - __m128 absx = _mm_andnot_ps(sign, x); // |x| - __m128 rflx = _mm_sub_ps(c, x); - __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); - __m128 select0 = _mm_and_ps(comp, x); - __m128 select1 = _mm_andnot_ps(comp, rflx); - x = _mm_or_ps(select0, select1); - - __m128 x2 = _mm_mul_ps(x, x); - - // Compute polynomial approximation - const XMVECTOR SEC = g_XMSinCoefficients1; - XMVECTOR vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(3, 3, 3, 3) ); - __m128 Result = _mm_mul_ps(vConstants, x2); - - vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(1, 1, 1, 1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - Result = _mm_add_ps(Result, g_XMOne); - Result = _mm_mul_ps(Result, x); - return Result; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorCosEst -( - FXMVECTOR V -) -{ - // 6-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = cosf( V.vector4_f32[0] ); - Result.vector4_f32[1] = cosf( V.vector4_f32[1] ); - Result.vector4_f32[2] = cosf( V.vector4_f32[2] ); - Result.vector4_f32[3] = cosf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Map V to x in [-pi,pi]. - XMVECTOR x = XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). - uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); - uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - float32x4_t absx = vabsq_f32( x ); - float32x4_t rflx = vsubq_f32(c, x); - uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); - x = vbslq_f32( comp, x, rflx ); - sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); - - float32x4_t x2 = vmulq_f32(x, x); - - // Compute polynomial approximation - const XMVECTOR CEC = g_XMCosCoefficients1; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); - XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1); - - vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); - Result = vmlaq_f32(vConstants, Result, x2); - - Result = vmlaq_f32(g_XMOne, Result, x2); - Result = vmulq_f32(Result, sign); - return Result; -#elif defined(_XM_SSE_INTRINSICS_) - // Map V to x in [-pi,pi]. - XMVECTOR x = XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
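-    // Worked example: x = 2.5 lies outside [-pi/2,pi/2], so c = pi, the
-    // reflected argument is rflx = pi - 2.5 ~= 0.6416, and sign becomes -1;
-    // cos(2.5) ~= -0.8011 is then evaluated as -cos(0.6416).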
- XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); - __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - __m128 absx = _mm_andnot_ps(sign, x); // |x| - __m128 rflx = _mm_sub_ps(c, x); - __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); - __m128 select0 = _mm_and_ps(comp, x); - __m128 select1 = _mm_andnot_ps(comp, rflx); - x = _mm_or_ps(select0, select1); - select0 = _mm_and_ps(comp, g_XMOne); - select1 = _mm_andnot_ps(comp, g_XMNegativeOne); - sign = _mm_or_ps(select0, select1); - - __m128 x2 = _mm_mul_ps(x, x); - - // Compute polynomial approximation - const XMVECTOR CEC = g_XMCosCoefficients1; - XMVECTOR vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(3, 3, 3, 3) ); - __m128 Result = _mm_mul_ps(vConstants, x2); - - vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(1, 1, 1, 1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - Result = _mm_add_ps(Result, g_XMOne); - Result = _mm_mul_ps(Result, sign); - return Result; -#endif -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline void XM_CALLCONV XMVectorSinCosEst -( - XMVECTOR* pSin, - XMVECTOR* pCos, - FXMVECTOR V -) -{ - assert(pSin != nullptr); - assert(pCos != nullptr); - - // 7/6-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Sin; - Sin.vector4_f32[0] = sinf( V.vector4_f32[0] ); - Sin.vector4_f32[1] = sinf( V.vector4_f32[1] ); - Sin.vector4_f32[2] = sinf( V.vector4_f32[2] ); - Sin.vector4_f32[3] = sinf( V.vector4_f32[3] ); - - XMVECTOR Cos; - Cos.vector4_f32[0] = cosf( V.vector4_f32[0] ); - Cos.vector4_f32[1] = cosf( V.vector4_f32[1] ); - Cos.vector4_f32[2] = cosf( V.vector4_f32[2] ); - Cos.vector4_f32[3] = cosf( V.vector4_f32[3] ); - - *pSin = Sin; - *pCos = Cos; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Force the value within the bounds of pi - XMVECTOR x = XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). - uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); - uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - float32x4_t absx = vabsq_f32( x ); - float32x4_t rflx = vsubq_f32(c, x); - uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); - x = vbslq_f32( comp, x, rflx ); - sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); - - float32x4_t x2 = vmulq_f32(x, x); - - // Compute polynomial approximation for sine - const XMVECTOR SEC = g_XMSinCoefficients1; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); - XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1); - - vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); - Result = vmlaq_f32(vConstants, Result, x2); - - Result = vmlaq_f32(g_XMOne, Result, x2); - *pSin = vmulq_f32(Result, x); - - // Compute polynomial approximation - const XMVECTOR CEC = g_XMCosCoefficients1; - vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); - Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1); - - vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); - Result = vmlaq_f32(vConstants, Result, x2); - - Result = vmlaq_f32(g_XMOne, Result, x2); - *pCos = vmulq_f32(Result, sign); -#elif defined(_XM_SSE_INTRINSICS_) - // Force the value within the bounds of pi - XMVECTOR x = XMVectorModAngles(V); - - // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). 
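-    // Only the cosine needs the sign below: sin(pi - x) = sin(x), so the
-    // reduced argument already yields the correct sine, while
-    // cos(pi - x) = -cos(x) must be corrected by the sign.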
- XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); - __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 - __m128 absx = _mm_andnot_ps(sign, x); // |x| - __m128 rflx = _mm_sub_ps(c, x); - __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); - __m128 select0 = _mm_and_ps(comp, x); - __m128 select1 = _mm_andnot_ps(comp, rflx); - x = _mm_or_ps(select0, select1); - select0 = _mm_and_ps(comp, g_XMOne); - select1 = _mm_andnot_ps(comp, g_XMNegativeOne); - sign = _mm_or_ps(select0, select1); - - __m128 x2 = _mm_mul_ps(x, x); - - // Compute polynomial approximation for sine - const XMVECTOR SEC = g_XMSinCoefficients1; - XMVECTOR vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(3, 3, 3, 3) ); - __m128 Result = _mm_mul_ps(vConstants, x2); - - vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(1, 1, 1, 1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - Result = _mm_add_ps(Result, g_XMOne); - Result = _mm_mul_ps(Result, x); - *pSin = Result; - - // Compute polynomial approximation for cosine - const XMVECTOR CEC = g_XMCosCoefficients1; - vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(3, 3, 3, 3) ); - Result = _mm_mul_ps(vConstants, x2); - - vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(1, 1, 1, 1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - Result = _mm_add_ps(Result, g_XMOne); - Result = _mm_mul_ps(Result, sign); - *pCos = Result; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorTanEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = tanf( V.vector4_f32[0] ); - Result.vector4_f32[1] = tanf( V.vector4_f32[1] ); - Result.vector4_f32[2] = tanf( V.vector4_f32[2] ); - Result.vector4_f32[3] = tanf( V.vector4_f32[3] ); - return Result; -#else - - XMVECTOR OneOverPi = XMVectorSplatW(g_XMTanEstCoefficients.v); - - XMVECTOR V1 = XMVectorMultiply(V, OneOverPi); - V1 = XMVectorRound(V1); - - V1 = XMVectorNegativeMultiplySubtract(g_XMPi.v, V1, V); - - XMVECTOR T0 = XMVectorSplatX(g_XMTanEstCoefficients.v); - XMVECTOR T1 = XMVectorSplatY(g_XMTanEstCoefficients.v); - XMVECTOR T2 = XMVectorSplatZ(g_XMTanEstCoefficients.v); - - XMVECTOR V2T2 = XMVectorNegativeMultiplySubtract(V1, V1, T2); - XMVECTOR V2 = XMVectorMultiply(V1, V1); - XMVECTOR V1T0 = XMVectorMultiply(V1, T0); - XMVECTOR V1T1 = XMVectorMultiply(V1, T1); - - XMVECTOR D = XMVectorReciprocalEst(V2T2); - XMVECTOR N = XMVectorMultiplyAdd(V2, V1T1, V1T0); - - return XMVectorMultiply(N, D); - -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorASinEst -( - FXMVECTOR V -) -{ - // 3-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = asinf( V.vector4_f32[0] ); - Result.vector4_f32[1] = asinf( V.vector4_f32[1] ); - Result.vector4_f32[2] = asinf( V.vector4_f32[2] ); - Result.vector4_f32[3] = asinf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); - float32x4_t x = vabsq_f32(V); - - // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
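-    // e.g. an input of 1.0000001f (from accumulated rounding upstream) would
-    // otherwise make 1 - |V| slightly negative and the square root NaN.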
- float32x4_t oneMValue = vsubq_f32(g_XMOne, x); - float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); - float32x4_t root = XMVectorSqrt(clampOneMValue); - - // Compute polynomial approximation - const XMVECTOR AEC = g_XMArcEstCoefficients; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); - XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AEC), 1 ); - - vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); - t0 = vmlaq_f32( vConstants, t0, x ); - - vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); - t0 = vmlaq_f32( vConstants, t0, x ); - t0 = vmulq_f32(t0, root); - - float32x4_t t1 = vsubq_f32(g_XMPi, t0); - t0 = vbslq_f32( nonnegative, t0, t1 ); - t0 = vsubq_f32(g_XMHalfPi, t0); - return t0; -#elif defined(_XM_SSE_INTRINSICS_) - __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); - __m128 mvalue = _mm_sub_ps(g_XMZero, V); - __m128 x = _mm_max_ps(V, mvalue); // |V| - - // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. - __m128 oneMValue = _mm_sub_ps(g_XMOne, x); - __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); - __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) - - // Compute polynomial approximation - const XMVECTOR AEC = g_XMArcEstCoefficients; - XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) ); - __m128 t0 = _mm_mul_ps(vConstants, x); - - vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, root); - - __m128 t1 = _mm_sub_ps(g_XMPi, t0); - t0 = _mm_and_ps(nonnegative, t0); - t1 = _mm_andnot_ps(nonnegative, t1); - t0 = _mm_or_ps(t0, t1); - t0 = _mm_sub_ps(g_XMHalfPi, t0); - return t0; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorACosEst -( - FXMVECTOR V -) -{ - // 3-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = acosf( V.vector4_f32[0] ); - Result.vector4_f32[1] = acosf( V.vector4_f32[1] ); - Result.vector4_f32[2] = acosf( V.vector4_f32[2] ); - Result.vector4_f32[3] = acosf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); - float32x4_t x = vabsq_f32(V); - - // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. - float32x4_t oneMValue = vsubq_f32(g_XMOne, x); - float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); - float32x4_t root = XMVectorSqrt(clampOneMValue); - - // Compute polynomial approximation - const XMVECTOR AEC = g_XMArcEstCoefficients; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); - XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AEC), 1 ); - - vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); - t0 = vmlaq_f32( vConstants, t0, x ); - - vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); - t0 = vmlaq_f32( vConstants, t0, x ); - t0 = vmulq_f32(t0, root); - - float32x4_t t1 = vsubq_f32(g_XMPi, t0); - t0 = vbslq_f32( nonnegative, t0, t1 ); - return t0; -#elif defined(_XM_SSE_INTRINSICS_) - __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); - __m128 mvalue = _mm_sub_ps(g_XMZero, V); - __m128 x = _mm_max_ps(V, mvalue); // |V| - - // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
- __m128 oneMValue = _mm_sub_ps(g_XMOne, x); - __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); - __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) - - // Compute polynomial approximation - const XMVECTOR AEC = g_XMArcEstCoefficients; - XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) ); - __m128 t0 = _mm_mul_ps(vConstants, x); - - vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, x); - - vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) ); - t0 = _mm_add_ps(t0, vConstants); - t0 = _mm_mul_ps(t0, root); - - __m128 t1 = _mm_sub_ps(g_XMPi, t0); - t0 = _mm_and_ps(nonnegative, t0); - t1 = _mm_andnot_ps(nonnegative, t1); - t0 = _mm_or_ps(t0, t1); - return t0; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorATanEst -( - FXMVECTOR V -) -{ - // 9-degree minimax approximation - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = atanf( V.vector4_f32[0] ); - Result.vector4_f32[1] = atanf( V.vector4_f32[1] ); - Result.vector4_f32[2] = atanf( V.vector4_f32[2] ); - Result.vector4_f32[3] = atanf( V.vector4_f32[3] ); - return Result; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t absV = vabsq_f32(V); - float32x4_t invV = XMVectorReciprocalEst(V); - uint32x4_t comp = vcgtq_f32(V, g_XMOne); - uint32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne ); - comp = vcleq_f32(absV, g_XMOne); - sign = vbslq_f32(comp, g_XMZero, sign ); - uint32x4_t x = vbslq_f32(comp, V, invV ); - - float32x4_t x2 = vmulq_f32(x, x); - - // Compute polynomial approximation - const XMVECTOR AEC = g_XMATanEstCoefficients1; - XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); - XMVECTOR Result = vmlaq_lane_f32( vConstants, x2, vget_high_f32(AEC), 1 ); - - vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); - Result = vmlaq_f32( vConstants, Result, x2 ); - - vConstants = vdupq_lane_f32(vget_low_f32( AEC), 0); - Result = vmlaq_f32( vConstants, Result, x2 ); - - // ATanEstCoefficients0 is already splatted - Result = vmlaq_f32( g_XMATanEstCoefficients0, Result, x2 ); - Result = vmulq_f32( Result, x ); - - float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi); - result1 = vsubq_f32(result1, Result); - - comp = vceqq_f32(sign, g_XMZero); - Result = vbslq_f32( comp, Result, result1 ); - return Result; -#elif defined(_XM_SSE_INTRINSICS_) - __m128 absV = XMVectorAbs(V); - __m128 invV = _mm_div_ps(g_XMOne, V); - __m128 comp = _mm_cmpgt_ps(V, g_XMOne); - __m128 select0 = _mm_and_ps(comp, g_XMOne); - __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); - __m128 sign = _mm_or_ps(select0, select1); - comp = _mm_cmple_ps(absV, g_XMOne); - select0 = _mm_and_ps(comp, g_XMZero); - select1 = _mm_andnot_ps(comp, sign); - sign = _mm_or_ps(select0, select1); - select0 = _mm_and_ps(comp, V); - select1 = _mm_andnot_ps(comp, invV); - __m128 x = _mm_or_ps(select0, select1); - - __m128 x2 = _mm_mul_ps(x, x); - - // Compute polynomial approximation - const XMVECTOR AEC = g_XMATanEstCoefficients1; - XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) ); - __m128 Result = _mm_mul_ps(vConstants, x2); - - vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 
1) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) ); - Result = _mm_add_ps(Result, vConstants); - Result = _mm_mul_ps(Result, x2); - - // ATanEstCoefficients0 is already splatted - Result = _mm_add_ps(Result, g_XMATanEstCoefficients0); - Result = _mm_mul_ps(Result, x); - __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); - result1 = _mm_sub_ps(result1, Result); - - comp = _mm_cmpeq_ps(sign, g_XMZero); - select0 = _mm_and_ps(comp, Result); - select1 = _mm_andnot_ps(comp, result1); - Result = _mm_or_ps(select0, select1); - return Result; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorATan2Est -( - FXMVECTOR Y, - FXMVECTOR X -) -{ -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - Result.vector4_f32[0] = atan2f( Y.vector4_f32[0], X.vector4_f32[0] ); - Result.vector4_f32[1] = atan2f( Y.vector4_f32[1], X.vector4_f32[1] ); - Result.vector4_f32[2] = atan2f( Y.vector4_f32[2], X.vector4_f32[2] ); - Result.vector4_f32[3] = atan2f( Y.vector4_f32[3], X.vector4_f32[3] ); - return Result; -#else - - static const XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, 2.3561944905f /* Pi*3/4 */}; - - const XMVECTOR Zero = XMVectorZero(); - XMVECTOR ATanResultValid = XMVectorTrueInt(); - - XMVECTOR Pi = XMVectorSplatX(ATan2Constants); - XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants); - XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants); - XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants); - - XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero); - XMVECTOR XEqualsZero = XMVectorEqual(X, Zero); - XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); - XIsPositive = XMVectorEqualInt(XIsPositive, Zero); - XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); - XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X); - - XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v); - Pi = XMVectorOrInt(Pi, YSign); - PiOverTwo = XMVectorOrInt(PiOverTwo, YSign); - PiOverFour = XMVectorOrInt(PiOverFour, YSign); - ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign); - - XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive); - XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero); - XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero); - XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); - XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); - XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity); - ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); - - XMVECTOR Reciprocal = XMVectorReciprocalEst(X); - XMVECTOR V = XMVectorMultiply(Y, Reciprocal); - XMVECTOR R0 = XMVectorATanEst(V); - - R1 = XMVectorSelect( Pi, g_XMNegativeZero, XIsPositive ); - R2 = XMVectorAdd(R0, R1); - - Result = XMVectorSelect(Result, R2, ATanResultValid); - - return Result; - -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorLerp -( - FXMVECTOR V0, - FXMVECTOR V1, - float t -) -{ - // V0 + t * (V1 - V0) - -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Scale = XMVectorReplicate(t); - XMVECTOR Length = XMVectorSubtract(V1, V0); - return XMVectorMultiplyAdd(Length, Scale, V0); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR L = vsubq_f32( V1, V0 ); - return vmlaq_n_f32( V0, L, t ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR L = _mm_sub_ps( V1, V0 ); - XMVECTOR S = _mm_set_ps1( t ); - XMVECTOR Result = _mm_mul_ps( 
L, S ); - return _mm_add_ps( Result, V0 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorLerpV -( - FXMVECTOR V0, - FXMVECTOR V1, - FXMVECTOR T -) -{ - // V0 + T * (V1 - V0) - -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Length = XMVectorSubtract(V1, V0); - return XMVectorMultiplyAdd(Length, T, V0); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR L = vsubq_f32( V1, V0 ); - return vmlaq_f32( V0, L, T ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR Length = _mm_sub_ps( V1, V0 ); - XMVECTOR Result = _mm_mul_ps( Length, T ); - return _mm_add_ps( Result, V0 ); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorHermite -( - FXMVECTOR Position0, - FXMVECTOR Tangent0, - FXMVECTOR Position1, - GXMVECTOR Tangent1, - float t -) -{ - // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + - // (t^3 - 2 * t^2 + t) * Tangent0 + - // (-2 * t^3 + 3 * t^2) * Position1 + - // (t^3 - t^2) * Tangent1 - -#if defined(_XM_NO_INTRINSICS_) - - float t2 = t * t; - float t3 = t * t2; - - XMVECTOR P0 = XMVectorReplicate(2.0f * t3 - 3.0f * t2 + 1.0f); - XMVECTOR T0 = XMVectorReplicate(t3 - 2.0f * t2 + t); - XMVECTOR P1 = XMVectorReplicate(-2.0f * t3 + 3.0f * t2); - XMVECTOR T1 = XMVectorReplicate(t3 - t2); - - XMVECTOR Result = XMVectorMultiply(P0, Position0); - Result = XMVectorMultiplyAdd(T0, Tangent0, Result); - Result = XMVectorMultiplyAdd(P1, Position1, Result); - Result = XMVectorMultiplyAdd(T1, Tangent1, Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float t2 = t * t; - float t3 = t * t2; - - float p0 = 2.0f * t3 - 3.0f * t2 + 1.0f; - float t0 = t3 - 2.0f * t2 + t; - float p1 = -2.0f * t3 + 3.0f * t2; - float t1 = t3 - t2; - - XMVECTOR vResult = vmulq_n_f32(Position0, p0 ); - vResult = vmlaq_n_f32( vResult, Tangent0, t0 ); - vResult = vmlaq_n_f32( vResult, Position1, p1 ); - vResult = vmlaq_n_f32( vResult, Tangent1, t1 ); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - float t2 = t * t; - float t3 = t * t2; - - XMVECTOR P0 = _mm_set_ps1(2.0f * t3 - 3.0f * t2 + 1.0f); - XMVECTOR T0 = _mm_set_ps1(t3 - 2.0f * t2 + t); - XMVECTOR P1 = _mm_set_ps1(-2.0f * t3 + 3.0f * t2); - XMVECTOR T1 = _mm_set_ps1(t3 - t2); - - XMVECTOR vResult = _mm_mul_ps(P0, Position0); - XMVECTOR vTemp = _mm_mul_ps(T0, Tangent0); - vResult = _mm_add_ps(vResult,vTemp); - vTemp = _mm_mul_ps(P1, Position1); - vResult = _mm_add_ps(vResult,vTemp); - vTemp = _mm_mul_ps(T1, Tangent1); - vResult = _mm_add_ps(vResult,vTemp); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorHermiteV -( - FXMVECTOR Position0, - FXMVECTOR Tangent0, - FXMVECTOR Position1, - GXMVECTOR Tangent1, - HXMVECTOR T -) -{ - // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + - // (t^3 - 2 * t^2 + t) * Tangent0 + - // (-2 * t^3 + 3 * t^2) * Position1 + - // (t^3 - t^2) * Tangent1 - -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR T2 = XMVectorMultiply(T, T); - XMVECTOR T3 = XMVectorMultiply(T , T2); - - XMVECTOR P0 = XMVectorReplicate(2.0f * T3.vector4_f32[0] - 3.0f * T2.vector4_f32[0] + 1.0f); - XMVECTOR T0 = XMVectorReplicate(T3.vector4_f32[1] - 2.0f * T2.vector4_f32[1] + T.vector4_f32[1]); - XMVECTOR P1 = XMVectorReplicate(-2.0f * T3.vector4_f32[2] + 3.0f * T2.vector4_f32[2]); - XMVECTOR T1 = XMVectorReplicate(T3.vector4_f32[3] - T2.vector4_f32[3]); - - XMVECTOR Result = 
XMVectorMultiply(P0, Position0); - Result = XMVectorMultiplyAdd(T0, Tangent0, Result); - Result = XMVectorMultiplyAdd(P1, Position1, Result); - Result = XMVectorMultiplyAdd(T1, Tangent1, Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 CatMulT2 = {-3.0f,-2.0f,3.0f,-1.0f}; - static const XMVECTORF32 CatMulT3 = {2.0f,1.0f,-2.0f,1.0f}; - - XMVECTOR T2 = vmulq_f32(T,T); - XMVECTOR T3 = vmulq_f32(T,T2); - // Mul by the constants against t^2 - T2 = vmulq_f32(T2,CatMulT2); - // Mul by the constants against t^3 - T3 = vmlaq_f32(T2, T3, CatMulT3 ); - // T3 now has the pre-result. - // I need to add t.y only - T2 = vandq_u32(T,g_XMMaskY); - T3 = vaddq_f32(T3,T2); - // Add 1.0f to x - T3 = vaddq_f32(T3,g_XMIdentityR0); - // Now, I have the constants created - // Mul the x constant to Position0 - XMVECTOR vResult = vmulq_lane_f32( Position0, vget_low_f32( T3 ), 0 ); // T3[0] - // Mul the y constant to Tangent0 - vResult = vmlaq_lane_f32(vResult, Tangent0, vget_low_f32( T3 ), 1 ); // T3[1] - // Mul the z constant to Position1 - vResult = vmlaq_lane_f32(vResult, Position1, vget_high_f32( T3 ), 0 ); // T3[2] - // Mul the w constant to Tangent1 - vResult = vmlaq_lane_f32(vResult, Tangent1, vget_high_f32( T3 ), 1 ); // T3[3] - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 CatMulT2 = {-3.0f,-2.0f,3.0f,-1.0f}; - static const XMVECTORF32 CatMulT3 = {2.0f,1.0f,-2.0f,1.0f}; - - XMVECTOR T2 = _mm_mul_ps(T,T); - XMVECTOR T3 = _mm_mul_ps(T,T2); - // Mul by the constants against t^2 - T2 = _mm_mul_ps(T2,CatMulT2); - // Mul by the constants against t^3 - T3 = _mm_mul_ps(T3,CatMulT3); - // T3 now has the pre-result. - T3 = _mm_add_ps(T3,T2); - // I need to add t.y only - T2 = _mm_and_ps(T,g_XMMaskY); - T3 = _mm_add_ps(T3,T2); - // Add 1.0f to x - T3 = _mm_add_ps(T3,g_XMIdentityR0); - // Now, I have the constants created - // Mul the x constant to Position0 - XMVECTOR vResult = XM_PERMUTE_PS(T3,_MM_SHUFFLE(0,0,0,0)); - vResult = _mm_mul_ps(vResult,Position0); - // Mul the y constant to Tangent0 - T2 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(1,1,1,1)); - T2 = _mm_mul_ps(T2,Tangent0); - vResult = _mm_add_ps(vResult,T2); - // Mul the z constant to Position1 - T2 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(2,2,2,2)); - T2 = _mm_mul_ps(T2,Position1); - vResult = _mm_add_ps(vResult,T2); - // Mul the w constant to Tangent1 - T3 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(3,3,3,3)); - T3 = _mm_mul_ps(T3,Tangent1); - vResult = _mm_add_ps(vResult,T3); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorCatmullRom -( - FXMVECTOR Position0, - FXMVECTOR Position1, - FXMVECTOR Position2, - GXMVECTOR Position3, - float t -) -{ - // Result = ((-t^3 + 2 * t^2 - t) * Position0 + - // (3 * t^3 - 5 * t^2 + 2) * Position1 + - // (-3 * t^3 + 4 * t^2 + t) * Position2 + - // (t^3 - t^2) * Position3) * 0.5 - -#if defined(_XM_NO_INTRINSICS_) - - float t2 = t * t; - float t3 = t * t2; - - XMVECTOR P0 = XMVectorReplicate((-t3 + 2.0f * t2 - t) * 0.5f); - XMVECTOR P1 = XMVectorReplicate((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); - XMVECTOR P2 = XMVectorReplicate((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); - XMVECTOR P3 = XMVectorReplicate((t3 - t2) * 0.5f); - - XMVECTOR Result = XMVectorMultiply(P0, Position0); - Result = XMVectorMultiplyAdd(P1, Position1, Result); - Result = XMVectorMultiplyAdd(P2, Position2, Result); - Result = XMVectorMultiplyAdd(P3, Position3, Result); - - return Result; - -#elif 
defined(_XM_ARM_NEON_INTRINSICS_) - float t2 = t * t; - float t3 = t * t2; - - float p0 = (-t3 + 2.0f * t2 - t) * 0.5f; - float p1 = (3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f; - float p2 = (-3.0f * t3 + 4.0f * t2 + t) * 0.5f; - float p3 = (t3 - t2) * 0.5f; - - XMVECTOR P1 = vmulq_n_f32(Position1, p1); - XMVECTOR P0 = vmlaq_n_f32(P1, Position0, p0); - XMVECTOR P3 = vmulq_n_f32(Position3, p3); - XMVECTOR P2 = vmlaq_n_f32(P3, Position2, p2); - P0 = vaddq_f32(P0,P2); - return P0; -#elif defined(_XM_SSE_INTRINSICS_) - float t2 = t * t; - float t3 = t * t2; - - XMVECTOR P0 = _mm_set_ps1((-t3 + 2.0f * t2 - t) * 0.5f); - XMVECTOR P1 = _mm_set_ps1((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); - XMVECTOR P2 = _mm_set_ps1((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); - XMVECTOR P3 = _mm_set_ps1((t3 - t2) * 0.5f); - - P0 = _mm_mul_ps(P0, Position0); - P1 = _mm_mul_ps(P1, Position1); - P2 = _mm_mul_ps(P2, Position2); - P3 = _mm_mul_ps(P3, Position3); - P0 = _mm_add_ps(P0,P1); - P2 = _mm_add_ps(P2,P3); - P0 = _mm_add_ps(P0,P2); - return P0; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorCatmullRomV -( - FXMVECTOR Position0, - FXMVECTOR Position1, - FXMVECTOR Position2, - GXMVECTOR Position3, - HXMVECTOR T -) -{ -#if defined(_XM_NO_INTRINSICS_) - float fx = T.vector4_f32[0]; - float fy = T.vector4_f32[1]; - float fz = T.vector4_f32[2]; - float fw = T.vector4_f32[3]; - XMVECTOR vResult; - vResult.vector4_f32[0] = 0.5f*((-fx*fx*fx+2*fx*fx-fx)*Position0.vector4_f32[0] - + (3*fx*fx*fx-5*fx*fx+2)*Position1.vector4_f32[0] - + (-3*fx*fx*fx+4*fx*fx+fx)*Position2.vector4_f32[0] - + (fx*fx*fx-fx*fx)*Position3.vector4_f32[0]); - vResult.vector4_f32[1] = 0.5f*((-fy*fy*fy+2*fy*fy-fy)*Position0.vector4_f32[1] - + (3*fy*fy*fy-5*fy*fy+2)*Position1.vector4_f32[1] - + (-3*fy*fy*fy+4*fy*fy+fy)*Position2.vector4_f32[1] - + (fy*fy*fy-fy*fy)*Position3.vector4_f32[1]); - vResult.vector4_f32[2] = 0.5f*((-fz*fz*fz+2*fz*fz-fz)*Position0.vector4_f32[2] - + (3*fz*fz*fz-5*fz*fz+2)*Position1.vector4_f32[2] - + (-3*fz*fz*fz+4*fz*fz+fz)*Position2.vector4_f32[2] - + (fz*fz*fz-fz*fz)*Position3.vector4_f32[2]); - vResult.vector4_f32[3] = 0.5f*((-fw*fw*fw+2*fw*fw-fw)*Position0.vector4_f32[3] - + (3*fw*fw*fw-5*fw*fw+2)*Position1.vector4_f32[3] - + (-3*fw*fw*fw+4*fw*fw+fw)*Position2.vector4_f32[3] - + (fw*fw*fw-fw*fw)*Position3.vector4_f32[3]); - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Catmul2 = {2.0f,2.0f,2.0f,2.0f}; - static const XMVECTORF32 Catmul3 = {3.0f,3.0f,3.0f,3.0f}; - static const XMVECTORF32 Catmul4 = {4.0f,4.0f,4.0f,4.0f}; - static const XMVECTORF32 Catmul5 = {5.0f,5.0f,5.0f,5.0f}; - // Cache T^2 and T^3 - XMVECTOR T2 = vmulq_f32(T,T); - XMVECTOR T3 = vmulq_f32(T,T2); - // Perform the Position0 term - XMVECTOR vResult = vaddq_f32(T2,T2); - vResult = vsubq_f32(vResult,T); - vResult = vsubq_f32(vResult,T3); - vResult = vmulq_f32(vResult,Position0); - // Perform the Position1 term and add - XMVECTOR vTemp = vmulq_f32(T3,Catmul3); - vTemp = vmlsq_f32(vTemp, T2, Catmul5); - vTemp = vaddq_f32(vTemp,Catmul2); - vResult = vmlaq_f32(vResult, vTemp, Position1); - // Perform the Position2 term and add - vTemp = vmulq_f32(T2,Catmul4); - vTemp = vmlsq_f32(vTemp, T3, Catmul3); - vTemp = vaddq_f32(vTemp,T); - vResult = vmlaq_f32(vResult, vTemp, Position2); - // Position3 is the last term - T3 = vsubq_f32(T3,T2); - vResult = vmlaq_f32(vResult, T3, Position3); - // Multiply by 0.5f and exit - vResult = vmulq_f32(vResult,g_XMOneHalf); 
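-    // The single 0.5f scale above, applied once to the accumulated result
-    // rather than to each of the four basis weights, mirrors the 0.5 factor
-    // of the scalar formula with one multiply instead of four.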
- return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Catmul2 = {2.0f,2.0f,2.0f,2.0f}; - static const XMVECTORF32 Catmul3 = {3.0f,3.0f,3.0f,3.0f}; - static const XMVECTORF32 Catmul4 = {4.0f,4.0f,4.0f,4.0f}; - static const XMVECTORF32 Catmul5 = {5.0f,5.0f,5.0f,5.0f}; - // Cache T^2 and T^3 - XMVECTOR T2 = _mm_mul_ps(T,T); - XMVECTOR T3 = _mm_mul_ps(T,T2); - // Perform the Position0 term - XMVECTOR vResult = _mm_add_ps(T2,T2); - vResult = _mm_sub_ps(vResult,T); - vResult = _mm_sub_ps(vResult,T3); - vResult = _mm_mul_ps(vResult,Position0); - // Perform the Position1 term and add - XMVECTOR vTemp = _mm_mul_ps(T3,Catmul3); - XMVECTOR vTemp2 = _mm_mul_ps(T2,Catmul5); - vTemp = _mm_sub_ps(vTemp,vTemp2); - vTemp = _mm_add_ps(vTemp,Catmul2); - vTemp = _mm_mul_ps(vTemp,Position1); - vResult = _mm_add_ps(vResult,vTemp); - // Perform the Position2 term and add - vTemp = _mm_mul_ps(T2,Catmul4); - vTemp2 = _mm_mul_ps(T3,Catmul3); - vTemp = _mm_sub_ps(vTemp,vTemp2); - vTemp = _mm_add_ps(vTemp,T); - vTemp = _mm_mul_ps(vTemp,Position2); - vResult = _mm_add_ps(vResult,vTemp); - // Position3 is the last term - T3 = _mm_sub_ps(T3,T2); - T3 = _mm_mul_ps(T3,Position3); - vResult = _mm_add_ps(vResult,T3); - // Multiply by 0.5f and exit - vResult = _mm_mul_ps(vResult,g_XMOneHalf); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorBaryCentric -( - FXMVECTOR Position0, - FXMVECTOR Position1, - FXMVECTOR Position2, - float f, - float g -) -{ - // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) - -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR P10 = XMVectorSubtract(Position1, Position0); - XMVECTOR ScaleF = XMVectorReplicate(f); - - XMVECTOR P20 = XMVectorSubtract(Position2, Position0); - XMVECTOR ScaleG = XMVectorReplicate(g); - - XMVECTOR Result = XMVectorMultiplyAdd(P10, ScaleF, Position0); - Result = XMVectorMultiplyAdd(P20, ScaleG, Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR R1 = vsubq_f32(Position1,Position0); - XMVECTOR R2 = vsubq_f32(Position2,Position0); - R1 = vmlaq_n_f32( Position0, R1, f); - return vmlaq_n_f32( R1, R2, g ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR R1 = _mm_sub_ps(Position1,Position0); - XMVECTOR SF = _mm_set_ps1(f); - XMVECTOR R2 = _mm_sub_ps(Position2,Position0); - XMVECTOR SG = _mm_set_ps1(g); - R1 = _mm_mul_ps(R1,SF); - R2 = _mm_mul_ps(R2,SG); - R1 = _mm_add_ps(R1,Position0); - R1 = _mm_add_ps(R1,R2); - return R1; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVectorBaryCentricV -( - FXMVECTOR Position0, - FXMVECTOR Position1, - FXMVECTOR Position2, - GXMVECTOR F, - HXMVECTOR G -) -{ - // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) - -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR P10 = XMVectorSubtract(Position1, Position0); - XMVECTOR P20 = XMVectorSubtract(Position2, Position0); - - XMVECTOR Result = XMVectorMultiplyAdd(P10, F, Position0); - Result = XMVectorMultiplyAdd(P20, G, Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR R1 = vsubq_f32(Position1,Position0); - XMVECTOR R2 = vsubq_f32(Position2,Position0); - R1 = vmlaq_f32( Position0, R1, F ); - return vmlaq_f32( R1, R2, G); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR R1 = _mm_sub_ps(Position1,Position0); - XMVECTOR R2 = _mm_sub_ps(Position2,Position0); - R1 = _mm_mul_ps(R1,F); - R2 
= _mm_mul_ps(R2,G); - R1 = _mm_add_ps(R1,Position0); - R1 = _mm_add_ps(R1,R2); - return R1; -#endif -} - -/**************************************************************************** - * - * 2D Vector - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ -// Comparison operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector2Equal -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) ); - return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); -// z and w are don't care - return (((_mm_movemask_ps(vTemp)&3)==3) != 0); -#endif -} - - -//------------------------------------------------------------------------------ - -inline uint32_t XM_CALLCONV XMVector2EqualR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - uint32_t CR = 0; - if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && - (V1.vector4_f32[1] == V2.vector4_f32[1])) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && - (V1.vector4_f32[1] != V2.vector4_f32[1])) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) ); - uint64_t r = vget_lane_u64( vTemp, 0 ); - uint32_t CR = 0; - if ( r == 0xFFFFFFFFFFFFFFFFU ) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); -// z and w are don't care - int iTest = _mm_movemask_ps(vTemp)&3; - uint32_t CR = 0; - if (iTest==3) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector2EqualInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) ); - return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); - return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&3)==3) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline uint32_t XM_CALLCONV XMVector2EqualIntR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - uint32_t CR = 0; - if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && - (V1.vector4_u32[1] == V2.vector4_u32[1])) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && - (V1.vector4_u32[1] != V2.vector4_u32[1])) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) ); - uint64_t r = vget_lane_u64( vTemp, 0 ); - uint32_t 
CR = 0;
-    if ( r == 0xFFFFFFFFFFFFFFFFU )
-    {
-        CR = XM_CRMASK_CR6TRUE;
-    }
-    else if ( !r )
-    {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
-    int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp))&3;
-    uint32_t CR = 0;
-    if (iTest==3)
-    {
-        CR = XM_CRMASK_CR6TRUE;
-    }
-    else if (!iTest)
-    {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2NearEqual
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2,
-    FXMVECTOR Epsilon
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    float dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]);
-    float dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]);
-    return ((dx <= Epsilon.vector4_f32[0]) &&
-            (dy <= Epsilon.vector4_f32[1]));
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t vDelta = vsub_f32(vget_low_f32(V1), vget_low_f32(V2));
-    uint32x2_t vTemp = vacle_f32( vDelta, vget_low_f32(Epsilon) );
-    uint64_t r = vget_lane_u64( vTemp, 0 );
-    return ( r == 0xFFFFFFFFFFFFFFFFU );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Get the difference
-    XMVECTOR vDelta = _mm_sub_ps(V1,V2);
-    // Get the absolute value of the difference
-    XMVECTOR vTemp = _mm_setzero_ps();
-    vTemp = _mm_sub_ps(vTemp,vDelta);
-    vTemp = _mm_max_ps(vTemp,vDelta);
-    vTemp = _mm_cmple_ps(vTemp,Epsilon);
-    // z and w are don't care
-    return (((_mm_movemask_ps(vTemp)&3)==0x3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2NotEqual
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) );
-    return ( vget_lane_u64( vTemp, 0 ) != 0xFFFFFFFFFFFFFFFFU );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
-// z and w are don't care
-    return (((_mm_movemask_ps(vTemp)&3)!=3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2NotEqualInt
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) );
-    return ( vget_lane_u64( vTemp, 0 ) != 0xFFFFFFFFFFFFFFFFU );
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
-    return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&3)!=3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2Greater
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vTemp = vcgt_f32( vget_low_f32(V1), vget_low_f32(V2) );
-    return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
-// z and w are don't care
-    return (((_mm_movemask_ps(vTemp)&3)==3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
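-// The R variants below return a CR6-style condition mask rather than a bool,
-// so one comparison can feed several tests. Illustrative usage sketch
-// (v1 and v2 are arbitrary XMVECTOR values):
-//
-//     uint32_t cr = XMVector2GreaterR(v1, v2);
-//     if (XMComparisonAllTrue(cr))  { /* v1.x > v2.x and v1.y > v2.y */ }
-//     if (XMComparisonAllFalse(cr)) { /* neither component is greater */ }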
- -inline uint32_t XM_CALLCONV XMVector2GreaterR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - uint32_t CR = 0; - if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && - (V1.vector4_f32[1] > V2.vector4_f32[1])) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && - (V1.vector4_f32[1] <= V2.vector4_f32[1])) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vTemp = vcgt_f32( vget_low_f32(V1), vget_low_f32(V2) ); - uint64_t r = vget_lane_u64( vTemp, 0 ); - uint32_t CR = 0; - if ( r == 0xFFFFFFFFFFFFFFFFU ) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); - int iTest = _mm_movemask_ps(vTemp)&3; - uint32_t CR = 0; - if (iTest==3) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector2GreaterOrEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vTemp = vcge_f32( vget_low_f32(V1), vget_low_f32(V2) ); - return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); - return (((_mm_movemask_ps(vTemp)&3)==3) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline uint32_t XM_CALLCONV XMVector2GreaterOrEqualR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - uint32_t CR = 0; - if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && - (V1.vector4_f32[1] >= V2.vector4_f32[1])) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && - (V1.vector4_f32[1] < V2.vector4_f32[1])) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vTemp = vcge_f32( vget_low_f32(V1), vget_low_f32(V2) ); - uint64_t r = vget_lane_u64( vTemp, 0 ); - uint32_t CR = 0; - if ( r == 0xFFFFFFFFFFFFFFFFU ) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); - int iTest = _mm_movemask_ps(vTemp)&3; - uint32_t CR = 0; - if (iTest == 3) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector2Less -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vTemp = vclt_f32( vget_low_f32(V1), vget_low_f32(V2) ); - return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); - return (((_mm_movemask_ps(vTemp)&3)==3) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector2LessOrEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && 
(V1.vector4_f32[1] <= V2.vector4_f32[1])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vTemp = vcle_f32( vget_low_f32(V1), vget_low_f32(V2) );
-    return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmple_ps(V1,V2);
-    return (((_mm_movemask_ps(vTemp)&3)==3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2InBounds
-(
-    FXMVECTOR V,
-    FXMVECTOR Bounds
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) &&
-        (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32( V );
-    float32x2_t B = vget_low_f32( Bounds );
-    // Test if less than or equal
-    uint32x2_t ivTemp1 = vcle_f32(VL,B);
-    // Negate the bounds
-    float32x2_t vTemp2 = vneg_f32(B);
-    // Test if greater or equal (Reversed)
-    uint32x2_t ivTemp2 = vcle_f32(vTemp2,VL);
-    // Blend answers
-    ivTemp1 = vand_u32(ivTemp1,ivTemp2);
-    // x and y in bounds?
-    return ( vget_lane_u64( ivTemp1, 0 ) == 0xFFFFFFFFFFFFFFFFU );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Test if less than or equal
-    XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
-    // Negate the bounds
-    XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
-    // Test if greater or equal (Reversed)
-    vTemp2 = _mm_cmple_ps(vTemp2,V);
-    // Blend answers
-    vTemp1 = _mm_and_ps(vTemp1,vTemp2);
-    // x and y in bounds? (z and w are don't care)
-    return (((_mm_movemask_ps(vTemp1)&0x3)==0x3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2IsNaN
-(
-    FXMVECTOR V
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    return (XMISNAN(V.vector4_f32[0]) ||
-            XMISNAN(V.vector4_f32[1]));
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32( V );
-    // Test against itself. NaN is always not equal
-    uint32x2_t vTempNan = vceq_f32( VL, VL );
-    // If x or y are NaN, the mask is zero
-    return ( vget_lane_u64( vTempNan, 0 ) != 0xFFFFFFFFFFFFFFFFU );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Test against itself. NaN is always not equal
-    XMVECTOR vTempNan = _mm_cmpneq_ps(V,V);
-    // If x or y are NaN, the mask is non-zero
-    return ((_mm_movemask_ps(vTempNan)&3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2IsInfinite
-(
-    FXMVECTOR V
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-
-    return (XMISINF(V.vector4_f32[0]) ||
-            XMISINF(V.vector4_f32[1]));
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Mask off the sign bit
-    uint32x2_t vTemp = vand_u32( vget_low_f32( V ), vget_low_f32( g_XMAbsMask ) );
-    // Compare to infinity
-    vTemp = vceq_f32(vTemp, vget_low_f32( g_XMInfinity) );
-    // If x or y are infinity, the mask is non-zero
-    return vget_lane_u64( vTemp, 0 ) != 0;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Mask off the sign bit
-    __m128 vTemp = _mm_and_ps(V,g_XMAbsMask);
-    // Compare to infinity
-    vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity);
-    // If x or y are infinity, the mask is non-zero
- return ((_mm_movemask_ps(vTemp)&3) != 0); -#endif -} - -//------------------------------------------------------------------------------ -// Computation operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2Dot -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = - Result.vector4_f32[1] = - Result.vector4_f32[2] = - Result.vector4_f32[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Perform the dot product on x and y - float32x2_t vTemp = vmul_f32( vget_low_f32(V1), vget_low_f32(V2) ); - vTemp = vpadd_f32( vTemp, vTemp ); - return vcombine_f32( vTemp, vTemp ); -#elif defined(_XM_SSE4_INTRINSICS_) - return _mm_dp_ps( V1, V2, 0x3f ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vDot = _mm_mul_ps(V1, V2); - vDot = _mm_hadd_ps(vDot, vDot); - vDot = _mm_moveldup_ps(vDot); - return vDot; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x and y - XMVECTOR vLengthSq = _mm_mul_ps(V1,V2); - // vTemp has y splatted - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); - // x+y - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2Cross -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - // [ V1.x*V2.y - V1.y*V2.x, V1.x*V2.y - V1.y*V2.x ] - -#if defined(_XM_NO_INTRINSICS_) - float fCross = (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]); - XMVECTOR vResult; - vResult.vector4_f32[0] = - vResult.vector4_f32[1] = - vResult.vector4_f32[2] = - vResult.vector4_f32[3] = fCross; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Negate = { 1.f, -1.f, 0, 0 }; - - float32x2_t vTemp = vmul_f32( vget_low_f32( V1 ), vrev64_f32( vget_low_f32( V2 ) ) ); - vTemp = vmul_f32( vTemp, vget_low_f32( Negate ) ); - vTemp = vpadd_f32( vTemp, vTemp ); - return vcombine_f32( vTemp, vTemp ); -#elif defined(_XM_SSE_INTRINSICS_) - // Swap x and y - XMVECTOR vResult = XM_PERMUTE_PS(V2,_MM_SHUFFLE(0,1,0,1)); - // Perform the muls - vResult = _mm_mul_ps(vResult,V1); - // Splat y - XMVECTOR vTemp = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1)); - // Sub the values - vResult = _mm_sub_ss(vResult,vTemp); - // Splat the cross product - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,0,0,0)); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2LengthSq -( - FXMVECTOR V -) -{ - return XMVector2Dot(V, V); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result = XMVector2LengthSq(V); - Result = XMVectorReciprocalSqrtEst(Result); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t VL = vget_low_f32(V); - // Dot2 - float32x2_t vTemp = vmul_f32( VL, VL ); - vTemp = vpadd_f32( vTemp, vTemp ); - // Reciprocal sqrt (estimate) - vTemp = vrsqrte_f32( vTemp ); - return vcombine_f32( vTemp, vTemp ); -#elif defined(_XM_SSE4_INTRINSICS_) - 
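// The 0x3f immediate steers _mm_dp_ps: the high nibble (0x30) includes only
// the x and y lanes in the multiply, and the low nibble (0x0f) broadcasts the
// summed dot product into all four lanes of the result.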
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); - return _mm_rsqrt_ps( vTemp ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_rsqrt_ss(vTemp); - vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x and y - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has y splatted - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); - // x+y - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vLengthSq = _mm_rsqrt_ss(vLengthSq); - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result = XMVector2LengthSq(V); - Result = XMVectorReciprocalSqrt(Result); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t VL = vget_low_f32(V); - // Dot2 - float32x2_t vTemp = vmul_f32( VL, VL ); - vTemp = vpadd_f32( vTemp, vTemp ); - // Reciprocal sqrt - float32x2_t S0 = vrsqrte_f32(vTemp); - float32x2_t P0 = vmul_f32( vTemp, S0 ); - float32x2_t R0 = vrsqrts_f32( P0, S0 ); - float32x2_t S1 = vmul_f32( S0, R0 ); - float32x2_t P1 = vmul_f32( vTemp, S1 ); - float32x2_t R1 = vrsqrts_f32( P1, S1 ); - float32x2_t Result = vmul_f32( S1, R1 ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); - XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); - return _mm_div_ps( g_XMOne, vLengthSq ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_sqrt_ss(vTemp); - vLengthSq = _mm_div_ss(g_XMOne, vLengthSq); - vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x and y - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has y splatted - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); - // x+y - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vLengthSq = _mm_sqrt_ss(vLengthSq); - vLengthSq = _mm_div_ss(g_XMOne,vLengthSq); - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2LengthEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result = XMVector2LengthSq(V); - Result = XMVectorSqrtEst(Result); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t VL = vget_low_f32(V); - // Dot2 - float32x2_t vTemp = vmul_f32( VL, VL ); - vTemp = vpadd_f32( vTemp, vTemp ); - const float32x2_t zero = vdup_n_f32(0); - uint32x2_t VEqualsZero = vceq_f32( vTemp, zero ); - // Sqrt (estimate) - float32x2_t Result = vrsqrte_f32( vTemp ); - Result = vmul_f32( vTemp, Result ); - Result = vbsl_f32( VEqualsZero, zero, Result ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); - return _mm_sqrt_ps( vTemp ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_sqrt_ss(vTemp); - vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); - return 
vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x and y - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has y splatted - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); - // x+y - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vLengthSq = _mm_sqrt_ss(vLengthSq); - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2Length -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result = XMVector2LengthSq(V); - Result = XMVectorSqrt(Result); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t VL = vget_low_f32(V); - // Dot2 - float32x2_t vTemp = vmul_f32( VL, VL ); - vTemp = vpadd_f32( vTemp, vTemp ); - const float32x2_t zero = vdup_n_f32(0); - uint32x2_t VEqualsZero = vceq_f32( vTemp, zero ); - // Sqrt - float32x2_t S0 = vrsqrte_f32( vTemp ); - float32x2_t P0 = vmul_f32( vTemp, S0 ); - float32x2_t R0 = vrsqrts_f32( P0, S0 ); - float32x2_t S1 = vmul_f32( S0, R0 ); - float32x2_t P1 = vmul_f32( vTemp, S1 ); - float32x2_t R1 = vrsqrts_f32( P1, S1 ); - float32x2_t Result = vmul_f32( S1, R1 ); - Result = vmul_f32( vTemp, Result ); - Result = vbsl_f32( VEqualsZero, zero, Result ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); - return _mm_sqrt_ps( vTemp ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_sqrt_ss(vTemp); - vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x and y - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has y splatted - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); - // x+y - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - vLengthSq = _mm_sqrt_ps(vLengthSq); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ -// XMVector2NormalizeEst uses a reciprocal estimate and -// returns QNaN on zero and infinite vectors. 
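// Illustrative trade-off with hypothetical values (caller code, not part of
// the library): prefer the Est form only where reciprocal-sqrt-estimate
// precision (roughly 12 bits of mantissa) is acceptable.
//
//   XMVECTOR v = XMVectorSet(3.0f, 4.0f, 0.0f, 0.0f);
//   XMVECTOR approx = XMVector2NormalizeEst(v); // about (0.6, 0.8), estimate only
//   XMVECTOR exact  = XMVector2Normalize(v);    // (0.6, 0.8), with zero/infinity guards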
- -inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result = XMVector2ReciprocalLength(V); - Result = XMVectorMultiply(V, Result); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t VL = vget_low_f32(V); - // Dot2 - float32x2_t vTemp = vmul_f32( VL, VL ); - vTemp = vpadd_f32( vTemp, vTemp ); - // Reciprocal sqrt (estimate) - vTemp = vrsqrte_f32( vTemp ); - // Normalize - float32x2_t Result = vmul_f32( VL, vTemp ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); - XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); - return _mm_mul_ps(vResult, V); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_rsqrt_ss(vLengthSq); - vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); - vLengthSq = _mm_mul_ps(vLengthSq, V); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x and y - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has y splatted - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); - // x+y - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vLengthSq = _mm_rsqrt_ss(vLengthSq); - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - vLengthSq = _mm_mul_ps(vLengthSq,V); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2Normalize -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR vResult = XMVector2Length( V ); - float fLength = vResult.vector4_f32[0]; - - // Prevent divide by zero - if (fLength > 0) { - fLength = 1.0f/fLength; - } - - vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; - vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; - vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; - vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; - return vResult; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t VL = vget_low_f32(V); - // Dot2 - float32x2_t vTemp = vmul_f32( VL, VL ); - vTemp = vpadd_f32( vTemp, vTemp ); - uint32x2_t VEqualsZero = vceq_f32( vTemp, vdup_n_f32(0) ); - uint32x2_t VEqualsInf = vceq_f32( vTemp, vget_low_f32(g_XMInfinity) ); - // Reciprocal sqrt (2 iterations of Newton-Raphson) - float32x2_t S0 = vrsqrte_f32( vTemp ); - float32x2_t P0 = vmul_f32( vTemp, S0 ); - float32x2_t R0 = vrsqrts_f32( P0, S0 ); - float32x2_t S1 = vmul_f32( S0, R0 ); - float32x2_t P1 = vmul_f32( vTemp, S1 ); - float32x2_t R1 = vrsqrts_f32( P1, S1 ); - vTemp = vmul_f32( S1, R1 ); - // Normalize - float32x2_t Result = vmul_f32( VL, vTemp ); - Result = vbsl_f32( VEqualsZero, vdup_n_f32(0), Result ); - Result = vbsl_f32( VEqualsInf, vget_low_f32(g_XMQNaN), Result ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x3f ); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Reciprocal mul to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = 
_mm_and_ps(vResult,vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -#elif defined(_XM_SSE3_INTRINSICS_) - // Perform the dot product on x and y only - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_moveldup_ps(vLengthSq); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); - // Reciprocal mul to perform the normalization - vResult = _mm_div_ps(V, vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult, vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); - vResult = _mm_or_ps(vTemp1, vTemp2); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x and y only - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Reciprocal mul to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2ClampLength -( - FXMVECTOR V, - float LengthMin, - float LengthMax -) -{ - XMVECTOR ClampMax = XMVectorReplicate(LengthMax); - XMVECTOR ClampMin = XMVectorReplicate(LengthMin); - return XMVector2ClampLengthV(V, ClampMin, ClampMax); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2ClampLengthV -( - FXMVECTOR V, - FXMVECTOR LengthMin, - FXMVECTOR LengthMax -) -{ - assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin))); - assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax))); - assert(XMVector2GreaterOrEqual(LengthMin, g_XMZero)); - assert(XMVector2GreaterOrEqual(LengthMax, g_XMZero)); - assert(XMVector2GreaterOrEqual(LengthMax, LengthMin)); - - XMVECTOR LengthSq = XMVector2LengthSq(V); - - const XMVECTOR Zero = XMVectorZero(); - - XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); - - XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); - XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); - - XMVECTOR Length = 
XMVectorMultiply(LengthSq, RcpLength); - - XMVECTOR Normal = XMVectorMultiply(V, RcpLength); - - XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); - Length = XMVectorSelect(LengthSq, Length, Select); - Normal = XMVectorSelect(LengthSq, Normal, Select); - - XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); - XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); - - XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); - ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); - - XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); - - // Preserve the original vector (with no precision loss) if the length falls within the given range - XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); - Result = XMVectorSelect(Result, V, Control); - - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2Reflect -( - FXMVECTOR Incident, - FXMVECTOR Normal -) -{ - // Result = Incident - (2 * dot(Incident, Normal)) * Normal - - XMVECTOR Result; - Result = XMVector2Dot(Incident, Normal); - Result = XMVectorAdd(Result, Result); - Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2Refract -( - FXMVECTOR Incident, - FXMVECTOR Normal, - float RefractionIndex -) -{ - XMVECTOR Index = XMVectorReplicate(RefractionIndex); - return XMVector2RefractV(Incident, Normal, Index); -} - -//------------------------------------------------------------------------------ - -// Return the refraction of a 2D vector -inline XMVECTOR XM_CALLCONV XMVector2RefractV -( - FXMVECTOR Incident, - FXMVECTOR Normal, - FXMVECTOR RefractionIndex -) -{ - // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + - // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) - -#if defined(_XM_NO_INTRINSICS_) - - float IDotN = (Incident.vector4_f32[0]*Normal.vector4_f32[0])+(Incident.vector4_f32[1]*Normal.vector4_f32[1]); - // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) - float RY = 1.0f-(IDotN*IDotN); - float RX = 1.0f-(RY*RefractionIndex.vector4_f32[0]*RefractionIndex.vector4_f32[0]); - RY = 1.0f-(RY*RefractionIndex.vector4_f32[1]*RefractionIndex.vector4_f32[1]); - if (RX>=0.0f) { - RX = (RefractionIndex.vector4_f32[0]*Incident.vector4_f32[0])-(Normal.vector4_f32[0]*((RefractionIndex.vector4_f32[0]*IDotN)+sqrtf(RX))); - } else { - RX = 0.0f; - } - if (RY>=0.0f) { - RY = (RefractionIndex.vector4_f32[1]*Incident.vector4_f32[1])-(Normal.vector4_f32[1]*((RefractionIndex.vector4_f32[1]*IDotN)+sqrtf(RY))); - } else { - RY = 0.0f; - } - - XMVECTOR vResult; - vResult.vector4_f32[0] = RX; - vResult.vector4_f32[1] = RY; - vResult.vector4_f32[2] = 0.0f; - vResult.vector4_f32[3] = 0.0f; - return vResult; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t IL = vget_low_f32( Incident ); - float32x2_t NL = vget_low_f32( Normal ); - float32x2_t RIL = vget_low_f32( RefractionIndex ); - // Get the 2D Dot product of Incident-Normal - float32x2_t vTemp = vmul_f32(IL, NL); - float32x2_t IDotN = vpadd_f32( vTemp, vTemp ); - // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) - vTemp = vmls_f32( vget_low_f32( g_XMOne ), IDotN, IDotN); - vTemp = vmul_f32(vTemp,RIL); - vTemp = vmls_f32(vget_low_f32( g_XMOne ), vTemp, RIL ); - // If 
any terms are <=0, sqrt() will fail, punt to zero - uint32x2_t vMask = vcgt_f32(vTemp, vget_low_f32(g_XMZero) ); - // Sqrt(vTemp) - float32x2_t S0 = vrsqrte_f32(vTemp); - float32x2_t P0 = vmul_f32( vTemp, S0 ); - float32x2_t R0 = vrsqrts_f32( P0, S0 ); - float32x2_t S1 = vmul_f32( S0, R0 ); - float32x2_t P1 = vmul_f32( vTemp, S1 ); - float32x2_t R1 = vrsqrts_f32( P1, S1 ); - float32x2_t S2 = vmul_f32( S1, R1 ); - vTemp = vmul_f32( vTemp, S2 ); - // R = RefractionIndex * IDotN + sqrt(R) - vTemp = vmla_f32( vTemp, RIL, IDotN ); - // Result = RefractionIndex * Incident - Normal * R - float32x2_t vResult = vmul_f32(RIL,IL); - vResult = vmls_f32( vResult, vTemp, NL ); - vResult = vand_u32(vResult,vMask); - return vcombine_f32(vResult, vResult); -#elif defined(_XM_SSE_INTRINSICS_) - // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + - // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) - // Get the 2D Dot product of Incident-Normal - XMVECTOR IDotN = XMVector2Dot(Incident, Normal); - // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) - XMVECTOR vTemp = _mm_mul_ps(IDotN,IDotN); - vTemp = _mm_sub_ps(g_XMOne,vTemp); - vTemp = _mm_mul_ps(vTemp,RefractionIndex); - vTemp = _mm_mul_ps(vTemp,RefractionIndex); - vTemp = _mm_sub_ps(g_XMOne,vTemp); - // If any terms are <=0, sqrt() will fail, punt to zero - XMVECTOR vMask = _mm_cmpgt_ps(vTemp,g_XMZero); - // R = RefractionIndex * IDotN + sqrt(R) - vTemp = _mm_sqrt_ps(vTemp); - XMVECTOR vResult = _mm_mul_ps(RefractionIndex,IDotN); - vTemp = _mm_add_ps(vTemp,vResult); - // Result = RefractionIndex * Incident - Normal * R - vResult = _mm_mul_ps(RefractionIndex,Incident); - vTemp = _mm_mul_ps(vTemp,Normal); - vResult = _mm_sub_ps(vResult,vTemp); - vResult = _mm_and_ps(vResult,vMask); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2Orthogonal -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = -V.vector4_f32[1]; - Result.vector4_f32[1] = V.vector4_f32[0]; - Result.vector4_f32[2] = 0.f; - Result.vector4_f32[3] = 0.f; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Negate = { -1.f, 1.f, 0, 0 }; - const float32x2_t zero = vdup_n_f32(0); - - float32x2_t VL = vget_low_f32( V ); - float32x2_t Result = vmul_f32( vrev64_f32( VL ), vget_low_f32( Negate ) ); - return vcombine_f32( Result, zero ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); - vResult = _mm_mul_ps(vResult,g_XMNegateX); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst -( - FXMVECTOR N1, - FXMVECTOR N2 -) -{ - XMVECTOR Result = XMVector2Dot(N1, N2); - Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); - Result = XMVectorACosEst(Result); - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals -( - FXMVECTOR N1, - FXMVECTOR N2 -) -{ - XMVECTOR Result = XMVector2Dot(N1, N2); - Result = XMVectorClamp(Result, g_XMNegativeOne, g_XMOne); - Result = XMVectorACos(Result); - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV 
XMVector2AngleBetweenVectors -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - XMVECTOR L1 = XMVector2ReciprocalLength(V1); - XMVECTOR L2 = XMVector2ReciprocalLength(V2); - - XMVECTOR Dot = XMVector2Dot(V1, V2); - - L1 = XMVectorMultiply(L1, L2); - - XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); - CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); - - return XMVectorACos(CosAngle); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2LinePointDistance -( - FXMVECTOR LinePoint1, - FXMVECTOR LinePoint2, - FXMVECTOR Point -) -{ - // Given a vector PointVector from LinePoint1 to Point and a vector - // LineVector from LinePoint1 to LinePoint2, the scaled distance - // PointProjectionScale from LinePoint1 to the perpendicular projection - // of PointVector onto the line is defined as: - // - // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) - - XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1); - XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1); - - XMVECTOR LengthSq = XMVector2LengthSq(LineVector); - - XMVECTOR PointProjectionScale = XMVector2Dot(PointVector, LineVector); - PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq); - - XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); - DistanceVector = XMVectorSubtract(PointVector, DistanceVector); - - return XMVector2Length(DistanceVector); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector2IntersectLine -( - FXMVECTOR Line1Point1, - FXMVECTOR Line1Point2, - FXMVECTOR Line2Point1, - GXMVECTOR Line2Point2 -) -{ -#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) - - XMVECTOR V1 = XMVectorSubtract(Line1Point2, Line1Point1); - XMVECTOR V2 = XMVectorSubtract(Line2Point2, Line2Point1); - XMVECTOR V3 = XMVectorSubtract(Line1Point1, Line2Point1); - - XMVECTOR C1 = XMVector2Cross(V1, V2); - XMVECTOR C2 = XMVector2Cross(V2, V3); - - XMVECTOR Result; - const XMVECTOR Zero = XMVectorZero(); - if (XMVector2NearEqual(C1, Zero, g_XMEpsilon.v)) - { - if (XMVector2NearEqual(C2, Zero, g_XMEpsilon.v)) - { - // Coincident - Result = g_XMInfinity.v; - } - else - { - // Parallel - Result = g_XMQNaN.v; - } - } - else - { - // Intersection point = Line1Point1 + V1 * (C2 / C1) - XMVECTOR Scale = XMVectorReciprocal(C1); - Scale = XMVectorMultiply(C2, Scale); - Result = XMVectorMultiplyAdd(V1, Scale, Line1Point1); - } - - return Result; - -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR V1 = _mm_sub_ps(Line1Point2, Line1Point1); - XMVECTOR V2 = _mm_sub_ps(Line2Point2, Line2Point1); - XMVECTOR V3 = _mm_sub_ps(Line1Point1, Line2Point1); - // Generate the cross products - XMVECTOR C1 = XMVector2Cross(V1, V2); - XMVECTOR C2 = XMVector2Cross(V2, V3); - // If C1 is not close to epsilon, use the calculated value - XMVECTOR vResultMask = _mm_setzero_ps(); - vResultMask = _mm_sub_ps(vResultMask,C1); - vResultMask = _mm_max_ps(vResultMask,C1); - // 0xFFFFFFFF if the calculated value is to be used - vResultMask = _mm_cmpgt_ps(vResultMask,g_XMEpsilon); - // If C1 is close to epsilon, which fail type is it? INFINITY or NAN? 
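// Branch-free form of the scalar logic above: abs(C2) is built from
// max(-C2, C2), and abs(C2) <= epsilon selects g_XMInfinity (coincident
// lines) while the complement selects g_XMQNaN (parallel lines).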
-    XMVECTOR vFailMask = _mm_setzero_ps();
-    vFailMask = _mm_sub_ps(vFailMask,C2);
-    vFailMask = _mm_max_ps(vFailMask,C2);
-    vFailMask = _mm_cmple_ps(vFailMask,g_XMEpsilon);
-    XMVECTOR vFail = _mm_and_ps(vFailMask,g_XMInfinity);
-    vFailMask = _mm_andnot_ps(vFailMask,g_XMQNaN);
-    // vFail is NAN or INF
-    vFail = _mm_or_ps(vFail,vFailMask);
-    // Intersection point = Line1Point1 + V1 * (C2 / C1)
-    XMVECTOR vResult = _mm_div_ps(C2,C1);
-    vResult = _mm_mul_ps(vResult,V1);
-    vResult = _mm_add_ps(vResult,Line1Point1);
-    // Use result, or failure value
-    vResult = _mm_and_ps(vResult,vResultMask);
-    vResultMask = _mm_andnot_ps(vResultMask,vFail);
-    vResult = _mm_or_ps(vResult,vResultMask);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2Transform
-(
-    FXMVECTOR V,
-    FXMMATRIX M
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Y = XMVectorSplatY(V);
-    XMVECTOR X = XMVectorSplatX(V);
-
-    XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]);
-    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32( V );
-    float32x4_t Result = vmlaq_lane_f32( M.r[3], M.r[1], VL, 1 ); // Y
-    return vmlaq_lane_f32( Result, M.r[0], VL, 0 ); // X
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-    vResult = _mm_mul_ps(vResult,M.r[0]);
-    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-    vTemp = _mm_mul_ps(vTemp,M.r[1]);
-    vResult = _mm_add_ps(vResult,vTemp);
-    vResult = _mm_add_ps(vResult,M.r[3]);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-_Use_decl_annotations_
-inline XMFLOAT4* XM_CALLCONV XMVector2TransformStream
-(
-    XMFLOAT4* pOutputStream,
-    size_t OutputStride,
-    const XMFLOAT2* pInputStream,
-    size_t InputStride,
-    size_t VectorCount,
-    FXMMATRIX M
-)
-{
-    assert(pOutputStream != nullptr);
-    assert(pInputStream != nullptr);
-
-    assert(InputStride >= sizeof(XMFLOAT2));
-    _Analysis_assume_(InputStride >= sizeof(XMFLOAT2));
-
-    assert(OutputStride >= sizeof(XMFLOAT4));
-    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row3 = M.r[3];
-
-    for (size_t i = 0; i < VectorCount; i++)
-    {
-        XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector);
-        XMVECTOR Y = XMVectorSplatY(V);
-        XMVECTOR X = XMVectorSplatX(V);
-
-        XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3);
-        Result = XMVectorMultiplyAdd(X, row0, Result);
-
-        #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" )
-        XMStoreFloat4((XMFLOAT4*)pOutputVector, Result);
-
-        pInputVector += InputStride;
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row3 = M.r[3];
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if ( four > 0 )
-    {
-        if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT4)))
-        {
-            for (size_t j = 0; j < four; ++j)
-            {
-                float32x4x2_t V = vld2q_f32( reinterpret_cast<const float*>(pInputVector) );
-                pInputVector += sizeof(XMFLOAT2)*4;
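// vld2q_f32 de-interleaves four packed XMFLOAT2s in a single load:
// V.val[0] holds { x0, x1, x2, x3 } and V.val[1] holds { y0, y1, y2, y3 },
// so each vmlaq_lane_f32 below advances four vectors at once.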
-
-                float32x2_t r3 = vget_low_f32( row3 );
-                float32x2_t r = vget_low_f32( row0 );
-                XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M
-                XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N
-
-                __prefetch( pInputVector );
-
-                r3 = vget_high_f32( row3 );
-                r = vget_high_f32( row0 );
-                XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O
-                XMVECTOR vResult3 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P
-
-                __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
-
-                r = vget_low_f32( row1 );
-                vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M
-                vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
-
-                r = vget_high_f32( row1 );
-                vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O
-                vResult3 = vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy+P
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
-
-                float32x4x4_t R;
-                R.val[0] = vResult0;
-                R.val[1] = vResult1;
-                R.val[2] = vResult2;
-                R.val[3] = vResult3;
-
-                vst4q_f32( reinterpret_cast<float*>(pOutputVector), R );
-                pOutputVector += sizeof(XMFLOAT4)*4;
-
-                i += 4;
-            }
-        }
-    }
-
-    for (; i < VectorCount; i++)
-    {
-        float32x2_t V = vld1_f32( reinterpret_cast<const float*>(pInputVector) );
-        pInputVector += InputStride;
-
-        XMVECTOR vResult = vmlaq_lane_f32( row3, row0, V, 0 ); // X
-        vResult = vmlaq_lane_f32( vResult, row1, V, 1 ); // Y
-
-        vst1q_f32( reinterpret_cast<float*>(pOutputVector), vResult );
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-#elif defined(_XM_SSE_INTRINSICS_)
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row3 = M.r[3];
-
-    size_t i = 0;
-    size_t two = VectorCount >> 1;
-    if ( two > 0 )
-    {
-        if ( InputStride == sizeof(XMFLOAT2) )
-        {
-            if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) )
-            {
-                // Packed input, aligned output
-                for (size_t j = 0; j < two; ++j)
-                {
-                    XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
-                    pInputVector += sizeof(XMFLOAT2)*2;
-
-                    XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-                    XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-
-                    XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-                    XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-                    vTemp = _mm_add_ps( vTemp, row3 );
-                    vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                    XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTemp );
-                    pOutputVector += OutputStride;
-
-                    Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
-                    X = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-
-                    vTemp = _mm_mul_ps( Y, row1 );
-                    vTemp2 = _mm_mul_ps( X, row0 );
-                    vTemp = _mm_add_ps( vTemp, row3 );
-                    vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                    XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTemp );
-                    pOutputVector += OutputStride;
-
-                    i += 2;
-                }
-            }
-            else
-            {
-                // Packed input, unaligned output
-                for (size_t j = 0; j < two; ++j)
-                {
-                    XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
-                    pInputVector += sizeof(XMFLOAT2)*2;
-
-                    XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-                    XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-
-                    XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-                    XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-                    vTemp = _mm_add_ps( vTemp, row3 );
-                    vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                    _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
-                    pOutputVector += OutputStride;
-
-                    Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
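// Lanes 2 and 3 of the 16-byte load hold the second packed XMFLOAT2:
// its y was splatted from lane 3 above and its x is splatted from lane 2 below.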
-                    X = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-
-                    vTemp = _mm_mul_ps( Y, row1 );
-                    vTemp2 = _mm_mul_ps( X, row0 );
-                    vTemp = _mm_add_ps( vTemp, row3 );
-                    vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                    _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
-                    pOutputVector += OutputStride;
-
-                    i += 2;
-                }
-            }
-        }
-    }
-
-    if ( !((uintptr_t)pInputVector & 0xF) && !(InputStride & 0xF) )
-    {
-        if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) )
-        {
-            // Aligned input, aligned output
-            for (; i < VectorCount; i++)
-            {
-                XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pInputVector) ) );
-                pInputVector += InputStride;
-
-                XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-                XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-
-                XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-                XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-                vTemp = _mm_add_ps( vTemp, row3 );
-                vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTemp );
-                pOutputVector += OutputStride;
-            }
-        }
-        else
-        {
-            // Aligned input, unaligned output
-            for (; i < VectorCount; i++)
-            {
-                XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pInputVector) ) );
-                pInputVector += InputStride;
-
-                XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-                XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-
-                XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-                XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-                vTemp = _mm_add_ps( vTemp, row3 );
-                vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
-                pOutputVector += OutputStride;
-            }
-        }
-    }
-    else
-    {
-        // Unaligned input
-        for (; i < VectorCount; i++)
-        {
-            __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pInputVector) );
-            __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pInputVector+4) );
-            pInputVector += InputStride;
-
-            XMVECTOR Y = XM_PERMUTE_PS(y,_MM_SHUFFLE(0,0,0,0));
-            XMVECTOR X = XM_PERMUTE_PS(x,_MM_SHUFFLE(0,0,0,0));
-
-            XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-            XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-            vTemp = _mm_add_ps( vTemp, row3 );
-            vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-            _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
-            pOutputVector += OutputStride;
-        }
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2TransformCoord
-(
-    FXMVECTOR V,
-    FXMMATRIX M
-)
-{
-    XMVECTOR Y = XMVectorSplatY(V);
-    XMVECTOR X = XMVectorSplatX(V);
-
-    XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]);
-    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
-
-    XMVECTOR W = XMVectorSplatW(Result);
-    return XMVectorDivide( Result, W );
-}
-
-//------------------------------------------------------------------------------
-
-_Use_decl_annotations_
-inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
-(
-    XMFLOAT2* pOutputStream,
-    size_t OutputStride,
-    const XMFLOAT2* pInputStream,
-    size_t InputStride,
-    size_t VectorCount,
-    FXMMATRIX M
-)
-{
-    assert(pOutputStream != nullptr);
-    assert(pInputStream != nullptr);
-
-    assert(InputStride >= sizeof(XMFLOAT2));
-    _Analysis_assume_(InputStride >= sizeof(XMFLOAT2));
-
-    assert(OutputStride >= sizeof(XMFLOAT2));
-    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row3 = M.r[3];
-
-    for (size_t i = 0; i <
VectorCount; i++)
-    {
-        XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector);
-        XMVECTOR Y = XMVectorSplatY(V);
-        XMVECTOR X = XMVectorSplatX(V);
-
-        XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3);
-        Result = XMVectorMultiplyAdd(X, row0, Result);
-
-        XMVECTOR W = XMVectorSplatW(Result);
-
-        Result = XMVectorDivide(Result, W);
-
-        #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" )
-        XMStoreFloat2((XMFLOAT2*)pOutputVector, Result);
-
-        pInputVector += InputStride;
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row3 = M.r[3];
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if ( four > 0 )
-    {
-        if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT2)))
-        {
-            for (size_t j = 0; j < four; ++j)
-            {
-                float32x4x2_t V = vld2q_f32( reinterpret_cast<const float*>(pInputVector) );
-                pInputVector += sizeof(XMFLOAT2)*4;
-
-                float32x2_t r3 = vget_low_f32( row3 );
-                float32x2_t r = vget_low_f32( row0 );
-                XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M
-                XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N
-
-                __prefetch( pInputVector );
-
-                r3 = vget_high_f32( row3 );
-                r = vget_high_f32( row0 );
-                XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P
-
-                __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
-
-                r = vget_low_f32( row1 );
-                vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M
-                vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
-
-                r = vget_high_f32( row1 );
-                W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
-
-                // 2 iterations of Newton-Raphson refinement of reciprocal
-                float32x4_t Reciprocal = vrecpeq_f32(W);
-                float32x4_t S = vrecpsq_f32( Reciprocal, W );
-                Reciprocal = vmulq_f32( S, Reciprocal );
-                S = vrecpsq_f32( Reciprocal, W );
-                Reciprocal = vmulq_f32( S, Reciprocal );
-
-                V.val[0] = vmulq_f32( vResult0, Reciprocal );
-                V.val[1] = vmulq_f32( vResult1, Reciprocal );
-
-                vst2q_f32( reinterpret_cast<float*>(pOutputVector),V );
-                pOutputVector += sizeof(XMFLOAT2)*4;
-
-                i += 4;
-            }
-        }
-    }
-
-    for (; i < VectorCount; i++)
-    {
-        float32x2_t V = vld1_f32( reinterpret_cast<const float*>(pInputVector) );
-        pInputVector += InputStride;
-
-        XMVECTOR vResult = vmlaq_lane_f32( row3, row0, V, 0 ); // X
-        vResult = vmlaq_lane_f32( vResult, row1, V, 1 ); // Y
-
-        V = vget_high_f32( vResult );
-        float32x2_t W = vdup_lane_f32( V, 1 );
-
-        // 2 iterations of Newton-Raphson refinement of reciprocal for W
-        float32x2_t Reciprocal = vrecpe_f32( W );
-        float32x2_t S = vrecps_f32( Reciprocal, W );
-        Reciprocal = vmul_f32( S, Reciprocal );
-        S = vrecps_f32( Reciprocal, W );
-        Reciprocal = vmul_f32( S, Reciprocal );
-
-        V = vget_low_f32( vResult );
-        V = vmul_f32( V, Reciprocal );
-
-        vst1_f32( reinterpret_cast<float*>(pOutputVector), V );
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-#elif defined(_XM_SSE_INTRINSICS_)
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row3 = M.r[3];
-
-    size_t i = 0;
-    size_t two = VectorCount >> 1;
-    if ( two > 0 )
-    {
-        if
( InputStride == sizeof(XMFLOAT2) )
-        {
-            if ( OutputStride == sizeof(XMFLOAT2) )
-            {
-                if ( !((uintptr_t)pOutputStream & 0xF) )
-                {
-                    // Packed input, aligned & packed output
-                    for (size_t j = 0; j < two; ++j)
-                    {
-                        XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
-                        pInputVector += sizeof(XMFLOAT2)*2;
-
-                        // Result 1
-                        XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
-                        XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
-
-                        XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-                        XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-                        vTemp = _mm_add_ps( vTemp, row3 );
-                        vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                        XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-
-                        XMVECTOR V1 = _mm_div_ps( vTemp, W );
-
-                        // Result 2
-                        Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
-                        X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
-
-                        vTemp = _mm_mul_ps( Y, row1 );
-                        vTemp2 = _mm_mul_ps( X, row0 );
-                        vTemp = _mm_add_ps( vTemp, row3 );
-                        vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                        W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-
-                        XMVECTOR V2 = _mm_div_ps( vTemp, W );
-
-                        vTemp = _mm_movelh_ps( V1, V2 );
-
-                        XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTemp );
-                        pOutputVector += sizeof(XMFLOAT2)*2;
-
-                        i += 2;
-                    }
-                }
-                else
-                {
-                    // Packed input, unaligned & packed output
-                    for (size_t j = 0; j < two; ++j)
-                    {
-                        XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
-                        pInputVector += sizeof(XMFLOAT2)*2;
-
-                        // Result 1
-                        XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
-                        XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
-
-                        XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-                        XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-                        vTemp = _mm_add_ps( vTemp, row3 );
-                        vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                        XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-
-                        XMVECTOR V1 = _mm_div_ps( vTemp, W );
-
-                        // Result 2
-                        Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
-                        X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
-
-                        vTemp = _mm_mul_ps( Y, row1 );
-                        vTemp2 = _mm_mul_ps( X, row0 );
-                        vTemp = _mm_add_ps( vTemp, row3 );
-                        vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                        W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-
-                        XMVECTOR V2 = _mm_div_ps( vTemp, W );
-
-                        vTemp = _mm_movelh_ps( V1, V2 );
-
-                        _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
-                        pOutputVector += sizeof(XMFLOAT2)*2;
-
-                        i += 2;
-                    }
-                }
-            }
-            else
-            {
-                // Packed input, unpacked output
-                for (size_t j = 0; j < two; ++j)
-                {
-                    XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
-                    pInputVector += sizeof(XMFLOAT2)*2;
-
-                    // Result 1
-                    XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
-                    XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
-
-                    XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-                    XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-                    vTemp = _mm_add_ps( vTemp, row3 );
-                    vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                    XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-
-                    vTemp = _mm_div_ps( vTemp, W );
-                    vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
-
-                    _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-                    _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
-                    pOutputVector += OutputStride;
-
-                    // Result 2
-                    Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
-                    X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
-
-                    vTemp = _mm_mul_ps( Y, row1 );
-                    vTemp2 = _mm_mul_ps( X, row0 );
-                    vTemp = _mm_add_ps( vTemp, row3 );
-                    vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-                    W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-
-                    vTemp = _mm_div_ps( vTemp, W );
-                    vTemp2 = XM_PERMUTE_PS(
vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
-
-                    _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-                    _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
-                    pOutputVector += OutputStride;
-
-                    i += 2;
-                }
-            }
-        }
-    }
-
-    if ( !((uintptr_t)pInputVector & 0xF) && !(InputStride & 0xF) )
-    {
-        // Aligned input
-        for (; i < VectorCount; i++)
-        {
-            XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pInputVector) ) );
-            pInputVector += InputStride;
-
-            XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
-            XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
-
-            XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-            XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-            vTemp = _mm_add_ps( vTemp, row3 );
-            vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-            XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-
-            vTemp = _mm_div_ps( vTemp, W );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
-
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
-            pOutputVector += OutputStride;
-        }
-    }
-    else
-    {
-        // Unaligned input
-        for (; i < VectorCount; i++)
-        {
-            __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pInputVector) );
-            __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pInputVector+4) );
-            pInputVector += InputStride;
-
-            XMVECTOR Y = XM_PERMUTE_PS( y, _MM_SHUFFLE(0, 0, 0, 0) );
-            XMVECTOR X = XM_PERMUTE_PS( x, _MM_SHUFFLE(0, 0, 0, 0) );
-
-            XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-            XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-            vTemp = _mm_add_ps( vTemp, row3 );
-            vTemp = _mm_add_ps( vTemp, vTemp2 );
-
-            XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-
-            vTemp = _mm_div_ps( vTemp, W );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
-
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
-            pOutputVector += OutputStride;
-        }
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2TransformNormal
-(
-    FXMVECTOR V,
-    FXMMATRIX M
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Y = XMVectorSplatY(V);
-    XMVECTOR X = XMVectorSplatX(V);
-
-    XMVECTOR Result = XMVectorMultiply(Y, M.r[1]);
-    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32( V );
-    float32x4_t Result = vmulq_lane_f32( M.r[1], VL, 1 ); // Y
-    return vmlaq_lane_f32( Result, M.r[0], VL, 0 ); // X
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-    vResult = _mm_mul_ps(vResult,M.r[0]);
-    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-    vTemp = _mm_mul_ps(vTemp,M.r[1]);
-    vResult = _mm_add_ps(vResult,vTemp);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-_Use_decl_annotations_
-inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
-(
-    XMFLOAT2* pOutputStream,
-    size_t OutputStride,
-    const XMFLOAT2* pInputStream,
-    size_t InputStride,
-    size_t VectorCount,
-    FXMMATRIX M
-)
-{
-    assert(pOutputStream != nullptr);
-    assert(pInputStream != nullptr);
-
-    assert(InputStride >= sizeof(XMFLOAT2));
-    _Analysis_assume_(InputStride >= sizeof(XMFLOAT2));
-
-    assert(OutputStride >= sizeof(XMFLOAT2));
-    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t*
pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-
-    for (size_t i = 0; i < VectorCount; i++)
-    {
-        XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector);
-        XMVECTOR Y = XMVectorSplatY(V);
-        XMVECTOR X = XMVectorSplatX(V);
-
-        XMVECTOR Result = XMVectorMultiply(Y, row1);
-        Result = XMVectorMultiplyAdd(X, row0, Result);
-
-        #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" )
-        XMStoreFloat2((XMFLOAT2*)pOutputVector, Result);
-
-        pInputVector += InputStride;
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if ( four > 0 )
-    {
-        if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT2)))
-        {
-            for (size_t j = 0; j < four; ++j)
-            {
-                float32x4x2_t V = vld2q_f32( reinterpret_cast<const float*>(pInputVector) );
-                pInputVector += sizeof(XMFLOAT2)*4;
-
-                float32x2_t r = vget_low_f32( row0 );
-                XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax
-                XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx
-
-                __prefetch( pInputVector );
-                __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
-
-                r = vget_low_f32( row1 );
-                vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey
-                vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
-
-                V.val[0] = vResult0;
-                V.val[1] = vResult1;
-
-                vst2q_f32( reinterpret_cast<float*>(pOutputVector), V );
-                pOutputVector += sizeof(XMFLOAT2)*4;
-
-                i += 4;
-            }
-        }
-    }
-
-    for (; i < VectorCount; i++)
-    {
-        float32x2_t V = vld1_f32( reinterpret_cast<const float*>(pInputVector) );
-        pInputVector += InputStride;
-
-        XMVECTOR vResult = vmulq_lane_f32( row0, V, 0 ); // X
-        vResult = vmlaq_lane_f32( vResult, row1, V, 1 ); // Y
-
-        V = vget_low_f32( vResult );
-        vst1_f32( reinterpret_cast<float*>(pOutputVector), V );
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-#elif defined(_XM_SSE_INTRINSICS_)
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-
-    size_t i = 0;
-    size_t two = VectorCount >> 1;
-    if ( two > 0 )
-    {
-        if ( InputStride == sizeof(XMFLOAT2) )
-        {
-            if ( OutputStride == sizeof(XMFLOAT2) )
-            {
-                if ( !((uintptr_t)pOutputStream & 0xF) )
-                {
-                    // Packed input, aligned & packed output
-                    for (size_t j = 0; j < two; ++j)
-                    {
-                        XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
-                        pInputVector += sizeof(XMFLOAT2)*2;
-
-                        // Result 1
-                        XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
-                        XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
-
-                        XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-                        XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-                        XMVECTOR V1 = _mm_add_ps( vTemp, vTemp2 );
-
-                        // Result 2
-                        Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
-                        X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
-
-                        vTemp = _mm_mul_ps( Y, row1 );
-                        vTemp2 = _mm_mul_ps( X, row0 );
-                        XMVECTOR V2 = _mm_add_ps( vTemp, vTemp2 );
-
-                        vTemp = _mm_movelh_ps( V1, V2 );
-
-                        XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTemp );
-                        pOutputVector += sizeof(XMFLOAT2)*2;
-
-                        i += 2;
-                    }
-                }
-                else
-                {
-                    // Packed input, unaligned & packed output
-                    for (size_t j = 0; j < two; ++j)
-                    {
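// Each 16-byte unaligned load below carries two packed XMFLOAT2s as
// { x0, y0, x1, y1 }: lanes 0-1 feed the first transformed normal, lanes 2-3
// the second, and _mm_movelh_ps repacks the two results into one store.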
-                        XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
-                        pInputVector += sizeof(XMFLOAT2)*2;
-
-                        // Result 1
-                        XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
-                        XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
-
-                        XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-                        XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-                        XMVECTOR V1 = _mm_add_ps( vTemp, vTemp2 );
-
-                        // Result 2
-                        Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
-                        X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
-
-                        vTemp = _mm_mul_ps( Y, row1 );
-                        vTemp2 = _mm_mul_ps( X, row0 );
-                        XMVECTOR V2 = _mm_add_ps( vTemp, vTemp2 );
-
-                        vTemp = _mm_movelh_ps( V1, V2 );
-
-                        _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
-                        pOutputVector += sizeof(XMFLOAT2)*2;
-
-                        i += 2;
-                    }
-                }
-            }
-            else
-            {
-                // Packed input, unpacked output
-                for (size_t j = 0; j < two; ++j)
-                {
-                    XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
-                    pInputVector += sizeof(XMFLOAT2)*2;
-
-                    // Result 1
-                    XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
-                    XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
-
-                    XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-                    XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-                    vTemp = _mm_add_ps( vTemp, vTemp2 );
-                    vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
-
-                    _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-                    _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
-                    pOutputVector += OutputStride;
-
-                    // Result 2
-                    Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
-                    X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
-
-                    vTemp = _mm_mul_ps( Y, row1 );
-                    vTemp2 = _mm_mul_ps( X, row0 );
-                    vTemp = _mm_add_ps( vTemp, vTemp2 );
-                    vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
-
-                    _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-                    _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
-                    pOutputVector += OutputStride;
-
-                    i += 2;
-                }
-            }
-        }
-    }
-
-    if ( !((uintptr_t)pInputVector & 0xF) && !(InputStride & 0xF) )
-    {
-        // Aligned input
-        for (; i < VectorCount; i++)
-        {
-            XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pInputVector) ) );
-            pInputVector += InputStride;
-
-            XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
-            XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
-
-            XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-            XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-            vTemp = _mm_add_ps( vTemp, vTemp2 );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
-
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
-            pOutputVector += OutputStride;
-        }
-    }
-    else
-    {
-        // Unaligned input
-        for (; i < VectorCount; i++)
-        {
-            __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pInputVector) );
-            __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pInputVector+4) );
-            pInputVector += InputStride;
-
-            XMVECTOR Y = XM_PERMUTE_PS( y, _MM_SHUFFLE(0, 0, 0, 0) );
-            XMVECTOR X = XM_PERMUTE_PS( x, _MM_SHUFFLE(0, 0, 0, 0) );
-
-            XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
-            XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
-            vTemp = _mm_add_ps( vTemp, vTemp2 );
-            vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
-
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
-            _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
-            pOutputVector += OutputStride;
-        }
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#endif
-}
-
-/****************************************************************************
- *
- * 3D Vector
- *
- ****************************************************************************/
-
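// The ARM-NEON comparison paths below have no _mm_movemask_ps equivalent, so
// they gather per-lane results by interleaving: two rounds of vzip_u8/vzip_u16
// leave one byte from each source lane in a single 32-bit value whose low
// three bytes come from the x, y and z lanes, with w in the top byte, hence
// the (r & 0xFFFFFFU) tests. Illustrative use with hypothetical values (not
// part of the library):
//
//   XMVECTOR a = XMVectorSet(1.f, 2.f, 3.f, 0.f);
//   XMVECTOR b = XMVectorSet(1.f, 2.f, 3.f, 9.f);
//   bool eq = XMVector3Equal(a, b); // true: the 0xFFFFFF mask ignores w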
-//------------------------------------------------------------------------------ -// Comparison operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3Equal -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); - return (((_mm_movemask_ps(vTemp)&7)==7) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline uint32_t XM_CALLCONV XMVector3EqualR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - uint32_t CR = 0; - if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && - (V1.vector4_f32[1] == V2.vector4_f32[1]) && - (V1.vector4_f32[2] == V2.vector4_f32[2])) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && - (V1.vector4_f32[1] != V2.vector4_f32[1]) && - (V1.vector4_f32[2] != V2.vector4_f32[2])) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; - - uint32_t CR = 0; - if ( r == 0xFFFFFFU ) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); - int iTest = _mm_movemask_ps(vTemp)&7; - uint32_t CR = 0; - if (iTest==7) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3EqualInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_u32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); - return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&7)==7) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline uint32_t XM_CALLCONV XMVector3EqualIntR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - uint32_t CR = 0; - if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && - (V1.vector4_u32[1] == V2.vector4_u32[1]) && - (V1.vector4_u32[2] == V2.vector4_u32[2])) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && - (V1.vector4_u32[1] != V2.vector4_u32[1]) && - 
(V1.vector4_u32[2] != V2.vector4_u32[2])) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_u32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; - - uint32_t CR = 0; - if ( r == 0xFFFFFFU ) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_SSE_INTRINSICS_) - __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); - int iTemp = _mm_movemask_ps(_mm_castsi128_ps(vTemp))&7; - uint32_t CR = 0; - if (iTemp==7) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTemp) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3NearEqual -( - FXMVECTOR V1, - FXMVECTOR V2, - FXMVECTOR Epsilon -) -{ -#if defined(_XM_NO_INTRINSICS_) - float dx, dy, dz; - - dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]); - dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]); - dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]); - return (((dx <= Epsilon.vector4_f32[0]) && - (dy <= Epsilon.vector4_f32[1]) && - (dz <= Epsilon.vector4_f32[2])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t vDelta = vsubq_f32( V1, V2 ); - uint32x4_t vResult = vacleq_f32( vDelta, Epsilon ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - // Get the difference - XMVECTOR vDelta = _mm_sub_ps(V1,V2); - // Get the absolute value of the difference - XMVECTOR vTemp = _mm_setzero_ps(); - vTemp = _mm_sub_ps(vTemp,vDelta); - vTemp = _mm_max_ps(vTemp,vDelta); - vTemp = _mm_cmple_ps(vTemp,Epsilon); - // w is don't care - return (((_mm_movemask_ps(vTemp)&7)==0x7) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3NotEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); - return (((_mm_movemask_ps(vTemp)&7)!=7) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3NotEqualInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_u32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i vTemp = 
_mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); - return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&7)!=7) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3Greater -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcgtq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); - return (((_mm_movemask_ps(vTemp)&7)==7) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline uint32_t XM_CALLCONV XMVector3GreaterR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - uint32_t CR = 0; - if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && - (V1.vector4_f32[1] > V2.vector4_f32[1]) && - (V1.vector4_f32[2] > V2.vector4_f32[2])) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && - (V1.vector4_f32[1] <= V2.vector4_f32[1]) && - (V1.vector4_f32[2] <= V2.vector4_f32[2])) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcgtq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; - - uint32_t CR = 0; - if ( r == 0xFFFFFFU ) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); - uint32_t CR = 0; - int iTest = _mm_movemask_ps(vTemp)&7; - if (iTest==7) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3GreaterOrEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcgeq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); - return (((_mm_movemask_ps(vTemp)&7)==7) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline uint32_t XM_CALLCONV XMVector3GreaterOrEqualR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - uint32_t CR = 0; - if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && - (V1.vector4_f32[1] >= V2.vector4_f32[1]) && - (V1.vector4_f32[2] >= V2.vector4_f32[2])) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && - (V1.vector4_f32[1] < V2.vector4_f32[1]) && - (V1.vector4_f32[2] < V2.vector4_f32[2])) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; - -#elif 
defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcgeq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; - - uint32_t CR = 0; - if ( r == 0xFFFFFFU ) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); - uint32_t CR = 0; - int iTest = _mm_movemask_ps(vTemp)&7; - if (iTest==7) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3Less -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcltq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); - return (((_mm_movemask_ps(vTemp)&7)==7) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3LessOrEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcleq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmple_ps(V1,V2); - return (((_mm_movemask_ps(vTemp)&7)==7) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3InBounds -( - FXMVECTOR V, - FXMVECTOR Bounds -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && - (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && - (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Test if less than or equal - uint32x4_t ivTemp1 = vcleq_f32(V,Bounds); - // Negate the bounds - float32x4_t vTemp2 = vnegq_f32(Bounds); - // Test if greater or equal (Reversed) - uint32x4_t ivTemp2 = vcleq_f32(vTemp2,V); - // Blend answers - ivTemp1 = vandq_u32(ivTemp1,ivTemp2); - // in bounds? 
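// Editorial note on the recurring NEON reduction below: the vzip_u8/vzip_u16
// pair interleaves the compare mask so that one byte from each of the x, y
// and z lanes lands in a single 32-bit word; vget_lane_u32(...) & 0xFFFFFF
// then plays the role that _mm_movemask_ps(...) & 7 plays on SSE.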
- int8x8x2_t vTemp = vzip_u8(vget_low_u8(ivTemp1), vget_high_u8(ivTemp1)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - // Test if less than or equal - XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); - // Negate the bounds - XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); - // Test if greater or equal (Reversed) - vTemp2 = _mm_cmple_ps(vTemp2,V); - // Blend answers - vTemp1 = _mm_and_ps(vTemp1,vTemp2); - // x,y and z in bounds? (w is don't care) - return (((_mm_movemask_ps(vTemp1)&0x7)==0x7) != 0); -#else - return XMComparisonAllInBounds(XMVector3InBoundsR(V, Bounds)); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3IsNaN -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - return (XMISNAN(V.vector4_f32[0]) || - XMISNAN(V.vector4_f32[1]) || - XMISNAN(V.vector4_f32[2])); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Test against itself. NaN is always not equal - uint32x4_t vTempNan = vceqq_f32( V, V ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempNan), vget_high_u8(vTempNan)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - // If x or y or z are NaN, the mask is zero - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - // Test against itself. NaN is always not equal - XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); - // If x or y or z are NaN, the mask is non-zero - return ((_mm_movemask_ps(vTempNan)&7) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector3IsInfinite -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (XMISINF(V.vector4_f32[0]) || - XMISINF(V.vector4_f32[1]) || - XMISINF(V.vector4_f32[2])); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Mask off the sign bit - uint32x4_t vTempInf = vandq_u32( V, g_XMAbsMask ); - // Compare to infinity - vTempInf = vceqq_f32(vTempInf, g_XMInfinity ); - // If any are infinity, the signs are true. - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempInf), vget_high_u8(vTempInf)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - // Mask off the sign bit - __m128 vTemp = _mm_and_ps(V,g_XMAbsMask); - // Compare to infinity - vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); - // If x,y or z are infinity, the signs are true. 
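// Editorial note: clearing the sign bit with g_XMAbsMask first makes +INF
// and -INF both compare equal to g_XMInfinity, so one equality test covers
// either sign.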
- return ((_mm_movemask_ps(vTemp)&7) != 0); -#endif -} - -//------------------------------------------------------------------------------ -// Computation operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3Dot -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - float fValue = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2]; - XMVECTOR vResult; - vResult.vector4_f32[0] = - vResult.vector4_f32[1] = - vResult.vector4_f32[2] = - vResult.vector4_f32[3] = fValue; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t vTemp = vmulq_f32( V1, V2 ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vpadd_f32( v1, v1 ); - v2 = vdup_lane_f32( v2, 0 ); - v1 = vadd_f32( v1, v2 ); - return vcombine_f32( v1, v1 ); -#elif defined(_XM_SSE4_INTRINSICS_) - return _mm_dp_ps( V1, V2, 0x7f ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vTemp = _mm_mul_ps(V1,V2); - vTemp = _mm_and_ps(vTemp, g_XMMask3); - vTemp = _mm_hadd_ps(vTemp,vTemp); - return _mm_hadd_ps(vTemp,vTemp); -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product - XMVECTOR vDot = _mm_mul_ps(V1,V2); - // x=Dot.vector4_f32[1], y=Dot.vector4_f32[2] - XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); - // Result.vector4_f32[0] = x+y - vDot = _mm_add_ss(vDot,vTemp); - // x=Dot.vector4_f32[2] - vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); - // Result.vector4_f32[0] = (x+y)+z - vDot = _mm_add_ss(vDot,vTemp); - // Splat x - return XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3Cross -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - // [ V1.y*V2.z - V1.z*V2.y, V1.z*V2.x - V1.x*V2.z, V1.x*V2.y - V1.y*V2.x ] - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR vResult = { - (V1.vector4_f32[1] * V2.vector4_f32[2]) - (V1.vector4_f32[2] * V2.vector4_f32[1]), - (V1.vector4_f32[2] * V2.vector4_f32[0]) - (V1.vector4_f32[0] * V2.vector4_f32[2]), - (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]), - 0.0f - }; - return vResult; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t v1xy = vget_low_f32(V1); - float32x2_t v2xy = vget_low_f32(V2); - - float32x2_t v1yx = vrev64_f32( v1xy ); - float32x2_t v2yx = vrev64_f32( v2xy ); - - float32x2_t v1zz = vdup_lane_f32( vget_high_f32(V1), 0 ); - float32x2_t v2zz = vdup_lane_f32( vget_high_f32(V2), 0 ); - - XMVECTOR vResult = vmulq_f32( vcombine_f32(v1yx,v1xy), vcombine_f32(v2zz,v2yx) ); - vResult = vmlsq_f32( vResult, vcombine_f32(v1zz,v1yx), vcombine_f32(v2yx,v2xy) ); - vResult = veorq_u32( vResult, g_XMFlipY ); - return vandq_u32( vResult, g_XMMask3 ); -#elif defined(_XM_SSE_INTRINSICS_) - // y1,z1,x1,w1 - XMVECTOR vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(3,0,2,1)); - // z2,x2,y2,w2 - XMVECTOR vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(3,1,0,2)); - // Perform the left operation - XMVECTOR vResult = _mm_mul_ps(vTemp1,vTemp2); - // z1,x1,y1,w1 - vTemp1 = XM_PERMUTE_PS(vTemp1,_MM_SHUFFLE(3,0,2,1)); - // y2,z2,x2,w2 - vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(3,1,0,2)); - // Perform the right operation - vTemp1 = _mm_mul_ps(vTemp1,vTemp2); - // Subract the right from left, and return answer - vResult = _mm_sub_ps(vResult,vTemp1); - // Set w to zero 
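// Editorial note: the two shuffle pairs above compute the textbook form
// (y1,z1,x1)*(z2,x2,y2) - (z1,x1,y1)*(y2,z2,x2); the AND with g_XMMask3
// below clears the undefined w lane.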
- return _mm_and_ps(vResult,g_XMMask3); -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3LengthSq -( - FXMVECTOR V -) -{ - return XMVector3Dot(V, V); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - - Result = XMVector3LengthSq(V); - Result = XMVectorReciprocalSqrtEst(Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot3 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vpadd_f32( v1, v1 ); - v2 = vdup_lane_f32( v2, 0 ); - v1 = vadd_f32( v1, v2 ); - // Reciprocal sqrt (estimate) - v2 = vrsqrte_f32( v1 ); - return vcombine_f32(v2, v2); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); - return _mm_rsqrt_ps( vTemp ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_rsqrt_ps(vLengthSq); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y and z - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has z and y - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); - // x+z, y - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - // y,y,y,y - vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); - // x+z+y,??,??,?? - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - // Splat the length squared - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - // Get the reciprocal - vLengthSq = _mm_rsqrt_ps(vLengthSq); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - - Result = XMVector3LengthSq(V); - Result = XMVectorReciprocalSqrt(Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot3 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vpadd_f32( v1, v1 ); - v2 = vdup_lane_f32( v2, 0 ); - v1 = vadd_f32( v1, v2 ); - // Reciprocal sqrt - float32x2_t S0 = vrsqrte_f32(v1); - float32x2_t P0 = vmul_f32( v1, S0 ); - float32x2_t R0 = vrsqrts_f32( P0, S0 ); - float32x2_t S1 = vmul_f32( S0, R0 ); - float32x2_t P1 = vmul_f32( v1, S1 ); - float32x2_t R1 = vrsqrts_f32( P1, S1 ); - float32x2_t Result = vmul_f32( S1, R1 ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); - XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); - return _mm_div_ps( g_XMOne, vLengthSq ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vDot = _mm_mul_ps(V, V); - vDot = _mm_and_ps(vDot, g_XMMask3); - vDot = _mm_hadd_ps(vDot, vDot); - vDot = _mm_hadd_ps(vDot, vDot); - vDot = _mm_sqrt_ps(vDot); - vDot = _mm_div_ps(g_XMOne,vDot); - return vDot; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product - XMVECTOR vDot = _mm_mul_ps(V,V); - // x=Dot.y, y=Dot.z - XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); - // Result.x = x+y - vDot = _mm_add_ss(vDot,vTemp); - // x=Dot.z - vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); - // Result.x = 
(x+y)+z - vDot = _mm_add_ss(vDot,vTemp); - // Splat x - vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); - // Get the reciprocal - vDot = _mm_sqrt_ps(vDot); - // Get the reciprocal - vDot = _mm_div_ps(g_XMOne,vDot); - return vDot; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3LengthEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - - Result = XMVector3LengthSq(V); - Result = XMVectorSqrtEst(Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot3 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vpadd_f32( v1, v1 ); - v2 = vdup_lane_f32( v2, 0 ); - v1 = vadd_f32( v1, v2 ); - const float32x2_t zero = vdup_n_f32(0); - uint32x2_t VEqualsZero = vceq_f32( v1, zero ); - // Sqrt (estimate) - float32x2_t Result = vrsqrte_f32( v1 ); - Result = vmul_f32( v1, Result ); - Result = vbsl_f32( VEqualsZero, zero, Result ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); - return _mm_sqrt_ps( vTemp ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_sqrt_ps(vLengthSq); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y and z - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has z and y - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); - // x+z, y - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - // y,y,y,y - vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); - // x+z+y,??,??,?? 
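// Editorial note: under the XM_PERMUTE_PS lane convention,
// _MM_SHUFFLE(1,2,1,2) yields (z,y,z,y), so the scalar add above produced
// x+z in lane 0 and the one below folds in y before the splat broadcasts
// the squared length.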
- vLengthSq = _mm_add_ss(vLengthSq,vTemp); - // Splat the length squared - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - // Get the length - vLengthSq = _mm_sqrt_ps(vLengthSq); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3Length -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - - Result = XMVector3LengthSq(V); - Result = XMVectorSqrt(Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot3 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vpadd_f32( v1, v1 ); - v2 = vdup_lane_f32( v2, 0 ); - v1 = vadd_f32( v1, v2 ); - const float32x2_t zero = vdup_n_f32(0); - uint32x2_t VEqualsZero = vceq_f32( v1, zero ); - // Sqrt - float32x2_t S0 = vrsqrte_f32( v1 ); - float32x2_t P0 = vmul_f32( v1, S0 ); - float32x2_t R0 = vrsqrts_f32( P0, S0 ); - float32x2_t S1 = vmul_f32( S0, R0 ); - float32x2_t P1 = vmul_f32( v1, S1 ); - float32x2_t R1 = vrsqrts_f32( P1, S1 ); - float32x2_t Result = vmul_f32( S1, R1 ); - Result = vmul_f32( v1, Result ); - Result = vbsl_f32( VEqualsZero, zero, Result ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); - return _mm_sqrt_ps( vTemp ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_sqrt_ps(vLengthSq); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y and z - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has z and y - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); - // x+z, y - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - // y,y,y,y - vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); - // x+z+y,??,??,?? - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - // Splat the length squared - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - // Get the length - vLengthSq = _mm_sqrt_ps(vLengthSq); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ -// XMVector3NormalizeEst uses a reciprocal estimate and -// returns QNaN on zero and infinite vectors. 
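// Editorial note: the Est variants trade accuracy for speed; they use a
// single hardware reciprocal-sqrt estimate (roughly 12 bits of precision)
// and omit the zero/infinity guards. A hedged usage sketch, with
// illustrative variable names:
//
//     XMVECTOR fast  = XMVector3NormalizeEst(v); // v known finite and non-zero
//     XMVECTOR exact = XMVector3Normalize(v);    // maps 0 -> 0 and INF -> QNaN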
- -inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result = XMVector3ReciprocalLength(V); - Result = XMVectorMultiply(V, Result); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot3 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vpadd_f32( v1, v1 ); - v2 = vdup_lane_f32( v2, 0 ); - v1 = vadd_f32( v1, v2 ); - // Reciprocal sqrt (estimate) - v2 = vrsqrte_f32( v1 ); - // Normalize - return vmulq_f32( V, vcombine_f32(v2,v2) ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); - XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); - return _mm_mul_ps(vResult, V); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vDot = _mm_mul_ps(V, V); - vDot = _mm_and_ps(vDot, g_XMMask3); - vDot = _mm_hadd_ps(vDot, vDot); - vDot = _mm_hadd_ps(vDot, vDot); - vDot = _mm_rsqrt_ps(vDot); - vDot = _mm_mul_ps(vDot,V); - return vDot; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product - XMVECTOR vDot = _mm_mul_ps(V,V); - // x=Dot.y, y=Dot.z - XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); - // Result.x = x+y - vDot = _mm_add_ss(vDot,vTemp); - // x=Dot.z - vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); - // Result.x = (x+y)+z - vDot = _mm_add_ss(vDot,vTemp); - // Splat x - vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); - // Get the reciprocal - vDot = _mm_rsqrt_ps(vDot); - // Perform the normalization - vDot = _mm_mul_ps(vDot,V); - return vDot; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3Normalize -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - float fLength; - XMVECTOR vResult; - - vResult = XMVector3Length( V ); - fLength = vResult.vector4_f32[0]; - - // Prevent divide by zero - if (fLength > 0) { - fLength = 1.0f/fLength; - } - - vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; - vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; - vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; - vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; - return vResult; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot3 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vpadd_f32( v1, v1 ); - v2 = vdup_lane_f32( v2, 0 ); - v1 = vadd_f32( v1, v2 ); - uint32x2_t VEqualsZero = vceq_f32( v1, vdup_n_f32(0) ); - uint32x2_t VEqualsInf = vceq_f32( v1, vget_low_f32(g_XMInfinity) ); - // Reciprocal sqrt (2 iterations of Newton-Raphson) - float32x2_t S0 = vrsqrte_f32( v1 ); - float32x2_t P0 = vmul_f32( v1, S0 ); - float32x2_t R0 = vrsqrts_f32( P0, S0 ); - float32x2_t S1 = vmul_f32( S0, R0 ); - float32x2_t P1 = vmul_f32( v1, S1 ); - float32x2_t R1 = vrsqrts_f32( P1, S1 ); - v2 = vmul_f32( S1, R1 ); - // Normalize - XMVECTOR vResult = vmulq_f32( V, vcombine_f32(v2,v2) ); - vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult ); - return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x7f ); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // 
If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Divide to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -#elif defined(_XM_SSE3_INTRINSICS_) - // Perform the dot product on x,y and z only - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Divide to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y and z only - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,1,2,1)); - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); - vLengthSq = _mm_add_ss(vLengthSq,vTemp); - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Divide to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3ClampLength -( - FXMVECTOR V, - float LengthMin, - float LengthMax -) -{ - XMVECTOR ClampMax = XMVectorReplicate(LengthMax); - XMVECTOR ClampMin = XMVectorReplicate(LengthMin); - - return XMVector3ClampLengthV(V, ClampMin, ClampMax); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3ClampLengthV -( - FXMVECTOR V, - FXMVECTOR LengthMin, - FXMVECTOR LengthMax -) -{ - assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin))); - assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax))); - 
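// (The asserts above require the bounds to be replicated scalars; those
// below additionally require 0 <= LengthMin <= LengthMax.)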
assert(XMVector3GreaterOrEqual(LengthMin, XMVectorZero())); - assert(XMVector3GreaterOrEqual(LengthMax, XMVectorZero())); - assert(XMVector3GreaterOrEqual(LengthMax, LengthMin)); - - XMVECTOR LengthSq = XMVector3LengthSq(V); - - const XMVECTOR Zero = XMVectorZero(); - - XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); - - XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); - XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); - - XMVECTOR Normal = XMVectorMultiply(V, RcpLength); - - XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); - - XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); - Length = XMVectorSelect(LengthSq, Length, Select); - Normal = XMVectorSelect(LengthSq, Normal, Select); - - XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); - XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); - - XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); - ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); - - XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); - - // Preserve the original vector (with no precision loss) if the length falls within the given range - XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); - Result = XMVectorSelect(Result, V, Control); - - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3Reflect -( - FXMVECTOR Incident, - FXMVECTOR Normal -) -{ - // Result = Incident - (2 * dot(Incident, Normal)) * Normal - - XMVECTOR Result = XMVector3Dot(Incident, Normal); - Result = XMVectorAdd(Result, Result); - Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); - - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3Refract -( - FXMVECTOR Incident, - FXMVECTOR Normal, - float RefractionIndex -) -{ - XMVECTOR Index = XMVectorReplicate(RefractionIndex); - return XMVector3RefractV(Incident, Normal, Index); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3RefractV -( - FXMVECTOR Incident, - FXMVECTOR Normal, - FXMVECTOR RefractionIndex -) -{ - // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + - // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) - -#if defined(_XM_NO_INTRINSICS_) - - const XMVECTOR Zero = XMVectorZero(); - - XMVECTOR IDotN = XMVector3Dot(Incident, Normal); - - // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) - XMVECTOR R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); - R = XMVectorMultiply(R, RefractionIndex); - R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); - - if (XMVector4LessOrEqual(R, Zero)) - { - // Total internal reflection - return Zero; - } - else - { - // R = RefractionIndex * IDotN + sqrt(R) - R = XMVectorSqrt(R); - R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); - - // Result = RefractionIndex * Incident - Normal * R - XMVECTOR Result = XMVectorMultiply(RefractionIndex, Incident); - Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); - - return Result; - } - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR IDotN = XMVector3Dot(Incident,Normal); - - // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) - float32x4_t R = vmlsq_f32( g_XMOne, IDotN, IDotN); - R = vmulq_f32(R, RefractionIndex); - R 
= vmlsq_f32(g_XMOne, R, RefractionIndex );
-
-    uint32x4_t vResult = vcleq_f32(R,g_XMZero);
-    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
-    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
-    if ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU )
-    {
-        // Total internal reflection
-        vResult = g_XMZero;
-    }
-    else
-    {
-        // Sqrt(R)
-        float32x4_t S0 = vrsqrteq_f32(R);
-        float32x4_t P0 = vmulq_f32( R, S0 );
-        float32x4_t R0 = vrsqrtsq_f32( P0, S0 );
-        float32x4_t S1 = vmulq_f32( S0, R0 );
-        float32x4_t P1 = vmulq_f32( R, S1 );
-        float32x4_t R1 = vrsqrtsq_f32( P1, S1 );
-        float32x4_t S2 = vmulq_f32( S1, R1 );
-        R = vmulq_f32( R, S2 );
-        // R = RefractionIndex * IDotN + sqrt(R)
-        R = vmlaq_f32( R, RefractionIndex, IDotN );
-        // Result = RefractionIndex * Incident - Normal * R
-        vResult = vmulq_f32(RefractionIndex, Incident);
-        vResult = vmlsq_f32( vResult, R, Normal );
-    }
-    return vResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
-    // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
-    XMVECTOR IDotN = XMVector3Dot(Incident, Normal);
-    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
-    XMVECTOR R = _mm_mul_ps(IDotN, IDotN);
-    R = _mm_sub_ps(g_XMOne,R);
-    R = _mm_mul_ps(R, RefractionIndex);
-    R = _mm_mul_ps(R, RefractionIndex);
-    R = _mm_sub_ps(g_XMOne,R);
-
-    XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero);
-    if (_mm_movemask_ps(vResult)==0x0f)
-    {
-        // Total internal reflection
-        vResult = g_XMZero;
-    }
-    else
-    {
-        // R = RefractionIndex * IDotN + sqrt(R)
-        R = _mm_sqrt_ps(R);
-        vResult = _mm_mul_ps(RefractionIndex,IDotN);
-        R = _mm_add_ps(R,vResult);
-        // Result = RefractionIndex * Incident - Normal * R
-        vResult = _mm_mul_ps(RefractionIndex, Incident);
-        R = _mm_mul_ps(R,Normal);
-        vResult = _mm_sub_ps(vResult,R);
-    }
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3Orthogonal
-(
-    FXMVECTOR V
-)
-{
-    XMVECTOR Zero = XMVectorZero();
-    XMVECTOR Z = XMVectorSplatZ(V);
-    XMVECTOR YZYY = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(V);
-
-    XMVECTOR NegativeV = XMVectorSubtract(Zero, V);
-
-    XMVECTOR ZIsNegative = XMVectorLess(Z, Zero);
-    XMVECTOR YZYYIsNegative = XMVectorLess(YZYY, Zero);
-
-    XMVECTOR S = XMVectorAdd(YZYY, Z);
-    XMVECTOR D = XMVectorSubtract(YZYY, Z);
-
-    XMVECTOR Select = XMVectorEqualInt(ZIsNegative, YZYYIsNegative);
-
-    XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(NegativeV, S);
-    XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(V, D);
-
-    return XMVectorSelect(R1, R0, Select);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst
-(
-    FXMVECTOR N1,
-    FXMVECTOR N2
-)
-{
-    XMVECTOR Result = XMVector3Dot(N1, N2);
-    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
-    Result = XMVectorACosEst(Result);
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals
-(
-    FXMVECTOR N1,
-    FXMVECTOR N2
-)
-{
-    XMVECTOR Result = XMVector3Dot(N1, N2);
-    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
-    Result = XMVectorACos(Result);
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2
-)
-{
-    XMVECTOR L1 =
XMVector3ReciprocalLength(V1); - XMVECTOR L2 = XMVector3ReciprocalLength(V2); - - XMVECTOR Dot = XMVector3Dot(V1, V2); - - L1 = XMVectorMultiply(L1, L2); - - XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); - CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); - - return XMVectorACos(CosAngle); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3LinePointDistance -( - FXMVECTOR LinePoint1, - FXMVECTOR LinePoint2, - FXMVECTOR Point -) -{ - // Given a vector PointVector from LinePoint1 to Point and a vector - // LineVector from LinePoint1 to LinePoint2, the scaled distance - // PointProjectionScale from LinePoint1 to the perpendicular projection - // of PointVector onto the line is defined as: - // - // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) - - XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1); - XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1); - - XMVECTOR LengthSq = XMVector3LengthSq(LineVector); - - XMVECTOR PointProjectionScale = XMVector3Dot(PointVector, LineVector); - PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq); - - XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); - DistanceVector = XMVectorSubtract(PointVector, DistanceVector); - - return XMVector3Length(DistanceVector); -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline void XM_CALLCONV XMVector3ComponentsFromNormal -( - XMVECTOR* pParallel, - XMVECTOR* pPerpendicular, - FXMVECTOR V, - FXMVECTOR Normal -) -{ - assert(pParallel != nullptr); - assert(pPerpendicular != nullptr); - - XMVECTOR Scale = XMVector3Dot(V, Normal); - - XMVECTOR Parallel = XMVectorMultiply(Normal, Scale); - - *pParallel = Parallel; - *pPerpendicular = XMVectorSubtract(V, Parallel); -} - -//------------------------------------------------------------------------------ -// Transform a vector using a rotation expressed as a unit quaternion - -inline XMVECTOR XM_CALLCONV XMVector3Rotate -( - FXMVECTOR V, - FXMVECTOR RotationQuaternion -) -{ - XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); - XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); - XMVECTOR Result = XMQuaternionMultiply(Q, A); - return XMQuaternionMultiply(Result, RotationQuaternion); -} - -//------------------------------------------------------------------------------ -// Transform a vector using the inverse of a rotation expressed as a unit quaternion - -inline XMVECTOR XM_CALLCONV XMVector3InverseRotate -( - FXMVECTOR V, - FXMVECTOR RotationQuaternion -) -{ - XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); - XMVECTOR Result = XMQuaternionMultiply(RotationQuaternion, A); - XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); - return XMQuaternionMultiply(Result, Q); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3Transform -( - FXMVECTOR V, - FXMMATRIX M -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Z = XMVectorSplatZ(V); - XMVECTOR Y = XMVectorSplatY(V); - XMVECTOR X = XMVectorSplatX(V); - - XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); - Result = XMVectorMultiplyAdd(Y, M.r[1], Result); - Result = XMVectorMultiplyAdd(X, M.r[0], Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t VL = vget_low_f32( V ); - XMVECTOR vResult = vmlaq_lane_f32( M.r[3], M.r[0], VL, 
0 ); // X - vResult = vmlaq_lane_f32( vResult, M.r[1], VL, 1 ); // Y - return vmlaq_lane_f32( vResult, M.r[2], vget_high_f32( V ), 0 ); // Z -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); - vResult = _mm_mul_ps(vResult,M.r[0]); - XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); - vTemp = _mm_mul_ps(vTemp,M.r[1]); - vResult = _mm_add_ps(vResult,vTemp); - vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); - vTemp = _mm_mul_ps(vTemp,M.r[2]); - vResult = _mm_add_ps(vResult,vTemp); - vResult = _mm_add_ps(vResult,M.r[3]); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline XMFLOAT4* XM_CALLCONV XMVector3TransformStream -( - XMFLOAT4* pOutputStream, - size_t OutputStride, - const XMFLOAT3* pInputStream, - size_t InputStride, - size_t VectorCount, - FXMMATRIX M -) -{ - assert(pOutputStream != nullptr); - assert(pInputStream != nullptr); - - assert(InputStride >= sizeof(XMFLOAT3)); - _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); - - assert(OutputStride >= sizeof(XMFLOAT4)); - _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4)); - -#if defined(_XM_NO_INTRINSICS_) - - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - const XMVECTOR row0 = M.r[0]; - const XMVECTOR row1 = M.r[1]; - const XMVECTOR row2 = M.r[2]; - const XMVECTOR row3 = M.r[3]; - - for (size_t i = 0; i < VectorCount; i++) - { - XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); - XMVECTOR Z = XMVectorSplatZ(V); - XMVECTOR Y = XMVectorSplatY(V); - XMVECTOR X = XMVectorSplatX(V); - - XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3); - Result = XMVectorMultiplyAdd(Y, row1, Result); - Result = XMVectorMultiplyAdd(X, row0, Result); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat4((XMFLOAT4*)pOutputVector, Result); - - pInputVector += InputStride; - pOutputVector += OutputStride; - } - - return pOutputStream; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - const XMVECTOR row0 = M.r[0]; - const XMVECTOR row1 = M.r[1]; - const XMVECTOR row2 = M.r[2]; - const XMVECTOR row3 = M.r[3]; - - size_t i = 0; - size_t four = VectorCount >> 2; - if ( four > 0 ) - { - if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT4))) - { - for (size_t j = 0; j < four; ++j) - { - float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); - pInputVector += sizeof(XMFLOAT3)*4; - - float32x2_t r3 = vget_low_f32( row3 ); - float32x2_t r = vget_low_f32( row0 ); - XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M - XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N - - __prefetch( pInputVector ); - - r3 = vget_high_f32( row3 ); - r = vget_high_f32( row0 ); - XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O - XMVECTOR vResult3 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P - - __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); - - r = vget_low_f32( row1 ); - vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M - vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); - - r = vget_high_f32( row1 ); - vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O - vResult3 = 
vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy+P - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); - - r = vget_low_f32( row2 ); - vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M - vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); - - r = vget_high_f32( row2 ); - vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O - vResult3 = vmlaq_lane_f32( vResult3, V.val[2], r, 1 ); // Dx+Hy+Lz+P - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); - - float32x4x4_t R; - R.val[0] = vResult0; - R.val[1] = vResult1; - R.val[2] = vResult2; - R.val[3] = vResult3; - - vst4q_f32( reinterpret_cast(pOutputVector), R ); - pOutputVector += sizeof(XMFLOAT4)*4; - - i += 4; - } - } - } - - for (; i < VectorCount; i++) - { - float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); - float32x2_t zero = vdup_n_f32(0); - float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); - pInputVector += InputStride; - - XMVECTOR vResult = vmlaq_lane_f32( row3, row0, VL, 0 ); // X - vResult = vmlaq_lane_f32( vResult, row1, VL, 1); // Y - vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z - - vst1q_f32( reinterpret_cast(pOutputVector), vResult ); - pOutputVector += OutputStride; - } - - return pOutputStream; -#elif defined(_XM_SSE_INTRINSICS_) - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - const XMVECTOR row0 = M.r[0]; - const XMVECTOR row1 = M.r[1]; - const XMVECTOR row2 = M.r[2]; - const XMVECTOR row3 = M.r[3]; - - size_t i = 0; - size_t four = VectorCount >> 2; - if ( four > 0 ) - { - if (InputStride == sizeof(XMFLOAT3)) - { - if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) ) - { - // Packed input, aligned output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); - pOutputVector += OutputStride; - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); - pOutputVector += OutputStride; - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, 
vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); - pOutputVector += OutputStride; - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); - pOutputVector += OutputStride; - - i += 4; - } - } - else - { - // Packed input, unaligned output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); - pOutputVector += OutputStride; - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); - pOutputVector += OutputStride; - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); - pOutputVector += OutputStride; - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); - pOutputVector += OutputStride; - - i += 4; - } - } - } - } - - if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) ) - { - // Aligned output - for (; i < VectorCount; ++i) - { - #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) - XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); - pInputVector += InputStride; - - XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, 
row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); - pOutputVector += OutputStride; - } - } - else - { - // Unaligned output - for (; i < VectorCount; ++i) - { - #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) - XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); - pInputVector += InputStride; - - XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); - pOutputVector += OutputStride; - } - } - - XM_SFENCE(); - - return pOutputStream; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3TransformCoord -( - FXMVECTOR V, - FXMMATRIX M -) -{ - XMVECTOR Z = XMVectorSplatZ(V); - XMVECTOR Y = XMVectorSplatY(V); - XMVECTOR X = XMVectorSplatX(V); - - XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); - Result = XMVectorMultiplyAdd(Y, M.r[1], Result); - Result = XMVectorMultiplyAdd(X, M.r[0], Result); - - XMVECTOR W = XMVectorSplatW(Result); - return XMVectorDivide( Result, W ); -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream -( - XMFLOAT3* pOutputStream, - size_t OutputStride, - const XMFLOAT3* pInputStream, - size_t InputStride, - size_t VectorCount, - FXMMATRIX M -) -{ - assert(pOutputStream != nullptr); - assert(pInputStream != nullptr); - - assert(InputStride >= sizeof(XMFLOAT3)); - _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); - - assert(OutputStride >= sizeof(XMFLOAT3)); - _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); - -#if defined(_XM_NO_INTRINSICS_) - - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - const XMVECTOR row0 = M.r[0]; - const XMVECTOR row1 = M.r[1]; - const XMVECTOR row2 = M.r[2]; - const XMVECTOR row3 = M.r[3]; - - for (size_t i = 0; i < VectorCount; i++) - { - XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); - XMVECTOR Z = XMVectorSplatZ(V); - XMVECTOR Y = XMVectorSplatY(V); - XMVECTOR X = XMVectorSplatX(V); - - XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3); - Result = XMVectorMultiplyAdd(Y, row1, Result); - Result = XMVectorMultiplyAdd(X, row0, Result); - - XMVECTOR W = XMVectorSplatW(Result); - - Result = XMVectorDivide(Result, W); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); - - pInputVector += InputStride; - pOutputVector += OutputStride; - } - - return pOutputStream; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - const XMVECTOR row0 = M.r[0]; - const XMVECTOR row1 = M.r[1]; - const XMVECTOR row2 = M.r[2]; - const XMVECTOR row3 = M.r[3]; - - size_t i = 0; - size_t four = VectorCount >> 2; - if ( four > 0 ) - { - if 
((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) - { - for (size_t j = 0; j < four; ++j) - { - float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); - pInputVector += sizeof(XMFLOAT3)*4; - - float32x2_t r3 = vget_low_f32( row3 ); - float32x2_t r = vget_low_f32( row0 ); - XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M - XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N - - __prefetch( pInputVector ); - - r3 = vget_high_f32( row3 ); - r = vget_high_f32( row0 ); - XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O - XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P - - __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); - - r = vget_low_f32( row1 ); - vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M - vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); - - r = vget_high_f32( row1 ); - vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O - W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); - - r = vget_low_f32( row2 ); - vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M - vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); - - r = vget_high_f32( row2 ); - vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O - W = vmlaq_lane_f32( W, V.val[2], r, 1 ); // Dx+Hy+Lz+P - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); - - // 2 iterations of Newton-Raphson refinement of reciprocal - float32x4_t Reciprocal = vrecpeq_f32(W); - float32x4_t S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - - V.val[0] = vmulq_f32( vResult0, Reciprocal ); - V.val[1] = vmulq_f32( vResult1, Reciprocal ); - V.val[2] = vmulq_f32( vResult2, Reciprocal ); - - vst3q_f32( reinterpret_cast(pOutputVector),V ); - pOutputVector += sizeof(XMFLOAT3)*4; - - i += 4; - } - } - } - - for (; i < VectorCount; i++) - { - float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); - float32x2_t zero = vdup_n_f32(0); - float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); - pInputVector += InputStride; - - XMVECTOR vResult = vmlaq_lane_f32( row3, row0, VL, 0 ); // X - vResult = vmlaq_lane_f32( vResult, row1, VL, 1 ); // Y - vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z - - VH = vget_high_f32(vResult); - XMVECTOR W = vdupq_lane_f32( VH, 1 ); - - // 2 iterations of Newton-Raphson refinement of reciprocal for W - float32x4_t Reciprocal = vrecpeq_f32( W ); - float32x4_t S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - - vResult = vmulq_f32( vResult, Reciprocal ); - - VL = vget_low_f32( vResult ); - vst1_f32( reinterpret_cast(pOutputVector), VL ); - vst1q_lane_f32( reinterpret_cast(pOutputVector)+2, vResult, 2 ); - pOutputVector += OutputStride; - } - - return pOutputStream; -#elif defined(_XM_SSE_INTRINSICS_) - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - const XMVECTOR row0 = M.r[0]; - const XMVECTOR row1 = M.r[1]; - const XMVECTOR row2 = M.r[2]; - const XMVECTOR row3 = M.r[3]; - - size_t i = 0; - 
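// Editorial note: as in XMVector3TransformStream, 'four' below counts the
// complete 4-vector blocks handled by the packed fast paths; the scalar
// tail loop then finishes the remainder, performing the w-divide per vector.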
size_t four = VectorCount >> 2; - if ( four > 0 ) - { - if (InputStride == sizeof(XMFLOAT3)) - { - if (OutputStride == sizeof(XMFLOAT3)) - { - if ( !((uintptr_t)pOutputStream & 0xF) ) - { - // Packed input, aligned & packed output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - V1 = _mm_div_ps( vTemp, W ); - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - V2 = _mm_div_ps( vTemp, W ); - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - V3 = _mm_div_ps( vTemp, W ); - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - V4 = _mm_div_ps( vTemp, W ); - - // Pack and store the vectors - XM3PACK4INTO3(vTemp); - XM_STREAM_PS( reinterpret_cast(pOutputVector), V1 ); - XM_STREAM_PS( reinterpret_cast(pOutputVector+16), vTemp ); - XM_STREAM_PS( reinterpret_cast(pOutputVector+32), V3 ); - pOutputVector += sizeof(XMFLOAT3)*4; - i += 4; - } - } - else - { - // Packed input, unaligned & packed output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - 
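                    // Together with the multiply on the next line, this
                    // accumulates the row-vector product
                    //     vTemp = x*row0 + y*row1 + z*row2 + row3
                    // for the first unpacked vector; the divide by the
                    // splatted .w below is its perspective divide. The same
                    // pattern repeats for Results 2-4 (illustrative note).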
XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - V1 = _mm_div_ps( vTemp, W ); - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - V2 = _mm_div_ps( vTemp, W ); - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - V3 = _mm_div_ps( vTemp, W ); - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - V4 = _mm_div_ps( vTemp, W ); - - // Pack and store the vectors - XM3PACK4INTO3(vTemp); - _mm_storeu_ps( reinterpret_cast(pOutputVector), V1 ); - _mm_storeu_ps( reinterpret_cast(pOutputVector+16), vTemp ); - _mm_storeu_ps( reinterpret_cast(pOutputVector+32), V3 ); - pOutputVector += sizeof(XMFLOAT3)*4; - i += 4; - } - } - } - else - { - // Packed input, unpacked output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - vTemp = _mm_div_ps( vTemp, W ); - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - vTemp = 
_mm_div_ps( vTemp, W ); - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - vTemp = _mm_div_ps( vTemp, W ); - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - vTemp = _mm_div_ps( vTemp, W ); - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - i += 4; - } - } - } - } - - for (; i < VectorCount; i++) - { - #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) - XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); - pInputVector += InputStride; - - XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, row3 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - - vTemp = _mm_div_ps( vTemp, W ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - } - - XM_SFENCE(); - - return pOutputStream; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3TransformNormal -( - FXMVECTOR V, - FXMMATRIX M -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Z = XMVectorSplatZ(V); - XMVECTOR Y = XMVectorSplatY(V); - XMVECTOR X = XMVectorSplatX(V); - - XMVECTOR Result = XMVectorMultiply(Z, M.r[2]); - Result = XMVectorMultiplyAdd(Y, M.r[1], Result); - Result = XMVectorMultiplyAdd(X, M.r[0], Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x2_t VL = vget_low_f32( V ); - XMVECTOR vResult = vmulq_lane_f32( M.r[0], VL, 0 ); // X - vResult = vmlaq_lane_f32( vResult, M.r[1], VL, 1 ); // Y - return vmlaq_lane_f32( vResult, M.r[2], vget_high_f32( V ), 0 ); // Z -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); - vResult = _mm_mul_ps(vResult,M.r[0]); - XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); - vTemp = _mm_mul_ps(vTemp,M.r[1]); - vResult = _mm_add_ps(vResult,vTemp); - vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); - vTemp = _mm_mul_ps(vTemp,M.r[2]); - vResult = 
_mm_add_ps(vResult,vTemp); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream -( - XMFLOAT3* pOutputStream, - size_t OutputStride, - const XMFLOAT3* pInputStream, - size_t InputStride, - size_t VectorCount, - FXMMATRIX M -) -{ - assert(pOutputStream != nullptr); - assert(pInputStream != nullptr); - - assert(InputStride >= sizeof(XMFLOAT3)); - _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); - - assert(OutputStride >= sizeof(XMFLOAT3)); - _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); - -#if defined(_XM_NO_INTRINSICS_) - - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - const XMVECTOR row0 = M.r[0]; - const XMVECTOR row1 = M.r[1]; - const XMVECTOR row2 = M.r[2]; - - for (size_t i = 0; i < VectorCount; i++) - { - XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); - XMVECTOR Z = XMVectorSplatZ(V); - XMVECTOR Y = XMVectorSplatY(V); - XMVECTOR X = XMVectorSplatX(V); - - XMVECTOR Result = XMVectorMultiply(Z, row2); - Result = XMVectorMultiplyAdd(Y, row1, Result); - Result = XMVectorMultiplyAdd(X, row0, Result); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); - - pInputVector += InputStride; - pOutputVector += OutputStride; - } - - return pOutputStream; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - const XMVECTOR row0 = M.r[0]; - const XMVECTOR row1 = M.r[1]; - const XMVECTOR row2 = M.r[2]; - - size_t i = 0; - size_t four = VectorCount >> 2; - if ( four > 0 ) - { - if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) - { - for (size_t j = 0; j < four; ++j) - { - float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); - pInputVector += sizeof(XMFLOAT3)*4; - - float32x2_t r = vget_low_f32( row0 ); - XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax - XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx - - __prefetch( pInputVector ); - - r = vget_high_f32( row0 ); - XMVECTOR vResult2 = vmulq_lane_f32( V.val[0], r, 0 ); // Cx - - __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); - - r = vget_low_f32( row1 ); - vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey - vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); - - r = vget_high_f32( row1 ); - vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); - - r = vget_low_f32( row2 ); - vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz - vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); - - r = vget_high_f32( row2 ); - vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); - - V.val[0] = vResult0; - V.val[1] = vResult1; - V.val[2] = vResult2; - - vst3q_f32( reinterpret_cast(pOutputVector), V ); - pOutputVector += sizeof(XMFLOAT3)*4; - - i += 4; - } - } - } - - for (; i < VectorCount; i++) - { - float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); - float32x2_t zero = vdup_n_f32(0); - float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); - pInputVector += InputStride; - - 
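        // Normals are transformed by the upper 3x3 of M only: the three steps
        // below intentionally omit row3 (translation) and the w divide that
        // the *Coord variants perform. Scalar sketch of one element
        // (illustrative): out = x*row0 + y*row1 + z*row2;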
XMVECTOR vResult = vmulq_lane_f32( row0, VL, 0 ); // X - vResult = vmlaq_lane_f32( vResult, row1, VL, 1 ); // Y - vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z - - VL = vget_low_f32( vResult ); - vst1_f32( reinterpret_cast(pOutputVector), VL ); - vst1q_lane_f32( reinterpret_cast(pOutputVector)+2, vResult, 2 ); - pOutputVector += OutputStride; - } - - return pOutputStream; -#elif defined(_XM_SSE_INTRINSICS_) - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - const XMVECTOR row0 = M.r[0]; - const XMVECTOR row1 = M.r[1]; - const XMVECTOR row2 = M.r[2]; - - size_t i = 0; - size_t four = VectorCount >> 2; - if ( four > 0 ) - { - if (InputStride == sizeof(XMFLOAT3)) - { - if (OutputStride == sizeof(XMFLOAT3)) - { - if ( !((uintptr_t)pOutputStream & 0xF) ) - { - // Packed input, aligned & packed output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - V1 = _mm_add_ps( vTemp, vTemp3 ); - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - V2 = _mm_add_ps( vTemp, vTemp3 ); - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - V3 = _mm_add_ps( vTemp, vTemp3 ); - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - V4 = _mm_add_ps( vTemp, vTemp3 ); - - // Pack and store the vectors - XM3PACK4INTO3(vTemp); - XM_STREAM_PS( reinterpret_cast(pOutputVector), V1 ); - XM_STREAM_PS( reinterpret_cast(pOutputVector+16), vTemp ); - XM_STREAM_PS( reinterpret_cast(pOutputVector+32), V3 ); - pOutputVector += sizeof(XMFLOAT3)*4; - i += 4; - } - } - else - { - // Packed input, unaligned & packed output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( 
V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - V1 = _mm_add_ps( vTemp, vTemp3 ); - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - V2 = _mm_add_ps( vTemp, vTemp3 ); - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - V3 = _mm_add_ps( vTemp, vTemp3 ); - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - V4 = _mm_add_ps( vTemp, vTemp3 ); - - // Pack and store the vectors - XM3PACK4INTO3(vTemp); - _mm_storeu_ps( reinterpret_cast(pOutputVector), V1 ); - _mm_storeu_ps( reinterpret_cast(pOutputVector+16), vTemp ); - _mm_storeu_ps( reinterpret_cast(pOutputVector+32), V3 ); - pOutputVector += sizeof(XMFLOAT3)*4; - i += 4; - } - } - } - else - { - // Packed input, unpacked output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += 
OutputStride; - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, row2 ); - vTemp2 = _mm_mul_ps( Y, row1 ); - vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - i += 4; - } - } - } - } - - for (; i < VectorCount; i++) - { - #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) - XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); - pInputVector += InputStride; - - XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); - XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - } - - XM_SFENCE(); - - return pOutputStream; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3Project -( - FXMVECTOR V, - float ViewportX, - float ViewportY, - float ViewportWidth, - float ViewportHeight, - float ViewportMinZ, - float ViewportMaxZ, - FXMMATRIX Projection, - CXMMATRIX View, - CXMMATRIX World -) -{ - const float HalfViewportWidth = ViewportWidth * 0.5f; - const float HalfViewportHeight = ViewportHeight * 0.5f; - - XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); - XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); - - XMMATRIX Transform = XMMatrixMultiply(World, View); - Transform = XMMatrixMultiply(Transform, Projection); - - XMVECTOR Result = XMVector3TransformCoord(V, Transform); - - Result = XMVectorMultiplyAdd(Result, Scale, Offset); - - return Result; -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream -( - XMFLOAT3* pOutputStream, - size_t OutputStride, - const XMFLOAT3* pInputStream, - size_t InputStride, - size_t VectorCount, - float ViewportX, - float ViewportY, - float ViewportWidth, - float ViewportHeight, - float ViewportMinZ, - float ViewportMaxZ, - FXMMATRIX Projection, - CXMMATRIX View, - CXMMATRIX World -) -{ - assert(pOutputStream != nullptr); - assert(pInputStream != nullptr); - - assert(InputStride >= sizeof(XMFLOAT3)); - _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); - - assert(OutputStride >= sizeof(XMFLOAT3)); - _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); - -#if defined(_XM_NO_INTRINSICS_) - - const float HalfViewportWidth = ViewportWidth * 0.5f; - const float HalfViewportHeight = ViewportHeight * 0.5f; - - XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); - XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); - - XMMATRIX Transform = XMMatrixMultiply(World, View); - Transform = XMMatrixMultiply(Transform, Projection); - - const uint8_t* 
pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - for (size_t i = 0; i < VectorCount; i++) - { - XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); - - XMVECTOR Result = XMVector3TransformCoord(V, Transform); - Result = XMVectorMultiplyAdd(Result, Scale, Offset); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); - - pInputVector += InputStride; - pOutputVector += OutputStride; - } - - return pOutputStream; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - const float HalfViewportWidth = ViewportWidth * 0.5f; - const float HalfViewportHeight = ViewportHeight * 0.5f; - - XMMATRIX Transform = XMMatrixMultiply(World, View); - Transform = XMMatrixMultiply(Transform, Projection); - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - size_t i = 0; - size_t four = VectorCount >> 2; - if ( four > 0 ) - { - if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) - { - XMVECTOR ScaleX = vdupq_n_f32(HalfViewportWidth); - XMVECTOR ScaleY = vdupq_n_f32(-HalfViewportHeight); - XMVECTOR ScaleZ = vdupq_n_f32(ViewportMaxZ - ViewportMinZ); - - XMVECTOR OffsetX = vdupq_n_f32(ViewportX + HalfViewportWidth); - XMVECTOR OffsetY = vdupq_n_f32(ViewportY + HalfViewportHeight); - XMVECTOR OffsetZ = vdupq_n_f32(ViewportMinZ); - - for (size_t j = 0; j < four; ++j) - { - float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); - pInputVector += sizeof(XMFLOAT3)*4; - - float32x2_t r3 = vget_low_f32( Transform.r[3] ); - float32x2_t r = vget_low_f32( Transform.r[0] ); - XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M - XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N - - __prefetch( pInputVector ); - - r3 = vget_high_f32( Transform.r[3] ); - r = vget_high_f32( Transform.r[0] ); - XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O - XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P - - __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); - - r = vget_low_f32( Transform.r[1] ); - vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M - vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); - - r = vget_high_f32( Transform.r[1] ); - vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O - W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); - - r = vget_low_f32( Transform.r[2] ); - vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M - vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); - - r = vget_high_f32( Transform.r[2] ); - vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O - W = vmlaq_lane_f32( W, V.val[2], r, 1 ); // Dx+Hy+Lz+P - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); - - // 2 iterations of Newton-Raphson refinement of reciprocal - float32x4_t Reciprocal = vrecpeq_f32(W); - float32x4_t S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - - vResult0 = vmulq_f32( vResult0, Reciprocal ); - vResult1 = vmulq_f32( vResult1, Reciprocal ); - vResult2 = vmulq_f32( vResult2, Reciprocal ); - - V.val[0] = vmlaq_f32( OffsetX, 
vResult0, ScaleX ); - V.val[1] = vmlaq_f32( OffsetY, vResult1, ScaleY ); - V.val[2] = vmlaq_f32( OffsetZ, vResult2, ScaleZ ); - - vst3q_f32( reinterpret_cast(pOutputVector),V ); - pOutputVector += sizeof(XMFLOAT3)*4; - - i += 4; - } - } - } - - if ( i < VectorCount) - { - XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); - XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); - - for (; i < VectorCount; i++) - { - float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); - float32x2_t zero = vdup_n_f32(0); - float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); - pInputVector += InputStride; - - XMVECTOR vResult = vmlaq_lane_f32( Transform.r[3], Transform.r[0], VL, 0 ); // X - vResult = vmlaq_lane_f32( vResult, Transform.r[1], VL, 1 ); // Y - vResult = vmlaq_lane_f32( vResult, Transform.r[2], VH, 0 ); // Z - - VH = vget_high_f32(vResult); - XMVECTOR W = vdupq_lane_f32( VH, 1 ); - - // 2 iterations of Newton-Raphson refinement of reciprocal for W - float32x4_t Reciprocal = vrecpeq_f32( W ); - float32x4_t S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - - vResult = vmulq_f32( vResult, Reciprocal ); - - vResult = vmlaq_f32( Offset, vResult, Scale ); - - VL = vget_low_f32( vResult ); - vst1_f32( reinterpret_cast(pOutputVector), VL ); - vst1q_lane_f32( reinterpret_cast(pOutputVector)+2, vResult, 2 ); - pOutputVector += OutputStride; - } - } - - return pOutputStream; -#elif defined(_XM_SSE_INTRINSICS_) - const float HalfViewportWidth = ViewportWidth * 0.5f; - const float HalfViewportHeight = ViewportHeight * 0.5f; - - XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); - XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); - - XMMATRIX Transform = XMMatrixMultiply(World, View); - Transform = XMMatrixMultiply(Transform, Projection); - - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - size_t i = 0; - size_t four = VectorCount >> 2; - if ( four > 0 ) - { - if (InputStride == sizeof(XMFLOAT3)) - { - if (OutputStride == sizeof(XMFLOAT3)) - { - if ( !((uintptr_t)pOutputStream & 0xF) ) - { - // Packed input, aligned & packed output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - V1 = _mm_add_ps( vTemp, Offset ); - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 
2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - V2 = _mm_add_ps( vTemp, Offset ); - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - V3 = _mm_add_ps( vTemp, Offset ); - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - V4 = _mm_add_ps( vTemp, Offset ); - - // Pack and store the vectors - XM3PACK4INTO3(vTemp); - XM_STREAM_PS( reinterpret_cast(pOutputVector), V1 ); - XM_STREAM_PS( reinterpret_cast(pOutputVector+16), vTemp ); - XM_STREAM_PS( reinterpret_cast(pOutputVector+32), V3 ); - pOutputVector += sizeof(XMFLOAT3)*4; - i += 4; - } - } - else - { - // Packed input, unaligned & packed output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - V1 = _mm_add_ps( vTemp, Offset ); - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, 
_MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - V2 = _mm_add_ps( vTemp, Offset ); - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - V3 = _mm_add_ps( vTemp, Offset ); - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - V4 = _mm_add_ps( vTemp, Offset ); - - // Pack and store the vectors - XM3PACK4INTO3(vTemp); - _mm_storeu_ps( reinterpret_cast(pOutputVector), V1 ); - _mm_storeu_ps( reinterpret_cast(pOutputVector+16), vTemp ); - _mm_storeu_ps( reinterpret_cast(pOutputVector+32), V3 ); - pOutputVector += sizeof(XMFLOAT3)*4; - i += 4; - } - } - } - else - { - // Packed input, unpacked output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - vTemp = _mm_add_ps( vTemp, Offset ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - // Result 2 - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - vTemp = _mm_add_ps( vTemp, Offset ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - 
XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - // Result 3 - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - vTemp = _mm_add_ps( vTemp, Offset ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - // Result 4 - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - vTemp = _mm_add_ps( vTemp, Offset ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - - i += 4; - } - } - } - } - - for (; i < VectorCount; i++) - { - #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) - XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); - pInputVector += InputStride; - - XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - vTemp = _mm_div_ps( vTemp, W ); - - vTemp = _mm_mul_ps( vTemp, Scale ); - vTemp = _mm_add_ps( vTemp, Offset ); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); - pOutputVector += OutputStride; - } - - XM_SFENCE(); - - return pOutputStream; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector3Unproject -( - FXMVECTOR V, - float ViewportX, - float ViewportY, - float ViewportWidth, - float ViewportHeight, - float ViewportMinZ, - float ViewportMaxZ, - FXMMATRIX Projection, - CXMMATRIX View, - CXMMATRIX World -) -{ - static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; - - XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); - Scale = XMVectorReciprocal(Scale); - - XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); - Offset = XMVectorMultiplyAdd(Scale, Offset, D.v); - - XMMATRIX Transform = XMMatrixMultiply(World, View); - Transform = XMMatrixMultiply(Transform, Projection); - Transform = XMMatrixInverse(nullptr, Transform); - - XMVECTOR 
Result = XMVectorMultiplyAdd(V, Scale, Offset); - - return XMVector3TransformCoord(Result, Transform); -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream -( - XMFLOAT3* pOutputStream, - size_t OutputStride, - const XMFLOAT3* pInputStream, - size_t InputStride, - size_t VectorCount, - float ViewportX, - float ViewportY, - float ViewportWidth, - float ViewportHeight, - float ViewportMinZ, - float ViewportMaxZ, - FXMMATRIX Projection, - CXMMATRIX View, - CXMMATRIX World) -{ - assert(pOutputStream != nullptr); - assert(pInputStream != nullptr); - - assert(InputStride >= sizeof(XMFLOAT3)); - _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); - - assert(OutputStride >= sizeof(XMFLOAT3)); - _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); - -#if defined(_XM_NO_INTRINSICS_) - - static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; - - XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); - Scale = XMVectorReciprocal(Scale); - - XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); - Offset = XMVectorMultiplyAdd(Scale, Offset, D.v); - - XMMATRIX Transform = XMMatrixMultiply(World, View); - Transform = XMMatrixMultiply(Transform, Projection); - Transform = XMMatrixInverse(nullptr, Transform); - - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - for (size_t i = 0; i < VectorCount; i++) - { - XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); - - XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset); - - Result = XMVector3TransformCoord(Result, Transform); - - #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) - XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); - - pInputVector += InputStride; - pOutputVector += OutputStride; - } - - return pOutputStream; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMMATRIX Transform = XMMatrixMultiply(World, View); - Transform = XMMatrixMultiply(Transform, Projection); - Transform = XMMatrixInverse(nullptr, Transform); - - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - float sx = 1.f / (ViewportWidth * 0.5f); - float sy = 1.f / (-ViewportHeight * 0.5f); - float sz = 1.f / (ViewportMaxZ - ViewportMinZ); - - float ox = (-ViewportX * sx) - 1.f; - float oy = (-ViewportY * sy) + 1.f; - float oz = (-ViewportMinZ * sz); - - size_t i = 0; - size_t four = VectorCount >> 2; - if ( four > 0 ) - { - if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) - { - for (size_t j = 0; j < four; ++j) - { - float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); - pInputVector += sizeof(XMFLOAT3)*4; - - XMVECTOR ScaleX = vdupq_n_f32(sx); - XMVECTOR OffsetX = vdupq_n_f32(ox); - XMVECTOR VX = vmlaq_f32( OffsetX, ScaleX, V.val[0] ); - - float32x2_t r3 = vget_low_f32( Transform.r[3] ); - float32x2_t r = vget_low_f32( Transform.r[0] ); - XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), VX, r, 0 ); // Ax+M - XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), VX, r, 1 ); // Bx+N - - __prefetch( pInputVector ); - - r3 = vget_high_f32( Transform.r[3] ); - r = vget_high_f32( Transform.r[0] ); - XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), VX, r, 0 ); // Cx+O - XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), VX, r, 1 ); // Dx+P - - __prefetch( 
pInputVector+XM_CACHE_LINE_SIZE ); - - XMVECTOR ScaleY = vdupq_n_f32(sy); - XMVECTOR OffsetY = vdupq_n_f32(oy); - XMVECTOR VY = vmlaq_f32( OffsetY, ScaleY, V.val[1] ); - - r = vget_low_f32( Transform.r[1] ); - vResult0 = vmlaq_lane_f32( vResult0, VY, r, 0 ); // Ax+Ey+M - vResult1 = vmlaq_lane_f32( vResult1, VY, r, 1 ); // Bx+Fy+N - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); - - r = vget_high_f32( Transform.r[1] ); - vResult2 = vmlaq_lane_f32( vResult2, VY, r, 0 ); // Cx+Gy+O - W = vmlaq_lane_f32( W, VY, r, 1 ); // Dx+Hy+P - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); - - XMVECTOR ScaleZ = vdupq_n_f32(sz); - XMVECTOR OffsetZ = vdupq_n_f32(oz); - XMVECTOR VZ = vmlaq_f32( OffsetZ, ScaleZ, V.val[2] ); - - r = vget_low_f32( Transform.r[2] ); - vResult0 = vmlaq_lane_f32( vResult0, VZ, r, 0 ); // Ax+Ey+Iz+M - vResult1 = vmlaq_lane_f32( vResult1, VZ, r, 1 ); // Bx+Fy+Jz+N - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); - - r = vget_high_f32( Transform.r[2] ); - vResult2 = vmlaq_lane_f32( vResult2, VZ, r, 0 ); // Cx+Gy+Kz+O - W = vmlaq_lane_f32( W, VZ, r, 1 ); // Dx+Hy+Lz+P - - __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); - - // 2 iterations of Newton-Raphson refinement of reciprocal - float32x4_t Reciprocal = vrecpeq_f32(W); - float32x4_t S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - - V.val[0] = vmulq_f32( vResult0, Reciprocal ); - V.val[1] = vmulq_f32( vResult1, Reciprocal ); - V.val[2] = vmulq_f32( vResult2, Reciprocal ); - - vst3q_f32( reinterpret_cast(pOutputVector),V ); - pOutputVector += sizeof(XMFLOAT3)*4; - - i += 4; - } - } - } - - if (i < VectorCount) - { - float32x2_t ScaleL = vcreate_f32(((uint64_t)*(const uint32_t *)&sx) | ((uint64_t)(*(const uint32_t *)&sy) << 32)); - float32x2_t ScaleH = vcreate_f32((uint64_t)*(const uint32_t *)&sz); - - float32x2_t OffsetL = vcreate_f32(((uint64_t)*(const uint32_t *)&ox) | ((uint64_t)(*(const uint32_t *)&oy) << 32)); - float32x2_t OffsetH = vcreate_f32((uint64_t)*(const uint32_t *)&oz); - - for (; i < VectorCount; i++) - { - float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); - float32x2_t zero = vdup_n_f32(0); - float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); - pInputVector += InputStride; - - VL = vmla_f32( OffsetL, VL, ScaleL ); - VH = vmla_f32( OffsetH, VH, ScaleH ); - - XMVECTOR vResult = vmlaq_lane_f32( Transform.r[3], Transform.r[0], VL, 0 ); // X - vResult = vmlaq_lane_f32( vResult, Transform.r[1], VL, 1 ); // Y - vResult = vmlaq_lane_f32( vResult, Transform.r[2], VH, 0 ); // Z - - VH = vget_high_f32(vResult); - XMVECTOR W = vdupq_lane_f32( VH, 1 ); - - // 2 iterations of Newton-Raphson refinement of reciprocal for W - float32x4_t Reciprocal = vrecpeq_f32( W ); - float32x4_t S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - S = vrecpsq_f32( Reciprocal, W ); - Reciprocal = vmulq_f32( S, Reciprocal ); - - vResult = vmulq_f32( vResult, Reciprocal ); - - VL = vget_low_f32( vResult ); - vst1_f32( reinterpret_cast(pOutputVector), VL ); - vst1q_lane_f32( reinterpret_cast(pOutputVector)+2, vResult, 2 ); - pOutputVector += OutputStride; - } - } - - return pOutputStream; -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; - - XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); - Scale = XMVectorReciprocal(Scale); - - XMVECTOR Offset = 
XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); - Offset = _mm_mul_ps(Scale, Offset); - Offset = _mm_add_ps(Offset, D); - - XMMATRIX Transform = XMMatrixMultiply(World, View); - Transform = XMMatrixMultiply(Transform, Projection); - Transform = XMMatrixInverse(nullptr, Transform); - - const uint8_t* pInputVector = (const uint8_t*)pInputStream; - uint8_t* pOutputVector = (uint8_t*)pOutputStream; - - size_t i = 0; - size_t four = VectorCount >> 2; - if ( four > 0 ) - { - if (InputStride == sizeof(XMFLOAT3)) - { - if (OutputStride == sizeof(XMFLOAT3)) - { - if ( !((uintptr_t)pOutputStream & 0xF) ) - { - // Packed input, aligned & packed output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - V1 = _mm_mul_ps( V1, Scale ); - V1 = _mm_add_ps( V1, Offset ); - - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - V1 = _mm_div_ps( vTemp, W ); - - // Result 2 - V2 = _mm_mul_ps( V2, Scale ); - V2 = _mm_add_ps( V2, Offset ); - - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - V2 = _mm_div_ps( vTemp, W ); - - // Result 3 - V3 = _mm_mul_ps( V3, Scale ); - V3 = _mm_add_ps( V3, Offset ); - - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - V3 = _mm_div_ps( vTemp, W ); - - // Result 4 - V4 = _mm_mul_ps( V4, Scale ); - V4 = _mm_add_ps( V4, Offset ); - - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - V4 = _mm_div_ps( vTemp, W ); - - // Pack and store the vectors - XM3PACK4INTO3(vTemp); - XM_STREAM_PS( reinterpret_cast(pOutputVector), V1 ); - XM_STREAM_PS( 
reinterpret_cast(pOutputVector+16), vTemp ); - XM_STREAM_PS( reinterpret_cast(pOutputVector+32), V3 ); - pOutputVector += sizeof(XMFLOAT3)*4; - i += 4; - } - } - else - { - // Packed input, unaligned & packed output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - pInputVector += sizeof(XMFLOAT3)*4; - - // Unpack the 4 vectors (.w components are junk) - XM3UNPACK3INTO4(V1,L2,L3); - - // Result 1 - V1 = _mm_mul_ps( V1, Scale ); - V1 = _mm_add_ps( V1, Offset ); - - XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); - XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); - XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); - - XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); - XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - V1 = _mm_div_ps( vTemp, W ); - - // Result 2 - V2 = _mm_mul_ps( V2, Scale ); - V2 = _mm_add_ps( V2, Offset ); - - Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - V2 = _mm_div_ps( vTemp, W ); - - // Result 3 - V3 = _mm_mul_ps( V3, Scale ); - V3 = _mm_add_ps( V3, Offset ); - - Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - V3 = _mm_div_ps( vTemp, W ); - - // Result 4 - V4 = _mm_mul_ps( V4, Scale ); - V4 = _mm_add_ps( V4, Offset ); - - Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); - Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); - X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); - - vTemp = _mm_mul_ps( Z, Transform.r[2] ); - vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); - vTemp3 = _mm_mul_ps( X, Transform.r[0] ); - vTemp = _mm_add_ps( vTemp, Transform.r[3] ); - vTemp = _mm_add_ps( vTemp, vTemp2 ); - vTemp = _mm_add_ps( vTemp, vTemp3 ); - - W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); - V4 = _mm_div_ps( vTemp, W ); - - // Pack and store the vectors - XM3PACK4INTO3(vTemp); - _mm_storeu_ps( reinterpret_cast(pOutputVector), V1 ); - _mm_storeu_ps( reinterpret_cast(pOutputVector+16), vTemp ); - _mm_storeu_ps( reinterpret_cast(pOutputVector+32), V3 ); - pOutputVector += sizeof(XMFLOAT3)*4; - i += 4; - } - } - } - else - { - // Packed input, unpacked output - for (size_t j = 0; j < four; ++j) - { - __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); - __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); - __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); - 
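                    // The three unaligned loads above fetch 12 packed floats
                    // (four XMFLOAT3s); XM3UNPACK3INTO4 below shuffles them
                    // into four XMVECTORs V1..V4 whose .w lanes are junk,
                    // which is harmless since only the x/y/z lanes feed the
                    // viewport un-mapping and the inverse transform.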
-                pInputVector += sizeof(XMFLOAT3)*4;
-
-                // Unpack the 4 vectors (.w components are junk)
-                XM3UNPACK3INTO4(V1,L2,L3);
-
-                // Result 1
-                V1 = _mm_mul_ps( V1, Scale );
-                V1 = _mm_add_ps( V1, Offset );
-
-                XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) );
-                XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) );
-                XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) );
-
-                XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] );
-                XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
-                XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] );
-                vTemp = _mm_add_ps( vTemp, Transform.r[3] );
-                vTemp = _mm_add_ps( vTemp, vTemp2 );
-                vTemp = _mm_add_ps( vTemp, vTemp3 );
-
-                XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-                vTemp = _mm_div_ps( vTemp, W );
-
-                #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" )
-                XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
-                pOutputVector += OutputStride;
-
-                // Result 2
-                V2 = _mm_mul_ps( V2, Scale );
-                V2 = _mm_add_ps( V2, Offset );
-
-                Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) );
-                Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) );
-                X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) );
-
-                vTemp = _mm_mul_ps( Z, Transform.r[2] );
-                vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
-                vTemp3 = _mm_mul_ps( X, Transform.r[0] );
-                vTemp = _mm_add_ps( vTemp, Transform.r[3] );
-                vTemp = _mm_add_ps( vTemp, vTemp2 );
-                vTemp = _mm_add_ps( vTemp, vTemp3 );
-
-                W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-                vTemp = _mm_div_ps( vTemp, W );
-
-                #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" )
-                XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
-                pOutputVector += OutputStride;
-
-                // Result 3
-                V3 = _mm_mul_ps( V3, Scale );
-                V3 = _mm_add_ps( V3, Offset );
-
-                Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) );
-                Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) );
-                X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) );
-
-                vTemp = _mm_mul_ps( Z, Transform.r[2] );
-                vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
-                vTemp3 = _mm_mul_ps( X, Transform.r[0] );
-                vTemp = _mm_add_ps( vTemp, Transform.r[3] );
-                vTemp = _mm_add_ps( vTemp, vTemp2 );
-                vTemp = _mm_add_ps( vTemp, vTemp3 );
-
-                W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-                vTemp = _mm_div_ps( vTemp, W );
-
-                #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" )
-                XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
-                pOutputVector += OutputStride;
-
-                // Result 4
-                V4 = _mm_mul_ps( V4, Scale );
-                V4 = _mm_add_ps( V4, Offset );
-
-                Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) );
-                Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) );
-                X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) );
-
-                vTemp = _mm_mul_ps( Z, Transform.r[2] );
-                vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
-                vTemp3 = _mm_mul_ps( X, Transform.r[0] );
-                vTemp = _mm_add_ps( vTemp, Transform.r[3] );
-                vTemp = _mm_add_ps( vTemp, vTemp2 );
-                vTemp = _mm_add_ps( vTemp, vTemp3 );
-
-                W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-                vTemp = _mm_div_ps( vTemp, W );
-
-                #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" )
-                XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
-                pOutputVector += OutputStride;
-
-                i += 4;
-            }
-        }
-    }
-    }
-
-    for (; i < VectorCount; i++)
-    {
-        #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" )
-        XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
-        pInputVector += InputStride;
-
-        V = _mm_mul_ps( V, Scale );
-        V = _mm_add_ps( V, Offset );
-
-        XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
-        XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
-        XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
-
-        XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] );
-        XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
-        XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] );
-        vTemp = _mm_add_ps( vTemp, Transform.r[3] );
-        vTemp = _mm_add_ps( vTemp, vTemp2 );
-        vTemp = _mm_add_ps( vTemp, vTemp3 );
-
-        XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
-        vTemp = _mm_div_ps( vTemp, W );
-
-        #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" )
-        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
-        pOutputVector += OutputStride;
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#endif
-}
-
-/****************************************************************************
- *
- * 4D Vector
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-// Comparison operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector4Equal
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2]) && (V1.vector4_f32[3] == V2.vector4_f32[3])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vceqq_f32( V1, V2 );
-    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
-    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
-    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
-    return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
-#else
-    return XMComparisonAllTrue(XMVector4EqualR(V1, V2));
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline uint32_t XM_CALLCONV XMVector4EqualR
-(
-    FXMVECTOR V1,
-    FXMVECTOR V2
-)
-{
-#if defined(_XM_NO_INTRINSICS_)
-
-    uint32_t CR = 0;
-
-    if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
-        (V1.vector4_f32[1] == V2.vector4_f32[1]) &&
-        (V1.vector4_f32[2] == V2.vector4_f32[2]) &&
-        (V1.vector4_f32[3] == V2.vector4_f32[3]))
-    {
-        CR = XM_CRMASK_CR6TRUE;
-    }
-    else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
-             (V1.vector4_f32[1] != V2.vector4_f32[1]) &&
-             (V1.vector4_f32[2] != V2.vector4_f32[2]) &&
-             (V1.vector4_f32[3] != V2.vector4_f32[3]))
-    {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vceqq_f32( V1, V2 );
-    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
-    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
-    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
-
-    uint32_t CR = 0;
-    if ( r == 0xFFFFFFFFU )
-    {
-        CR = XM_CRMASK_CR6TRUE;
-    }
-    else if ( !r )
-    {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
-    int iTest = _mm_movemask_ps(vTemp);
-    uint32_t CR = 0;
-    if (iTest==0xf)     // All equal?
-    {
-        CR = XM_CRMASK_CR6TRUE;
-    }
-    else if (iTest==0)  // All not equal?
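-    // CR6-style record: XM_CRMASK_CR6TRUE when all four lanes compare equal,
-    // XM_CRMASK_CR6FALSE when none do, and 0 for a partial match. Callers decode
-    // it with XMComparisonAllTrue / XMComparisonAllFalse rather than testing bits.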
- { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector4EqualInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2]) && (V1.vector4_u32[3] == V2.vector4_u32[3])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_u32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); - return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp))==0xf) != 0); -#else - return XMComparisonAllTrue(XMVector4EqualIntR(V1, V2)); -#endif -} - -//------------------------------------------------------------------------------ - -inline uint32_t XM_CALLCONV XMVector4EqualIntR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - uint32_t CR = 0; - if (V1.vector4_u32[0] == V2.vector4_u32[0] && - V1.vector4_u32[1] == V2.vector4_u32[1] && - V1.vector4_u32[2] == V2.vector4_u32[2] && - V1.vector4_u32[3] == V2.vector4_u32[3]) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (V1.vector4_u32[0] != V2.vector4_u32[0] && - V1.vector4_u32[1] != V2.vector4_u32[1] && - V1.vector4_u32[2] != V2.vector4_u32[2] && - V1.vector4_u32[3] != V2.vector4_u32[3]) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_u32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - - uint32_t CR = 0; - if ( r == 0xFFFFFFFFU ) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_SSE_INTRINSICS_) - __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); - int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp)); - uint32_t CR = 0; - if (iTest==0xf) // All equal? - { - CR = XM_CRMASK_CR6TRUE; - } - else if (iTest==0) // All not equal? 
- { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -inline bool XM_CALLCONV XMVector4NearEqual -( - FXMVECTOR V1, - FXMVECTOR V2, - FXMVECTOR Epsilon -) -{ -#if defined(_XM_NO_INTRINSICS_) - float dx, dy, dz, dw; - - dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]); - dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]); - dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]); - dw = fabsf(V1.vector4_f32[3]-V2.vector4_f32[3]); - return (((dx <= Epsilon.vector4_f32[0]) && - (dy <= Epsilon.vector4_f32[1]) && - (dz <= Epsilon.vector4_f32[2]) && - (dw <= Epsilon.vector4_f32[3])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t vDelta = vsubq_f32( V1, V2 ); - uint32x4_t vResult = vacleq_f32( vDelta, Epsilon ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - // Get the difference - XMVECTOR vDelta = _mm_sub_ps(V1,V2); - // Get the absolute value of the difference - XMVECTOR vTemp = _mm_setzero_ps(); - vTemp = _mm_sub_ps(vTemp,vDelta); - vTemp = _mm_max_ps(vTemp,vDelta); - vTemp = _mm_cmple_ps(vTemp,Epsilon); - return ((_mm_movemask_ps(vTemp)==0xf) != 0); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector4NotEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2]) || (V1.vector4_f32[3] != V2.vector4_f32[3])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpneq_ps(V1,V2); - return ((_mm_movemask_ps(vTemp)) != 0); -#else - return XMComparisonAnyFalse(XMVector4EqualR(V1, V2)); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector4NotEqualInt -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2]) || (V1.vector4_u32[3] != V2.vector4_u32[3])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vceqq_u32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); - return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp))!=0xF) != 0); -#else - return XMComparisonAnyFalse(XMVector4EqualIntR(V1, V2)); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector4Greater -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2]) && (V1.vector4_f32[3] > V2.vector4_f32[3])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcgtq_f32( V1, V2 ); - int8x8x2_t vTemp = 
vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); - return ((_mm_movemask_ps(vTemp)==0x0f) != 0); -#else - return XMComparisonAllTrue(XMVector4GreaterR(V1, V2)); -#endif -} - -//------------------------------------------------------------------------------ - -inline uint32_t XM_CALLCONV XMVector4GreaterR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - uint32_t CR = 0; - if (V1.vector4_f32[0] > V2.vector4_f32[0] && - V1.vector4_f32[1] > V2.vector4_f32[1] && - V1.vector4_f32[2] > V2.vector4_f32[2] && - V1.vector4_f32[3] > V2.vector4_f32[3]) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (V1.vector4_f32[0] <= V2.vector4_f32[0] && - V1.vector4_f32[1] <= V2.vector4_f32[1] && - V1.vector4_f32[2] <= V2.vector4_f32[2] && - V1.vector4_f32[3] <= V2.vector4_f32[3]) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcgtq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - - uint32_t CR = 0; - if ( r == 0xFFFFFFFFU ) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_SSE_INTRINSICS_) - uint32_t CR = 0; - XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); - int iTest = _mm_movemask_ps(vTemp); - if (iTest==0xf) { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector4GreaterOrEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2]) && (V1.vector4_f32[3] >= V2.vector4_f32[3])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcgeq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); - return ((_mm_movemask_ps(vTemp)==0x0f) != 0); -#else - return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V1, V2)); -#endif -} - -//------------------------------------------------------------------------------ - -inline uint32_t XM_CALLCONV XMVector4GreaterOrEqualR -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - uint32_t CR = 0; - if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && - (V1.vector4_f32[1] >= V2.vector4_f32[1]) && - (V1.vector4_f32[2] >= V2.vector4_f32[2]) && - (V1.vector4_f32[3] >= V2.vector4_f32[3])) - { - CR = XM_CRMASK_CR6TRUE; - } - else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && - (V1.vector4_f32[1] < V2.vector4_f32[1]) && - (V1.vector4_f32[2] < V2.vector4_f32[2]) && - (V1.vector4_f32[3] < V2.vector4_f32[3])) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcgeq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - uint32_t r = vget_lane_u32(vTemp.val[1], 1); - - uint32_t CR = 0; - if ( r == 0xFFFFFFFFU ) 
- { - CR = XM_CRMASK_CR6TRUE; - } - else if ( !r ) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#elif defined(_XM_SSE_INTRINSICS_) - uint32_t CR = 0; - XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); - int iTest = _mm_movemask_ps(vTemp); - if (iTest==0x0f) - { - CR = XM_CRMASK_CR6TRUE; - } - else if (!iTest) - { - CR = XM_CRMASK_CR6FALSE; - } - return CR; -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector4Less -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2]) && (V1.vector4_f32[3] < V2.vector4_f32[3])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcltq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); - return ((_mm_movemask_ps(vTemp)==0x0f) != 0); -#else - return XMComparisonAllTrue(XMVector4GreaterR(V2, V1)); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector4LessOrEqual -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2]) && (V1.vector4_f32[3] <= V2.vector4_f32[3])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vResult = vcleq_f32( V1, V2 ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp = _mm_cmple_ps(V1,V2); - return ((_mm_movemask_ps(vTemp)==0x0f) != 0); -#else - return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V2, V1)); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector4InBounds -( - FXMVECTOR V, - FXMVECTOR Bounds -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && - (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && - (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) && - (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3])) != 0); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Test if less than or equal - uint32x4_t ivTemp1 = vcleq_f32(V,Bounds); - // Negate the bounds - float32x4_t vTemp2 = vnegq_f32(Bounds); - // Test if greater or equal (Reversed) - uint32x4_t ivTemp2 = vcleq_f32(vTemp2,V); - // Blend answers - ivTemp1 = vandq_u32(ivTemp1,ivTemp2); - // in bounds? - int8x8x2_t vTemp = vzip_u8(vget_low_u8(ivTemp1), vget_high_u8(ivTemp1)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - // Test if less than or equal - XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); - // Negate the bounds - XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); - // Test if greater or equal (Reversed) - vTemp2 = _mm_cmple_ps(vTemp2,V); - // Blend answers - vTemp1 = _mm_and_ps(vTemp1,vTemp2); - // All in bounds? 
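-    // _mm_movemask_ps packs the four lane sign bits into an int, so 0x0f here
-    // means every component passed both the V <= Bounds and -Bounds <= V tests.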
- return ((_mm_movemask_ps(vTemp1)==0x0f) != 0); -#else - return XMComparisonAllInBounds(XMVector4InBoundsR(V, Bounds)); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector4IsNaN -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - return (XMISNAN(V.vector4_f32[0]) || - XMISNAN(V.vector4_f32[1]) || - XMISNAN(V.vector4_f32[2]) || - XMISNAN(V.vector4_f32[3])); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Test against itself. NaN is always not equal - uint32x4_t vTempNan = vceqq_f32( V, V ); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempNan), vget_high_u8(vTempNan)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - // If any are NaN, the mask is zero - return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU ); -#elif defined(_XM_SSE_INTRINSICS_) - // Test against itself. NaN is always not equal - XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); - // If any are NaN, the mask is non-zero - return (_mm_movemask_ps(vTempNan)!=0); -#endif -} - -//------------------------------------------------------------------------------ - -inline bool XM_CALLCONV XMVector4IsInfinite -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - return (XMISINF(V.vector4_f32[0]) || - XMISINF(V.vector4_f32[1]) || - XMISINF(V.vector4_f32[2]) || - XMISINF(V.vector4_f32[3])); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Mask off the sign bit - uint32x4_t vTempInf = vandq_u32( V, g_XMAbsMask ); - // Compare to infinity - vTempInf = vceqq_f32(vTempInf, g_XMInfinity ); - // If any are infinity, the signs are true. - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempInf), vget_high_u8(vTempInf)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - return ( vget_lane_u32(vTemp.val[1], 1) != 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - // Mask off the sign bit - XMVECTOR vTemp = _mm_and_ps(V,g_XMAbsMask); - // Compare to infinity - vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); - // If any are infinity, the signs are true. 
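-    // Clearing the sign bit first folds -INF onto +INF, so a single equality
-    // test against g_XMInfinity catches both; any set bit in the movemask
-    // therefore flags an infinite component.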
- return (_mm_movemask_ps(vTemp) != 0); -#endif -} - -//------------------------------------------------------------------------------ -// Computation operations -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4Dot -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = - Result.vector4_f32[1] = - Result.vector4_f32[2] = - Result.vector4_f32[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2] + V1.vector4_f32[3] * V2.vector4_f32[3]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t vTemp = vmulq_f32( V1, V2 ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vadd_f32( v1, v2 ); - v1 = vpadd_f32( v1, v1 ); - return vcombine_f32( v1, v1 ); -#elif defined(_XM_SSE4_INTRINSICS_) - return _mm_dp_ps( V1, V2, 0xff ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vTemp = _mm_mul_ps(V1, V2); - vTemp = _mm_hadd_ps(vTemp, vTemp); - return _mm_hadd_ps(vTemp, vTemp); -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR vTemp2 = V2; - XMVECTOR vTemp = _mm_mul_ps(V1,vTemp2); - vTemp2 = _mm_shuffle_ps(vTemp2,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position - vTemp2 = _mm_add_ps(vTemp2,vTemp); // Add Z = X+Z; W = Y+W; - vTemp = _mm_shuffle_ps(vTemp,vTemp2,_MM_SHUFFLE(0,3,0,0)); // Copy W to the Z position - vTemp = _mm_add_ps(vTemp,vTemp2); // Add Z and W together - return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(2,2,2,2)); // Splat Z and return -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4Cross -( - FXMVECTOR V1, - FXMVECTOR V2, - FXMVECTOR V3 -) -{ - // [ ((v2.z*v3.w-v2.w*v3.z)*v1.y)-((v2.y*v3.w-v2.w*v3.y)*v1.z)+((v2.y*v3.z-v2.z*v3.y)*v1.w), - // ((v2.w*v3.z-v2.z*v3.w)*v1.x)-((v2.w*v3.x-v2.x*v3.w)*v1.z)+((v2.z*v3.x-v2.x*v3.z)*v1.w), - // ((v2.y*v3.w-v2.w*v3.y)*v1.x)-((v2.x*v3.w-v2.w*v3.x)*v1.y)+((v2.x*v3.y-v2.y*v3.x)*v1.w), - // ((v2.z*v3.y-v2.y*v3.z)*v1.x)-((v2.z*v3.x-v2.x*v3.z)*v1.y)+((v2.y*v3.x-v2.x*v3.y)*v1.z) ] - -#if defined(_XM_NO_INTRINSICS_) - XMVECTOR Result; - - Result.vector4_f32[0] = (((V2.vector4_f32[2]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[2]))*V1.vector4_f32[1])-(((V2.vector4_f32[1]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[1]))*V1.vector4_f32[2])+(((V2.vector4_f32[1]*V3.vector4_f32[2])-(V2.vector4_f32[2]*V3.vector4_f32[1]))*V1.vector4_f32[3]); - Result.vector4_f32[1] = (((V2.vector4_f32[3]*V3.vector4_f32[2])-(V2.vector4_f32[2]*V3.vector4_f32[3]))*V1.vector4_f32[0])-(((V2.vector4_f32[3]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[3]))*V1.vector4_f32[2])+(((V2.vector4_f32[2]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[2]))*V1.vector4_f32[3]); - Result.vector4_f32[2] = (((V2.vector4_f32[1]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[1]))*V1.vector4_f32[0])-(((V2.vector4_f32[0]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[0]))*V1.vector4_f32[1])+(((V2.vector4_f32[0]*V3.vector4_f32[1])-(V2.vector4_f32[1]*V3.vector4_f32[0]))*V1.vector4_f32[3]); - Result.vector4_f32[3] = 
(((V2.vector4_f32[2]*V3.vector4_f32[1])-(V2.vector4_f32[1]*V3.vector4_f32[2]))*V1.vector4_f32[0])-(((V2.vector4_f32[2]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[2]))*V1.vector4_f32[1])+(((V2.vector4_f32[1]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[1]))*V1.vector4_f32[2]); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - const float32x2_t select = vget_low_f32( g_XMMaskX ); - - // Term1: V2zwyz * V3wzwy - const float32x2_t v2xy = vget_low_f32(V2); - const float32x2_t v2zw = vget_high_f32(V2); - const float32x2_t v2yx = vrev64_f32(v2xy); - const float32x2_t v2wz = vrev64_f32(v2zw); - const float32x2_t v2yz = vbsl_f32( select, v2yx, v2wz ); - - const float32x2_t v3zw = vget_high_f32(V3); - const float32x2_t v3wz = vrev64_f32(v3zw); - const float32x2_t v3xy = vget_low_f32(V3); - const float32x2_t v3wy = vbsl_f32( select, v3wz, v3xy ); - - float32x4_t vTemp1 = vcombine_f32(v2zw,v2yz); - float32x4_t vTemp2 = vcombine_f32(v3wz,v3wy); - XMVECTOR vResult = vmulq_f32( vTemp1, vTemp2 ); - - // - V2wzwy * V3zwyz - const float32x2_t v2wy = vbsl_f32( select, v2wz, v2xy ); - - const float32x2_t v3yx = vrev64_f32(v3xy); - const float32x2_t v3yz = vbsl_f32( select, v3yx, v3wz ); - - vTemp1 = vcombine_f32(v2wz,v2wy); - vTemp2 = vcombine_f32(v3zw,v3yz); - vResult = vmlsq_f32( vResult, vTemp1, vTemp2 ); - - // term1 * V1yxxx - const float32x2_t v1xy = vget_low_f32(V1); - const float32x2_t v1yx = vrev64_f32(v1xy); - - vTemp1 = vcombine_f32( v1yx, vdup_lane_f32( v1yx, 1 ) ); - vResult = vmulq_f32( vResult, vTemp1 ); - - // Term2: V2ywxz * V3wxwx - const float32x2_t v2yw = vrev64_f32(v2wy); - const float32x2_t v2xz = vbsl_f32( select, v2xy, v2wz ); - - const float32x2_t v3wx = vbsl_f32( select, v3wz, v3yx ); - - vTemp1 = vcombine_f32(v2yw,v2xz); - vTemp2 = vcombine_f32(v3wx,v3wx); - float32x4_t vTerm = vmulq_f32( vTemp1, vTemp2 ); - - // - V2wxwx * V3ywxz - const float32x2_t v2wx = vbsl_f32( select, v2wz, v2yx ); - - const float32x2_t v3yw = vrev64_f32(v3wy); - const float32x2_t v3xz = vbsl_f32( select, v3xy, v3wz ); - - vTemp1 = vcombine_f32(v2wx,v2wx); - vTemp2 = vcombine_f32(v3yw,v3xz); - vTerm = vmlsq_f32( vTerm, vTemp1, vTemp2 ); - - // vResult - term2 * V1zzyy - const float32x2_t v1zw = vget_high_f32(V1); - - vTemp1 = vcombine_f32( vdup_lane_f32(v1zw, 0), vdup_lane_f32(v1yx, 0) ); - vResult = vmlsq_f32( vResult, vTerm, vTemp1 ); - - // Term3: V2yzxy * V3zxyx - const float32x2_t v3zx = vrev64_f32(v3xz); - - vTemp1 = vcombine_f32(v2yz,v2xy); - vTemp2 = vcombine_f32(v3zx,v3yx); - vTerm = vmulq_f32( vTemp1, vTemp2 ); - - // - V2zxyx * V3yzxy - const float32x2_t v2zx = vrev64_f32(v2xz); - - vTemp1 = vcombine_f32(v2zx,v2yx); - vTemp2 = vcombine_f32(v3yz,v3xy); - vTerm = vmlsq_f32( vTerm, vTemp1, vTemp2 ); - - // vResult + term3 * V1wwwz - const float32x2_t v1wz = vrev64_f32(v1zw); - - vTemp1 = vcombine_f32( vdup_lane_f32( v1wz, 0 ), v1wz ); - return vmlaq_f32( vResult, vTerm, vTemp1 ); -#elif defined(_XM_SSE_INTRINSICS_) - // V2zwyz * V3wzwy - XMVECTOR vResult = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,1,3,2)); - XMVECTOR vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(1,3,2,3)); - vResult = _mm_mul_ps(vResult,vTemp3); - // - V2wzwy * V3zwyz - XMVECTOR vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,3,2,3)); - vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(1,3,0,1)); - vTemp2 = _mm_mul_ps(vTemp2,vTemp3); - vResult = _mm_sub_ps(vResult,vTemp2); - // term1 * V1yxxx - XMVECTOR vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(0,0,0,1)); - vResult = _mm_mul_ps(vResult,vTemp1); - - // V2ywxz * V3wxwx - vTemp2 = 
XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,0,3,1)); - vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(0,3,0,3)); - vTemp3 = _mm_mul_ps(vTemp3,vTemp2); - // - V2wxwx * V3ywxz - vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(2,1,2,1)); - vTemp1 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(2,0,3,1)); - vTemp2 = _mm_mul_ps(vTemp2,vTemp1); - vTemp3 = _mm_sub_ps(vTemp3,vTemp2); - // vResult - temp * V1zzyy - vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(1,1,2,2)); - vTemp1 = _mm_mul_ps(vTemp1,vTemp3); - vResult = _mm_sub_ps(vResult,vTemp1); - - // V2yzxy * V3zxyx - vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,0,2,1)); - vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(0,1,0,2)); - vTemp3 = _mm_mul_ps(vTemp3,vTemp2); - // - V2zxyx * V3yzxy - vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(2,0,2,1)); - vTemp1 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(1,0,2,1)); - vTemp1 = _mm_mul_ps(vTemp1,vTemp2); - vTemp3 = _mm_sub_ps(vTemp3,vTemp1); - // vResult + term * V1wwwz - vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(2,3,3,3)); - vTemp3 = _mm_mul_ps(vTemp3,vTemp1); - vResult = _mm_add_ps(vResult,vTemp3); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4LengthSq -( - FXMVECTOR V -) -{ - return XMVector4Dot(V, V); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - - Result = XMVector4LengthSq(V); - Result = XMVectorReciprocalSqrtEst(Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot4 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vadd_f32( v1, v2 ); - v1 = vpadd_f32( v1, v1 ); - // Reciprocal sqrt (estimate) - v2 = vrsqrte_f32( v1 ); - return vcombine_f32(v2, v2); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); - return _mm_rsqrt_ps( vTemp ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_rsqrt_ps(vLengthSq); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y,z and w - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has z and w - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); - // x+z, y+w - vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // x+z,x+z,x+z,y+w - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); - // ??,??,y+w,y+w - vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); - // ??,??,x+z+y+w,?? 
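-    // Worked example of the ladder above (illustrative, not from the original
-    // source): for V = (1,2,3,4) the squared lanes are (1,4,9,16); after the
-    // add below, lane z holds x+z+y+w = 30 = dot(V,V), which the splat copies
-    // to all lanes before the ~12-bit _mm_rsqrt_ps estimate inverts the length.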
- vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // Splat the length - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); - // Get the reciprocal - vLengthSq = _mm_rsqrt_ps(vLengthSq); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - - Result = XMVector4LengthSq(V); - Result = XMVectorReciprocalSqrt(Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot4 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vadd_f32( v1, v2 ); - v1 = vpadd_f32( v1, v1 ); - // Reciprocal sqrt - float32x2_t S0 = vrsqrte_f32(v1); - float32x2_t P0 = vmul_f32( v1, S0 ); - float32x2_t R0 = vrsqrts_f32( P0, S0 ); - float32x2_t S1 = vmul_f32( S0, R0 ); - float32x2_t P1 = vmul_f32( v1, S1 ); - float32x2_t R1 = vrsqrts_f32( P1, S1 ); - float32x2_t Result = vmul_f32( S1, R1 ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); - XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); - return _mm_div_ps( g_XMOne, vLengthSq ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_sqrt_ps(vLengthSq); - vLengthSq = _mm_div_ps(g_XMOne, vLengthSq); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y,z and w - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has z and w - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); - // x+z, y+w - vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // x+z,x+z,x+z,y+w - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); - // ??,??,y+w,y+w - vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); - // ??,??,x+z+y+w,?? - vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // Splat the length - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); - // Get the reciprocal - vLengthSq = _mm_sqrt_ps(vLengthSq); - // Accurate! 
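-    // Full precision: a real sqrtps followed by a true divide, unlike the Est
-    // variant above, which stops at the ~12-bit _mm_rsqrt_ps approximation.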
- vLengthSq = _mm_div_ps(g_XMOne,vLengthSq); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4LengthEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - - Result = XMVector4LengthSq(V); - Result = XMVectorSqrtEst(Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot4 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vadd_f32( v1, v2 ); - v1 = vpadd_f32( v1, v1 ); - const float32x2_t zero = vdup_n_f32(0); - uint32x2_t VEqualsZero = vceq_f32( v1, zero ); - // Sqrt (estimate) - float32x2_t Result = vrsqrte_f32( v1 ); - Result = vmul_f32( v1, Result ); - Result = vbsl_f32( VEqualsZero, zero, Result ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); - return _mm_sqrt_ps( vTemp ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_sqrt_ps(vLengthSq); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y,z and w - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has z and w - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); - // x+z, y+w - vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // x+z,x+z,x+z,y+w - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); - // ??,??,y+w,y+w - vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); - // ??,??,x+z+y+w,?? - vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // Splat the length - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); - // Get the length - vLengthSq = _mm_sqrt_ps(vLengthSq); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4Length -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - - Result = XMVector4LengthSq(V); - Result = XMVectorSqrt(Result); - - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot4 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vadd_f32( v1, v2 ); - v1 = vpadd_f32( v1, v1 ); - const float32x2_t zero = vdup_n_f32(0); - uint32x2_t VEqualsZero = vceq_f32( v1, zero ); - // Sqrt - float32x2_t S0 = vrsqrte_f32( v1 ); - float32x2_t P0 = vmul_f32( v1, S0 ); - float32x2_t R0 = vrsqrts_f32( P0, S0 ); - float32x2_t S1 = vmul_f32( S0, R0 ); - float32x2_t P1 = vmul_f32( v1, S1 ); - float32x2_t R1 = vrsqrts_f32( P1, S1 ); - float32x2_t Result = vmul_f32( S1, R1 ); - Result = vmul_f32( v1, Result ); - Result = vbsl_f32( VEqualsZero, zero, Result ); - return vcombine_f32( Result, Result ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); - return _mm_sqrt_ps( vTemp ); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_sqrt_ps(vLengthSq); - return vLengthSq; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y,z and w - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has z and w - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); - // x+z, y+w - vLengthSq = _mm_add_ps(vLengthSq,vTemp); 
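-    // The shuffle/add ladder below mirrors the other 4D length routines: two
-    // shuffles and two adds leave dot(V,V) in lane z, which is then splatted
-    // and square-rooted.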
- // x+z,x+z,x+z,y+w - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); - // ??,??,y+w,y+w - vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); - // ??,??,x+z+y+w,?? - vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // Splat the length - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); - // Get the length - vLengthSq = _mm_sqrt_ps(vLengthSq); - return vLengthSq; -#endif -} - -//------------------------------------------------------------------------------ -// XMVector4NormalizeEst uses a reciprocal estimate and -// returns QNaN on zero and infinite vectors. - -inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result = XMVector4ReciprocalLength(V); - Result = XMVectorMultiply(V, Result); - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot4 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vadd_f32( v1, v2 ); - v1 = vpadd_f32( v1, v1 ); - // Reciprocal sqrt (estimate) - v2 = vrsqrte_f32( v1 ); - // Normalize - return vmulq_f32( V, vcombine_f32(v2,v2) ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); - XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); - return _mm_mul_ps(vResult, V); -#elif defined(_XM_SSE3_INTRINSICS_) - XMVECTOR vDot = _mm_mul_ps(V, V); - vDot = _mm_hadd_ps(vDot, vDot); - vDot = _mm_hadd_ps(vDot, vDot); - vDot = _mm_rsqrt_ps(vDot); - vDot = _mm_mul_ps(vDot, V); - return vDot; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y,z and w - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has z and w - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); - // x+z, y+w - vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // x+z,x+z,x+z,y+w - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); - // ??,??,y+w,y+w - vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); - // ??,??,x+z+y+w,?? 
- vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // Splat the length - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); - // Get the reciprocal - XMVECTOR vResult = _mm_rsqrt_ps(vLengthSq); - // Reciprocal mul to perform the normalization - vResult = _mm_mul_ps(vResult,V); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4Normalize -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - float fLength; - XMVECTOR vResult; - - vResult = XMVector4Length( V ); - fLength = vResult.vector4_f32[0]; - - // Prevent divide by zero - if (fLength > 0) { - fLength = 1.0f/fLength; - } - - vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; - vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; - vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; - vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; - return vResult; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - // Dot4 - float32x4_t vTemp = vmulq_f32( V, V ); - float32x2_t v1 = vget_low_f32( vTemp ); - float32x2_t v2 = vget_high_f32( vTemp ); - v1 = vadd_f32( v1, v2 ); - v1 = vpadd_f32( v1, v1 ); - uint32x2_t VEqualsZero = vceq_f32( v1, vdup_n_f32(0) ); - uint32x2_t VEqualsInf = vceq_f32( v1, vget_low_f32(g_XMInfinity) ); - // Reciprocal sqrt (2 iterations of Newton-Raphson) - float32x2_t S0 = vrsqrte_f32( v1 ); - float32x2_t P0 = vmul_f32( v1, S0 ); - float32x2_t R0 = vrsqrts_f32( P0, S0 ); - float32x2_t S1 = vmul_f32( S0, R0 ); - float32x2_t P1 = vmul_f32( v1, S1 ); - float32x2_t R1 = vrsqrts_f32( P1, S1 ); - v2 = vmul_f32( S1, R1 ); - // Normalize - XMVECTOR vResult = vmulq_f32( V, vcombine_f32(v2,v2) ); - vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult ); - return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult ); -#elif defined(_XM_SSE4_INTRINSICS_) - XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0xff ); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Divide to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -#elif defined(_XM_SSE3_INTRINSICS_) - // Perform the dot product on x,y,z and w - XMVECTOR vLengthSq = _mm_mul_ps(V, V); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Divide to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vZeroMask); - // Select qnan or 
result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - // Perform the dot product on x,y,z and w - XMVECTOR vLengthSq = _mm_mul_ps(V,V); - // vTemp has z and w - XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); - // x+z, y+w - vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // x+z,x+z,x+z,y+w - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); - // ??,??,y+w,y+w - vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); - // ??,??,x+z+y+w,?? - vLengthSq = _mm_add_ps(vLengthSq,vTemp); - // Splat the length - vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); - // Prepare for the division - XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); - // Create zero with a single instruction - XMVECTOR vZeroMask = _mm_setzero_ps(); - // Test for a divide by zero (Must be FP to detect -0.0) - vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); - // Failsafe on zero (Or epsilon) length planes - // If the length is infinity, set the elements to zero - vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); - // Divide to perform the normalization - vResult = _mm_div_ps(V,vResult); - // Any that are infinity, set to zero - vResult = _mm_and_ps(vResult,vZeroMask); - // Select qnan or result based on infinite length - XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); - XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); - vResult = _mm_or_ps(vTemp1,vTemp2); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4ClampLength -( - FXMVECTOR V, - float LengthMin, - float LengthMax -) -{ - XMVECTOR ClampMax = XMVectorReplicate(LengthMax); - XMVECTOR ClampMin = XMVectorReplicate(LengthMin); - - return XMVector4ClampLengthV(V, ClampMin, ClampMax); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4ClampLengthV -( - FXMVECTOR V, - FXMVECTOR LengthMin, - FXMVECTOR LengthMax -) -{ - assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetW(LengthMin) == XMVectorGetX(LengthMin))); - assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetW(LengthMax) == XMVectorGetX(LengthMax))); - assert(XMVector4GreaterOrEqual(LengthMin, XMVectorZero())); - assert(XMVector4GreaterOrEqual(LengthMax, XMVectorZero())); - assert(XMVector4GreaterOrEqual(LengthMax, LengthMin)); - - XMVECTOR LengthSq = XMVector4LengthSq(V); - - const XMVECTOR Zero = XMVectorZero(); - - XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); - - XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); - XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); - - XMVECTOR Normal = XMVectorMultiply(V, RcpLength); - - XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); - - XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); - Length = XMVectorSelect(LengthSq, Length, Select); - Normal = XMVectorSelect(LengthSq, Normal, Select); - - XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); - XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); - - XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); - ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); - - XMVECTOR Result = XMVectorMultiply(Normal, 
ClampLength); - - // Preserve the original vector (with no precision loss) if the length falls within the given range - XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); - Result = XMVectorSelect(Result, V, Control); - - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4Reflect -( - FXMVECTOR Incident, - FXMVECTOR Normal -) -{ - // Result = Incident - (2 * dot(Incident, Normal)) * Normal - - XMVECTOR Result = XMVector4Dot(Incident, Normal); - Result = XMVectorAdd(Result, Result); - Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); - - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4Refract -( - FXMVECTOR Incident, - FXMVECTOR Normal, - float RefractionIndex -) -{ - XMVECTOR Index = XMVectorReplicate(RefractionIndex); - return XMVector4RefractV(Incident, Normal, Index); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4RefractV -( - FXMVECTOR Incident, - FXMVECTOR Normal, - FXMVECTOR RefractionIndex -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR IDotN; - XMVECTOR R; - const XMVECTOR Zero = XMVectorZero(); - - // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + - // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) - - IDotN = XMVector4Dot(Incident, Normal); - - // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) - R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); - R = XMVectorMultiply(R, RefractionIndex); - R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); - - if (XMVector4LessOrEqual(R, Zero)) - { - // Total internal reflection - return Zero; - } - else - { - XMVECTOR Result; - - // R = RefractionIndex * IDotN + sqrt(R) - R = XMVectorSqrt(R); - R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); - - // Result = RefractionIndex * Incident - Normal * R - Result = XMVectorMultiply(RefractionIndex, Incident); - Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); - - return Result; - } - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - XMVECTOR IDotN = XMVector4Dot(Incident,Normal); - - // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) - float32x4_t R = vmlsq_f32( g_XMOne, IDotN, IDotN); - R = vmulq_f32(R, RefractionIndex); - R = vmlsq_f32(g_XMOne, R, RefractionIndex ); - - uint32x4_t vResult = vcleq_f32(R,g_XMZero); - int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); - vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); - if ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ) - { - // Total internal reflection - vResult = g_XMZero; - } - else - { - // Sqrt(R) - float32x4_t S0 = vrsqrteq_f32(R); - float32x4_t P0 = vmulq_f32( R, S0 ); - float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); - float32x4_t S1 = vmulq_f32( S0, R0 ); - float32x4_t P1 = vmulq_f32( R, S1 ); - float32x4_t R1 = vrsqrtsq_f32( P1, S1 ); - float32x4_t S2 = vmulq_f32( S1, R1 ); - R = vmulq_f32( R, S2 ); - // R = RefractionIndex * IDotN + sqrt(R) - R = vmlaq_f32( R, RefractionIndex, IDotN ); - // Result = RefractionIndex * Incident - Normal * R - vResult = vmulq_f32(RefractionIndex, Incident); - vResult = vmlsq_f32( vResult, R, Normal ); - } - return vResult; -#elif defined(_XM_SSE_INTRINSICS_) - XMVECTOR IDotN = XMVector4Dot(Incident,Normal); - - // R = 1.0f - 
RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) - XMVECTOR R = _mm_mul_ps(IDotN,IDotN); - R = _mm_sub_ps(g_XMOne,R); - R = _mm_mul_ps(R, RefractionIndex); - R = _mm_mul_ps(R, RefractionIndex); - R = _mm_sub_ps(g_XMOne,R); - - XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero); - if (_mm_movemask_ps(vResult)==0x0f) - { - // Total internal reflection - vResult = g_XMZero; - } - else - { - // R = RefractionIndex * IDotN + sqrt(R) - R = _mm_sqrt_ps(R); - vResult = _mm_mul_ps(RefractionIndex, IDotN); - R = _mm_add_ps(R,vResult); - // Result = RefractionIndex * Incident - Normal * R - vResult = _mm_mul_ps(RefractionIndex, Incident); - R = _mm_mul_ps(R,Normal); - vResult = _mm_sub_ps(vResult,R); - } - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4Orthogonal -( - FXMVECTOR V -) -{ -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR Result; - Result.vector4_f32[0] = V.vector4_f32[2]; - Result.vector4_f32[1] = V.vector4_f32[3]; - Result.vector4_f32[2] = -V.vector4_f32[0]; - Result.vector4_f32[3] = -V.vector4_f32[1]; - return Result; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Negate = { 1.f, 1.f, -1.f, -1.f }; - - float32x4_t Result = vcombine_f32( vget_high_f32( V ), vget_low_f32( V ) ); - return vmulq_f32( Result, Negate ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 FlipZW = {1.0f,1.0f,-1.0f,-1.0f}; - XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,0,3,2)); - vResult = _mm_mul_ps(vResult,FlipZW); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst -( - FXMVECTOR N1, - FXMVECTOR N2 -) -{ - XMVECTOR Result = XMVector4Dot(N1, N2); - Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); - Result = XMVectorACosEst(Result); - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormals -( - FXMVECTOR N1, - FXMVECTOR N2 -) -{ - XMVECTOR Result = XMVector4Dot(N1, N2); - Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); - Result = XMVectorACos(Result); - return Result; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenVectors -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - XMVECTOR L1 = XMVector4ReciprocalLength(V1); - XMVECTOR L2 = XMVector4ReciprocalLength(V2); - - XMVECTOR Dot = XMVector4Dot(V1, V2); - - L1 = XMVectorMultiply(L1, L2); - - XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); - CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); - - return XMVectorACos(CosAngle); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV XMVector4Transform -( - FXMVECTOR V, - FXMMATRIX M -) -{ -#if defined(_XM_NO_INTRINSICS_) - - float fX = (M.m[0][0]*V.vector4_f32[0])+(M.m[1][0]*V.vector4_f32[1])+(M.m[2][0]*V.vector4_f32[2])+(M.m[3][0]*V.vector4_f32[3]); - float fY = (M.m[0][1]*V.vector4_f32[0])+(M.m[1][1]*V.vector4_f32[1])+(M.m[2][1]*V.vector4_f32[2])+(M.m[3][1]*V.vector4_f32[3]); - float fZ = (M.m[0][2]*V.vector4_f32[0])+(M.m[1][2]*V.vector4_f32[1])+(M.m[2][2]*V.vector4_f32[2])+(M.m[3][2]*V.vector4_f32[3]); - float fW = (M.m[0][3]*V.vector4_f32[0])+(M.m[1][3]*V.vector4_f32[1])+(M.m[2][3]*V.vector4_f32[2])+(M.m[3][3]*V.vector4_f32[3]); - XMVECTOR vResult; - 
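-    // Row-vector convention: Result = x*M.r[0] + y*M.r[1] + z*M.r[2] + w*M.r[3],
-    // i.e. V is treated as a row vector multiplied on the left of M.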
-    vResult.vector4_f32[0] = fX;
-    vResult.vector4_f32[1] = fY;
-    vResult.vector4_f32[2] = fZ;
-    vResult.vector4_f32[3] = fW;
-    return vResult;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32( V );
-    XMVECTOR vResult = vmulq_lane_f32( M.r[0], VL, 0 ); // X
-    vResult = vmlaq_lane_f32( vResult, M.r[1], VL, 1 ); // Y
-    float32x2_t VH = vget_high_f32( V );
-    vResult = vmlaq_lane_f32( vResult, M.r[2], VH, 0 ); // Z
-    return vmlaq_lane_f32( vResult, M.r[3], VH, 1 );    // W
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Splat x,y,z and w
-    XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-    XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-    XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-    XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
-    // Mul by the matrix
-    vTempX = _mm_mul_ps(vTempX,M.r[0]);
-    vTempY = _mm_mul_ps(vTempY,M.r[1]);
-    vTempZ = _mm_mul_ps(vTempZ,M.r[2]);
-    vTempW = _mm_mul_ps(vTempW,M.r[3]);
-    // Add them all together
-    vTempX = _mm_add_ps(vTempX,vTempY);
-    vTempZ = _mm_add_ps(vTempZ,vTempW);
-    vTempX = _mm_add_ps(vTempX,vTempZ);
-    return vTempX;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMFLOAT4* XM_CALLCONV XMVector4TransformStream
-(
-    XMFLOAT4* pOutputStream,
-    size_t OutputStride,
-    const XMFLOAT4* pInputStream,
-    size_t InputStride,
-    size_t VectorCount,
-    FXMMATRIX M
-)
-{
-    assert(pOutputStream != nullptr);
-    assert(pInputStream != nullptr);
-
-    assert(InputStride >= sizeof(XMFLOAT4));
-    _Analysis_assume_(InputStride >= sizeof(XMFLOAT4));
-
-    assert(OutputStride >= sizeof(XMFLOAT4));
-    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row2 = M.r[2];
-    const XMVECTOR row3 = M.r[3];
-
-    for (size_t i = 0; i < VectorCount; i++)
-    {
-        XMVECTOR V = XMLoadFloat4((const XMFLOAT4*)pInputVector);
-        XMVECTOR W = XMVectorSplatW(V);
-        XMVECTOR Z = XMVectorSplatZ(V);
-        XMVECTOR Y = XMVectorSplatY(V);
-        XMVECTOR X = XMVectorSplatX(V);
-
-        XMVECTOR Result = XMVectorMultiply(W, row3);
-        Result = XMVectorMultiplyAdd(Z, row2, Result);
-        Result = XMVectorMultiplyAdd(Y, row1, Result);
-        Result = XMVectorMultiplyAdd(X, row0, Result);
-
-        #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" )
-        XMStoreFloat4((XMFLOAT4*)pOutputVector, Result);
-
-        pInputVector += InputStride;
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row2 = M.r[2];
-    const XMVECTOR row3 = M.r[3];
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if ( four > 0 )
-    {
-        if ((InputStride == sizeof(XMFLOAT4)) && (OutputStride == sizeof(XMFLOAT4)))
-        {
-            for (size_t j = 0; j < four; ++j)
-            {
-                float32x4x4_t V = vld4q_f32( reinterpret_cast<const float*>(pInputVector) );
-                pInputVector += sizeof(XMFLOAT4)*4;
-
-                float32x2_t r = vget_low_f32( row0 );
-                XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax
-                XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx
-
-                __prefetch( pInputVector );
-
-                r = vget_high_f32( row0 );
-                XMVECTOR vResult2 = vmulq_lane_f32( V.val[0], r, 0 ); // Cx
-                XMVECTOR vResult3 = vmulq_lane_f32( V.val[0], r, 1 ); // Dx
-
-                __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
-
-                r = vget_low_f32( row1 );
-                vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey
-                vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
-
-                r = vget_high_f32( row1 );
-                vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy
-                vResult3 = vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
-
-                r = vget_low_f32( row2 );
-                vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz
-                vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) );
-
-                r = vget_high_f32( row2 );
-                vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz
-                vResult3 = vmlaq_lane_f32( vResult3, V.val[2], r, 1 ); // Dx+Hy+Lz
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
-
-                r = vget_low_f32( row3 );
-                vResult0 = vmlaq_lane_f32( vResult0, V.val[3], r, 0 ); // Ax+Ey+Iz+Mw
-                vResult1 = vmlaq_lane_f32( vResult1, V.val[3], r, 1 ); // Bx+Fy+Jz+Nw
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*6) );
-
-                r = vget_high_f32( row3 );
-                vResult2 = vmlaq_lane_f32( vResult2, V.val[3], r, 0 ); // Cx+Gy+Kz+Ow
-                vResult3 = vmlaq_lane_f32( vResult3, V.val[3], r, 1 ); // Dx+Hy+Lz+Pw
-
-                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*7) );
-
-                V.val[0] = vResult0;
-                V.val[1] = vResult1;
-                V.val[2] = vResult2;
-                V.val[3] = vResult3;
-
-                vst4q_f32( reinterpret_cast<float*>(pOutputVector), V );
-                pOutputVector += sizeof(XMFLOAT4)*4;
-
-                i += 4;
-            }
-        }
-    }
-
-    for (; i < VectorCount; i++)
-    {
-        XMVECTOR V = vld1q_f32( reinterpret_cast<const float*>(pInputVector) );
-        pInputVector += InputStride;
-
-        float32x2_t VL = vget_low_f32( V );
-        XMVECTOR vResult = vmulq_lane_f32( row0, VL, 0 ); // X
-        vResult = vmlaq_lane_f32( vResult, row1, VL, 1 ); // Y
-        float32x2_t VH = vget_high_f32( V );
-        vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z
-        vResult = vmlaq_lane_f32( vResult, row3, VH, 1 ); // W
-
-        vst1q_f32( reinterpret_cast<float*>(pOutputVector), vResult );
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-#elif defined(_XM_SSE_INTRINSICS_)
-    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
-    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row2 = M.r[2];
-    const XMVECTOR row3 = M.r[3];
-
-    if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) )
-    {
-        if ( !((uintptr_t)pInputStream & 0xF) && !(InputStride & 0xF) )
-        {
-            // Aligned input, aligned output
-            for (size_t i = 0; i < VectorCount; i++)
-            {
-                __m128 V = _mm_load_ps( reinterpret_cast<const float*>(pInputVector) );
-                pInputVector += InputStride;
-
-                XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-                XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-                XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-                XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
-
-                vTempX = _mm_mul_ps(vTempX,row0);
-                vTempY = _mm_mul_ps(vTempY,row1);
-                vTempZ = _mm_mul_ps(vTempZ,row2);
-                vTempW = _mm_mul_ps(vTempW,row3);
-
-                vTempX = _mm_add_ps(vTempX,vTempY);
-                vTempZ = _mm_add_ps(vTempZ,vTempW);
-                vTempX = _mm_add_ps(vTempX,vTempZ);
-
-                XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTempX );
-                pOutputVector += OutputStride;
-            }
-        }
-        else
-        {
-            // Unaligned input, aligned output
-            for (size_t i = 0; i < VectorCount; i++)
-            {
-                __m128 V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
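-                // XM_STREAM_PS expands to the non-temporal _mm_stream_ps (movntps),
-                // which requires 16-byte-aligned stores; both the output base and
-                // stride were checked above, while the input still needs movups here.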
-                pInputVector += InputStride;
-
-                XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-                XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-                XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-                XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
-
-                vTempX = _mm_mul_ps(vTempX,row0);
-                vTempY = _mm_mul_ps(vTempY,row1);
-                vTempZ = _mm_mul_ps(vTempZ,row2);
-                vTempW = _mm_mul_ps(vTempW,row3);
-
-                vTempX = _mm_add_ps(vTempX,vTempY);
-                vTempZ = _mm_add_ps(vTempZ,vTempW);
-                vTempX = _mm_add_ps(vTempX,vTempZ);
-
-                XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTempX );
-                pOutputVector += OutputStride;
-            }
-        }
-    }
-    else
-    {
-        if ( !((uintptr_t)pInputStream & 0xF) && !(InputStride & 0xF) )
-        {
-            // Aligned input, unaligned output
-            for (size_t i = 0; i < VectorCount; i++)
-            {
-                __m128 V = _mm_load_ps( reinterpret_cast<const float*>(pInputVector) );
-                pInputVector += InputStride;
-
-                XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-                XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-                XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-                XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
-
-                vTempX = _mm_mul_ps(vTempX,row0);
-                vTempY = _mm_mul_ps(vTempY,row1);
-                vTempZ = _mm_mul_ps(vTempZ,row2);
-                vTempW = _mm_mul_ps(vTempW,row3);
-
-                vTempX = _mm_add_ps(vTempX,vTempY);
-                vTempZ = _mm_add_ps(vTempZ,vTempW);
-                vTempX = _mm_add_ps(vTempX,vTempZ);
-
-                _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTempX );
-                pOutputVector += OutputStride;
-            }
-        }
-        else
-        {
-            // Unaligned input, unaligned output
-            for (size_t i = 0; i < VectorCount; i++)
-            {
-                __m128 V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
-                pInputVector += InputStride;
-
-                XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
-                XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
-                XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
-                XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
-
-                vTempX = _mm_mul_ps(vTempX,row0);
-                vTempY = _mm_mul_ps(vTempY,row1);
-                vTempZ = _mm_mul_ps(vTempZ,row2);
-                vTempW = _mm_mul_ps(vTempW,row3);
-
-                vTempX = _mm_add_ps(vTempX,vTempY);
-                vTempZ = _mm_add_ps(vTempZ,vTempW);
-                vTempX = _mm_add_ps(vTempX,vTempZ);
-
-                _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTempX );
-                pOutputVector += OutputStride;
-            }
-        }
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#endif
-}
-
-/****************************************************************************
- *
- * XMVECTOR operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V)
-{
-    return V;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV operator- (FXMVECTOR V)
-{
-    return XMVectorNegate(V);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR& XM_CALLCONV operator+=
-(
-    XMVECTOR& V1,
-    FXMVECTOR V2
-)
-{
-    V1 = XMVectorAdd(V1, V2);
-    return V1;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR& XM_CALLCONV operator-=
-(
-    XMVECTOR& V1,
-    FXMVECTOR V2
-)
-{
-    V1 = XMVectorSubtract(V1, V2);
-    return V1;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR& XM_CALLCONV operator*=
-(
-    XMVECTOR& V1,
-    FXMVECTOR V2
-)
-{
-    V1 = XMVectorMultiply(V1, V2);
-    return V1;
-}
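-//------------------------------------------------------------------------------
-// Illustrative sketch (not from the original source): with the overloads in
-// this section, component-wise vector algebra reads naturally, e.g.
-//
-//   XMVECTOR a = XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f);
-//   a += XMVectorReplicate(1.0f);   // XMVectorAdd      -> (2, 3, 4, 5)
-//   XMVECTOR b = a * 0.5f;          // XMVectorScale    -> (1, 1.5, 2, 2.5)
-//   b /= XMVectorReplicate(2.0f);   // XMVectorDivide   -> (0.5, 0.75, 1, 1.25)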
-//------------------------------------------------------------------------------ - -inline XMVECTOR& XM_CALLCONV operator/= -( - XMVECTOR& V1, - FXMVECTOR V2 -) -{ - V1 = XMVectorDivide(V1,V2); - return V1; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR& operator*= -( - XMVECTOR& V, - const float S -) -{ - V = XMVectorScale(V, S); - return V; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR& operator/= -( - XMVECTOR& V, - const float S -) -{ - XMVECTOR vS = XMVectorReplicate( S ); - V = XMVectorDivide(V, vS); - return V; -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV operator+ -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - return XMVectorAdd(V1, V2); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV operator- -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - return XMVectorSubtract(V1, V2); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV operator* -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - return XMVectorMultiply(V1, V2); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV operator/ -( - FXMVECTOR V1, - FXMVECTOR V2 -) -{ - return XMVectorDivide(V1,V2); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV operator* -( - FXMVECTOR V, - const float S -) -{ - return XMVectorScale(V, S); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV operator/ -( - FXMVECTOR V, - const float S -) -{ - XMVECTOR vS = XMVectorReplicate( S ); - return XMVectorDivide(V, vS); -} - -//------------------------------------------------------------------------------ - -inline XMVECTOR XM_CALLCONV operator* -( - float S, - FXMVECTOR V -) -{ - return XMVectorScale(V, S); -} - -#if defined(_XM_NO_INTRINSICS_) -#undef XMISNAN -#undef XMISINF -#endif - -#if defined(_XM_SSE_INTRINSICS_) -#undef XM3UNPACK3INTO4 -#undef XM3PACK4INTO3 -#endif +//------------------------------------------------------------------------------------- +// DirectXMathVector.inl -- SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+#if defined(_XM_NO_INTRINSICS_)
+#define XMISNAN(x)  ((*(uint32_t*)&(x) & 0x7F800000) == 0x7F800000 && (*(uint32_t*)&(x) & 0x7FFFFF) != 0)
+#define XMISINF(x)  ((*(uint32_t*)&(x) & 0x7FFFFFFF) == 0x7F800000)
+#endif
+
+#if defined(_XM_SSE_INTRINSICS_)
+
+#define XM3UNPACK3INTO4(l1,l2,l3) \
+    XMVECTOR V3 = _mm_shuffle_ps(l2,l3,_MM_SHUFFLE(0,0,3,2));\
+    XMVECTOR V2 = _mm_shuffle_ps(l2,l1,_MM_SHUFFLE(3,3,1,0));\
+    V2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,1,0,2));\
+    XMVECTOR V4 = _mm_castsi128_ps( _mm_srli_si128(_mm_castps_si128(l3),32/8) );
+
+#define XM3PACK4INTO3(v2x) \
+    v2x = _mm_shuffle_ps(V2,V3,_MM_SHUFFLE(1,0,2,1));\
+    V2 = _mm_shuffle_ps(V2,V1,_MM_SHUFFLE(2,2,0,0));\
+    V1 = _mm_shuffle_ps(V1,V2,_MM_SHUFFLE(0,2,1,0));\
+    V3 = _mm_shuffle_ps(V3,V4,_MM_SHUFFLE(0,0,2,2));\
+    V3 = _mm_shuffle_ps(V3,V4,_MM_SHUFFLE(2,1,2,0));\
+
+#endif
+
+/****************************************************************************
+ *
+ * General Vector
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+// Assignment operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Return a vector with all elements equaling zero
+inline XMVECTOR XM_CALLCONV XMVectorZero()
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR vResult = {0.0f,0.0f,0.0f,0.0f};
+    return vResult;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vdupq_n_f32(0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_setzero_ps();
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with four floating point values
+inline XMVECTOR XM_CALLCONV XMVectorSet
+(
+    float x,
+    float y,
+    float z,
+    float w
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = {x,y,z,w};
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t V0 = vcreate_f32(((uint64_t)*(const uint32_t *)&x) | ((uint64_t)(*(const uint32_t *)&y) << 32));
+    float32x2_t V1 = vcreate_f32(((uint64_t)*(const uint32_t *)&z) | ((uint64_t)(*(const uint32_t *)&w) << 32));
+    return vcombine_f32(V0, V1);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_set_ps( w, z, y, x );
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with four integer values
+inline XMVECTOR XM_CALLCONV XMVectorSetInt
+(
+    uint32_t x,
+    uint32_t y,
+    uint32_t z,
+    uint32_t w
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 vResult = {x,y,z,w};
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t V0 = vcreate_u32(((uint64_t)x) | ((uint64_t)y << 32));
+    uint32x2_t V1 = vcreate_u32(((uint64_t)z) | ((uint64_t)w << 32));
+    return vcombine_u32(V0, V1);
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_set_epi32( w, z, y, x );
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with a replicated floating point value
+inline XMVECTOR XM_CALLCONV XMVectorReplicate
+(
+    float Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR vResult;
+    vResult.vector4_f32[0] =
+    vResult.vector4_f32[1] =
+    vResult.vector4_f32[2] =
+    vResult.vector4_f32[3] = Value;
+    return vResult;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vdupq_n_f32( Value );
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_set_ps1( Value );
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with a replicated floating point value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr
+(
+    const float *pValue
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    float Value = pValue[0];
+    XMVECTOR vResult;
+    vResult.vector4_f32[0] =
+    vResult.vector4_f32[1] =
+    vResult.vector4_f32[2] =
+    vResult.vector4_f32[3] = Value;
+    return vResult;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_dup_f32( pValue );
+#elif defined(_XM_AVX_INTRINSICS_)
+    return _mm_broadcast_ss( pValue );
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_load_ps1( pValue );
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with a replicated integer value
+inline XMVECTOR XM_CALLCONV XMVectorReplicateInt
+(
+    uint32_t Value
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 vResult;
+    vResult.u[0] =
+    vResult.u[1] =
+    vResult.u[2] =
+    vResult.u[3] = Value;
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vdupq_n_u32( Value );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vTemp = _mm_set1_epi32( Value );
+    return _mm_castsi128_ps(vTemp);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with a replicated integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorReplicateIntPtr
+(
+    const uint32_t *pValue
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    uint32_t Value = pValue[0];
+    XMVECTORU32 vResult;
+    vResult.u[0] =
+    vResult.u[1] =
+    vResult.u[2] =
+    vResult.u[3] = Value;
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_dup_u32(pValue);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_load_ps1(reinterpret_cast<const float*>(pValue));
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with all bits set (true mask)
+inline XMVECTOR XM_CALLCONV XMVectorTrueInt()
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORU32 vResult = {0xFFFFFFFFU,0xFFFFFFFFU,0xFFFFFFFFU,0xFFFFFFFFU};
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vdupq_n_s32(-1);
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i V = _mm_set1_epi32(-1);
+    return _mm_castsi128_ps(V);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Initialize a vector with all bits clear (false mask)
+inline XMVECTOR XM_CALLCONV XMVectorFalseInt()
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR vResult = {0.0f,0.0f,0.0f,0.0f};
+    return vResult;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vdupq_n_u32(0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_setzero_ps();
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Replicate the x component of the vector
+inline XMVECTOR XM_CALLCONV XMVectorSplatX
+(
+    FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR vResult;
+    vResult.vector4_f32[0] =
+    vResult.vector4_f32[1] =
+    vResult.vector4_f32[2] =
+    vResult.vector4_f32[3] = V.vector4_f32[0];
+    return vResult;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vdupq_lane_f32( vget_low_f32( V ), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    return XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
+#endif
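+    // (Editorial note, not in the original source: XM_PERMUTE_PS expands to
+    // _mm_permute_ps when _XM_AVX_INTRINSICS_ is defined and to a
+    // self-referential _mm_shuffle_ps otherwise, so each splat compiles to a
+    // single shuffle on either path.)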
+} + +//------------------------------------------------------------------------------ +// Replicate the y component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatY +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_f32[0] = + vResult.vector4_f32[1] = + vResult.vector4_f32[2] = + vResult.vector4_f32[3] = V.vector4_f32[1]; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32( vget_low_f32( V ), 1 ); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the z component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatZ +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_f32[0] = + vResult.vector4_f32[1] = + vResult.vector4_f32[2] = + vResult.vector4_f32[3] = V.vector4_f32[2]; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32( vget_high_f32( V ), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); +#endif +} + +//------------------------------------------------------------------------------ +// Replicate the w component of the vector +inline XMVECTOR XM_CALLCONV XMVectorSplatW +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_f32[0] = + vResult.vector4_f32[1] = + vResult.vector4_f32[2] = + vResult.vector4_f32[3] = V.vector4_f32[3]; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32( vget_high_f32( V ), 1 ); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of 1.0f,1.0f,1.0f,1.0f +inline XMVECTOR XM_CALLCONV XMVectorSplatOne() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_f32[0] = + vResult.vector4_f32[1] = + vResult.vector4_f32[2] = + vResult.vector4_f32[3] = 1.0f; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_f32(1.0f); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMOne; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of INF,INF,INF,INF +inline XMVECTOR XM_CALLCONV XMVectorSplatInfinity() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_u32[0] = + vResult.vector4_u32[1] = + vResult.vector4_u32[2] = + vResult.vector4_u32[3] = 0x7F800000; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_u32(0x7F800000); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMInfinity; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of Q_NAN,Q_NAN,Q_NAN,Q_NAN +inline XMVECTOR XM_CALLCONV XMVectorSplatQNaN() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_u32[0] = + vResult.vector4_u32[1] = + vResult.vector4_u32[2] = + vResult.vector4_u32[3] = 0x7FC00000; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_u32(0x7FC00000); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMQNaN; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of 1.192092896e-7f,1.192092896e-7f,1.192092896e-7f,1.192092896e-7f +inline XMVECTOR XM_CALLCONV XMVectorSplatEpsilon() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR 
vResult; + vResult.vector4_u32[0] = + vResult.vector4_u32[1] = + vResult.vector4_u32[2] = + vResult.vector4_u32[3] = 0x34000000; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_u32(0x34000000); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMEpsilon; +#endif +} + +//------------------------------------------------------------------------------ +// Return a vector of -0.0f (0x80000000),-0.0f,-0.0f,-0.0f +inline XMVECTOR XM_CALLCONV XMVectorSplatSignMask() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_u32[0] = + vResult.vector4_u32[1] = + vResult.vector4_u32[2] = + vResult.vector4_u32[3] = 0x80000000U; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_u32(0x80000000U); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_set1_epi32( 0x80000000 ); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +// Return a floating point value via an index. This is not a recommended +// function to use due to performance loss. +inline float XM_CALLCONV XMVectorGetByIndex(FXMVECTOR V, size_t i) +{ + assert( i < 4 ); + _Analysis_assume_( i < 4 ); +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[i]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return V.n128_f32[i]; +#elif defined(_XM_SSE_INTRINSICS_) + return V.m128_f32[i]; +#endif +} + +//------------------------------------------------------------------------------ +// Return the X component in an FPU register. +inline float XM_CALLCONV XMVectorGetX(FXMVECTOR V) +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[0]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_f32(V, 0); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cvtss_f32(V); +#endif +} + +// Return the Y component in an FPU register. +inline float XM_CALLCONV XMVectorGetY(FXMVECTOR V) +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_f32(V, 1); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + return _mm_cvtss_f32(vTemp); +#endif +} + +// Return the Z component in an FPU register. +inline float XM_CALLCONV XMVectorGetZ(FXMVECTOR V) +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_f32(V, 2); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + return _mm_cvtss_f32(vTemp); +#endif +} + +// Return the W component in an FPU register. +inline float XM_CALLCONV XMVectorGetW(FXMVECTOR V) +{ +#if defined(_XM_NO_INTRINSICS_) + return V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vgetq_lane_f32(V, 3); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); + return _mm_cvtss_f32(vTemp); +#endif +} + +//------------------------------------------------------------------------------ + +// Store a component indexed by i into a 32 bit float location in memory. 
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetByIndexPtr(float *f, FXMVECTOR V, size_t i)
+{
+    assert( f != nullptr );
+    assert( i < 4 );
+    _Analysis_assume_( i < 4 );
+#if defined(_XM_NO_INTRINSICS_)
+    *f = V.vector4_f32[i];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    *f = V.n128_f32[i];
+#elif defined(_XM_SSE_INTRINSICS_)
+    *f = V.m128_f32[i];
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Store the X component into a 32 bit float location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetXPtr(float *x, FXMVECTOR V)
+{
+    assert( x != nullptr);
+#if defined(_XM_NO_INTRINSICS_)
+    *x = V.vector4_f32[0];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_f32(x,V,0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_ss(x,V);
+#endif
+}
+
+// Store the Y component into a 32 bit float location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetYPtr(float *y, FXMVECTOR V)
+{
+    assert( y != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    *y = V.vector4_f32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_f32(y,V,1);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    *((int*)y) = _mm_extract_ps( V, 1 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
+    _mm_store_ss(y,vResult);
+#endif
+}
+
+// Store the Z component into a 32 bit float location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetZPtr(float *z, FXMVECTOR V)
+{
+    assert( z != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    *z = V.vector4_f32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_f32(z,V,2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    *((int*)z) = _mm_extract_ps( V, 2 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
+    _mm_store_ss(z,vResult);
+#endif
+}
+
+// Store the W component into a 32 bit float location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetWPtr(float *w, FXMVECTOR V)
+{
+    assert( w != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    *w = V.vector4_f32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_f32(w,V,3);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    *((int*)w) = _mm_extract_ps( V, 3 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
+    _mm_store_ss(w,vResult);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Return an integer value via an index. This is not a recommended
+// function to use due to performance loss.
+inline uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i)
+{
+    assert( i < 4 );
+    _Analysis_assume_( i < 4 );
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[i];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return V.n128_u32[i];
+#elif defined(_XM_SSE_INTRINSICS_)
+    return V.m128_u32[i];
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Return the X component in an integer register.
+inline uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[0];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(V, 0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_castps_si128(V)));
+#endif
+}
+
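+// [Editorial note] The integer accessors return the raw bit pattern of a lane,
+// which is mainly useful for inspecting the all-ones/all-zeros masks produced
+// by the comparison functions. Sketch (illustrative only, not part of the patch):
+//
+//     XMVECTOR mask = XMVectorEqualInt(XMVectorTrueInt(), XMVectorTrueInt());
+//     uint32_t bits = XMVectorGetIntX(mask);               // 0xFFFFFFFF
+//     uint32_t one  = XMVectorGetIntX(XMVectorSplatOne()); // 0x3F800000 (1.0f)
+
+// Return the Y component in an integer register.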
+inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(V, 1);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128( V );
+    return static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(1,1,1,1));
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
+#endif
+}
+
+// Return the Z component in an integer register.
+inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(V, 2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128( V );
+    return static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(2,2,2,2));
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
+#endif
+}
+
+// Return the W component in an integer register.
+inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return V.vector4_u32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vgetq_lane_u32(V, 3);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128( V );
+    return static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(3,3,3,3));
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Store a component indexed by i into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntByIndexPtr(uint32_t *x, FXMVECTOR V, size_t i)
+{
+    assert( x != nullptr );
+    assert( i < 4 );
+    _Analysis_assume_( i < 4 );
+#if defined(_XM_NO_INTRINSICS_)
+    *x = V.vector4_u32[i];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    *x = V.n128_u32[i];
+#elif defined(_XM_SSE_INTRINSICS_)
+    *x = V.m128_u32[i];
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Store the X component into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntXPtr(uint32_t *x, FXMVECTOR V)
+{
+    assert( x != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    *x = V.vector4_u32[0];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_u32(x,*reinterpret_cast<const uint32x4_t*>(&V),0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    _mm_store_ss(reinterpret_cast<float*>(x),V);
+#endif
+}
+
+// Store the Y component into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntYPtr(uint32_t *y, FXMVECTOR V)
+{
+    assert( y != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    *y = V.vector4_u32[1];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_u32(y,*reinterpret_cast<const uint32x4_t*>(&V),1);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128( V );
+    *y = static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
+    _mm_store_ss(reinterpret_cast<float*>(y),vResult);
+#endif
+}
+
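+// [Editorial note] The *Ptr variants store the lane straight to memory rather
+// than returning it through a register, e.g. (illustrative only):
+//
+//     uint32_t yBits;
+//     XMVectorGetIntYPtr(&yBits, XMVectorSplatOne());  // yBits == 0x3F800000
+
+// Store the Z component into a 32 bit integer location in memory.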
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntZPtr(uint32_t *z, FXMVECTOR V)
+{
+    assert( z != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    *z = V.vector4_u32[2];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_u32(z,*reinterpret_cast<const uint32x4_t*>(&V),2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128( V );
+    *z = static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
+    _mm_store_ss(reinterpret_cast<float*>(z),vResult);
+#endif
+}
+
+// Store the W component into a 32 bit integer location in memory.
+_Use_decl_annotations_
+inline void XM_CALLCONV XMVectorGetIntWPtr(uint32_t *w, FXMVECTOR V)
+{
+    assert( w != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    *w = V.vector4_u32[3];
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    vst1q_lane_u32(w,*reinterpret_cast<const uint32x4_t*>(&V),3);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i V1 = _mm_castps_si128( V );
+    *w = static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
+    _mm_store_ss(reinterpret_cast<float*>(w),vResult);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Set a single indexed floating point component
+inline XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V, float f, size_t i)
+{
+    assert( i < 4 );
+    _Analysis_assume_( i < 4 );
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U = V;
+    U.vector4_f32[i] = f;
+    return U;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    XMVECTOR U = V;
+    U.n128_f32[i] = f;
+    return U;
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR U = V;
+    U.m128_f32[i] = f;
+    return U;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Sets the X component of a vector to a passed floating point value
+inline XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U.vector4_f32[0] = x;
+    U.vector4_f32[1] = V.vector4_f32[1];
+    U.vector4_f32[2] = V.vector4_f32[2];
+    U.vector4_f32[3] = V.vector4_f32[3];
+    return U;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vsetq_lane_f32(x,V,0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = _mm_set_ss(x);
+    vResult = _mm_move_ss(V,vResult);
+    return vResult;
+#endif
+}
+
+// Sets the Y component of a vector to a passed floating point value
+inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U.vector4_f32[0] = V.vector4_f32[0];
+    U.vector4_f32[1] = y;
+    U.vector4_f32[2] = V.vector4_f32[2];
+    U.vector4_f32[3] = V.vector4_f32[3];
+    return U;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vsetq_lane_f32(y,V,1);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    XMVECTOR vResult = _mm_set_ss(y);
+    vResult = _mm_insert_ps( V, vResult, 0x10 );
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap y and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_set_ss(y);
+    // Replace the x component
+    vResult = _mm_move_ss(vResult,vTemp);
+    // Swap y and x again
+    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1));
+    return vResult;
+#endif
+}
+
+// Sets the Z component of a vector to a passed floating point value
+inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U.vector4_f32[0] = V.vector4_f32[0];
+    U.vector4_f32[1] = V.vector4_f32[1];
+    U.vector4_f32[2] = z;
+    U.vector4_f32[3] =
V.vector4_f32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(z,V,2); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vResult = _mm_set_ss(z); + vResult = _mm_insert_ps( V, vResult, 0x20 ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Swap z and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(z); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap z and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); + return vResult; +#endif +} + +// Sets the W component of a vector to a passed floating point value +inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = V.vector4_f32[0]; + U.vector4_f32[1] = V.vector4_f32[1]; + U.vector4_f32[2] = V.vector4_f32[2]; + U.vector4_f32[3] = w; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(w,V,3); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vResult = _mm_set_ss(w); + vResult = _mm_insert_ps( V, vResult, 0x30 ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Swap w and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(w); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap w and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +// Sets a component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetByIndexPtr(FXMVECTOR V, const float *f, size_t i) +{ + assert( f != nullptr ); + assert( i < 4 ); + _Analysis_assume_( i < 4 ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U = V; + U.vector4_f32[i] = *f; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR U = V; + U.n128_f32[i] = *f; + return U; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR U = V; + U.m128_f32[i] = *f; + return U; +#endif +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetXPtr(FXMVECTOR V, const float *x) +{ + assert( x != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = *x; + U.vector4_f32[1] = V.vector4_f32[1]; + U.vector4_f32[2] = V.vector4_f32[2]; + U.vector4_f32[3] = V.vector4_f32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(x,V,0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_load_ss(x); + vResult = _mm_move_ss(V,vResult); + return vResult; +#endif +} + +// Sets the Y component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetYPtr(FXMVECTOR V, const float *y) +{ + assert( y != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = V.vector4_f32[0]; + U.vector4_f32[1] = *y; + U.vector4_f32[2] = V.vector4_f32[2]; + U.vector4_f32[3] = V.vector4_f32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(y,V,1); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap y and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(y); + // Replace the x component + vResult = 
_mm_move_ss(vResult,vTemp); + // Swap y and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1)); + return vResult; +#endif +} + +// Sets the Z component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetZPtr(FXMVECTOR V, const float *z) +{ + assert( z != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = V.vector4_f32[0]; + U.vector4_f32[1] = V.vector4_f32[1]; + U.vector4_f32[2] = *z; + U.vector4_f32[3] = V.vector4_f32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(z,V,2); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap z and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(z); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap z and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); + return vResult; +#endif +} + +// Sets the W component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorSetWPtr(FXMVECTOR V, const float *w) +{ + assert( w != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = V.vector4_f32[0]; + U.vector4_f32[1] = V.vector4_f32[1]; + U.vector4_f32[2] = V.vector4_f32[2]; + U.vector4_f32[3] = *w; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(w,V,3); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap w and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(w); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap w and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +// Sets a component of a vector to an integer passed by value +inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i) +{ + assert( i < 4 ); + _Analysis_assume_( i < 4 ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U = V; + U.vector4_u32[i] = x; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORU32 tmp; + tmp.v = V; + tmp.u[i] = x; + return tmp; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTORU32 tmp; + tmp.v = V; + tmp.u[i] = x; + return tmp; +#endif +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to an integer passed by value +inline XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_u32[0] = x; + U.vector4_u32[1] = V.vector4_u32[1]; + U.vector4_u32[2] = V.vector4_u32[2]; + U.vector4_u32[3] = V.vector4_u32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_u32(x,V,0); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cvtsi32_si128(x); + XMVECTOR vResult = _mm_move_ss(V,_mm_castsi128_ps(vTemp)); + return vResult; +#endif +} + +// Sets the Y component of a vector to an integer passed by value +inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_u32[0] = V.vector4_u32[0]; + U.vector4_u32[1] = y; + U.vector4_u32[2] = V.vector4_u32[2]; + U.vector4_u32[3] = V.vector4_u32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_u32(y,V,1); +#elif defined(_XM_SSE4_INTRINSICS_) + __m128i vResult = 
_mm_castps_si128( V );
+    vResult = _mm_insert_epi32( vResult, static_cast<int>(y), 1 );
+    return _mm_castsi128_ps( vResult );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap y and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
+    // Convert input to vector
+    __m128i vTemp = _mm_cvtsi32_si128(y);
+    // Replace the x component
+    vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp));
+    // Swap y and x again
+    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1));
+    return vResult;
+#endif
+}
+
+// Sets the Z component of a vector to an integer passed by value
+inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U.vector4_u32[0] = V.vector4_u32[0];
+    U.vector4_u32[1] = V.vector4_u32[1];
+    U.vector4_u32[2] = z;
+    U.vector4_u32[3] = V.vector4_u32[3];
+    return U;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vsetq_lane_u32(z,V,2);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i vResult = _mm_castps_si128( V );
+    vResult = _mm_insert_epi32( vResult, static_cast<int>(z), 2 );
+    return _mm_castsi128_ps( vResult );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap z and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
+    // Convert input to vector
+    __m128i vTemp = _mm_cvtsi32_si128(z);
+    // Replace the x component
+    vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp));
+    // Swap z and x again
+    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
+    return vResult;
+#endif
+}
+
+// Sets the W component of a vector to an integer passed by value
+inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U.vector4_u32[0] = V.vector4_u32[0];
+    U.vector4_u32[1] = V.vector4_u32[1];
+    U.vector4_u32[2] = V.vector4_u32[2];
+    U.vector4_u32[3] = w;
+    return U;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vsetq_lane_u32(w,V,3);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    __m128i vResult = _mm_castps_si128( V );
+    vResult = _mm_insert_epi32( vResult, static_cast<int>(w), 3 );
+    return _mm_castsi128_ps( vResult );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap w and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
+    // Convert input to vector
+    __m128i vTemp = _mm_cvtsi32_si128(w);
+    // Replace the x component
+    vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp));
+    // Swap w and x again
+    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Sets a component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(FXMVECTOR V, const uint32_t *x, size_t i)
+{
+    assert( x != nullptr );
+    assert( i < 4 );
+    _Analysis_assume_( i < 4 );
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U = V;
+    U.vector4_u32[i] = *x;
+    return U;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    XMVECTORU32 tmp;
+    tmp.v = V;
+    tmp.u[i] = *x;
+    return tmp;
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTORU32 tmp;
+    tmp.v = V;
+    tmp.u[i] = *x;
+    return tmp;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Sets the X component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(FXMVECTOR V, const uint32_t *x)
+{
+    assert( x != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U.vector4_u32[0] = *x;
+    U.vector4_u32[1] = V.vector4_u32[1];
+    U.vector4_u32[2] = V.vector4_u32[2];
+    U.vector4_u32[3] = V.vector4_u32[3];
+    return U;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_lane_u32(x,*reinterpret_cast<const uint32x4_t*>(&V),0);
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(x));
+    XMVECTOR vResult = _mm_move_ss(V,vTemp);
+    return vResult;
+#endif
+}
+
+// Sets the Y component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntYPtr(FXMVECTOR V, const uint32_t *y)
+{
+    assert( y != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U.vector4_u32[0] = V.vector4_u32[0];
+    U.vector4_u32[1] = *y;
+    U.vector4_u32[2] = V.vector4_u32[2];
+    U.vector4_u32[3] = V.vector4_u32[3];
+    return U;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_lane_u32(y,*reinterpret_cast<const uint32x4_t*>(&V),1);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap y and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(y));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult,vTemp);
+    // Swap y and x again
+    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1));
+    return vResult;
+#endif
+}
+
+// Sets the Z component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntZPtr(FXMVECTOR V, const uint32_t *z)
+{
+    assert( z != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U.vector4_u32[0] = V.vector4_u32[0];
+    U.vector4_u32[1] = V.vector4_u32[1];
+    U.vector4_u32[2] = *z;
+    U.vector4_u32[3] = V.vector4_u32[3];
+    return U;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_lane_u32(z,*reinterpret_cast<const uint32x4_t*>(&V),2);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap z and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(z));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult,vTemp);
+    // Swap z and x again
+    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
+    return vResult;
+#endif
+}
+
+// Sets the W component of a vector to an integer value passed by pointer
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorSetIntWPtr(FXMVECTOR V, const uint32_t *w)
+{
+    assert( w != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR U;
+    U.vector4_u32[0] = V.vector4_u32[0];
+    U.vector4_u32[1] = V.vector4_u32[1];
+    U.vector4_u32[2] = V.vector4_u32[2];
+    U.vector4_u32[3] = *w;
+    return U;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vld1q_lane_u32(w,*reinterpret_cast<const uint32x4_t*>(&V),3);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap w and x
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
+    // Convert input to vector
+    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(w));
+    // Replace the x component
+    vResult = _mm_move_ss(vResult,vTemp);
+    // Swap w and x again
+    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorSwizzle
+(
+    FXMVECTOR V,
+    uint32_t E0,
+    uint32_t E1,
+    uint32_t E2,
+    uint32_t E3
+)
+{
+    assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
+    _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Result = { V.vector4_f32[E0],
+                        V.vector4_f32[E1],
+                        V.vector4_f32[E2],
+                        V.vector4_f32[E3] };
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const uint32_t ControlElement[ 4 ] =
+    {
+        0x03020100, // XM_SWIZZLE_X
+        0x07060504, // XM_SWIZZLE_Y
+        0x0B0A0908, // XM_SWIZZLE_Z
+        0x0F0E0D0C, // XM_SWIZZLE_W
+    };
+
+    int8x8x2_t tbl;
+    tbl.val[0] = vget_low_f32(V);
+    tbl.val[1] = vget_high_f32(V);
+
+    uint32x2_t idx = vcreate_u32( ((uint64_t)ControlElement[E0]) | (((uint64_t)ControlElement[E1]) << 32) );
+    const uint8x8_t rL = vtbl2_u8( tbl, idx );
+
+    idx = vcreate_u32( ((uint64_t)ControlElement[E2]) | (((uint64_t)ControlElement[E3]) << 32) );
+    const uint8x8_t rH = vtbl2_u8( tbl, idx );
+
+    return vcombine_f32( rL, rH );
+#elif defined(_XM_AVX_INTRINSICS_)
+    unsigned int elem[4] = { E0, E1, E2, E3 };
+    __m128i vControl = _mm_loadu_si128( reinterpret_cast<const __m128i*>(&elem[0]) );
+    return _mm_permutevar_ps( V, vControl );
+#else
+    const uint32_t *aPtr = (const uint32_t* )(&V);
+
+    XMVECTOR Result;
+    uint32_t *pWork = (uint32_t*)(&Result);
+
+    pWork[0] = aPtr[E0];
+    pWork[1] = aPtr[E1];
+    pWork[2] = aPtr[E2];
+    pWork[3] = aPtr[E3];
+
+    return Result;
+#endif
+}
+
+//------------------------------------------------------------------------------
+inline XMVECTOR XM_CALLCONV XMVectorPermute
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    uint32_t PermuteX,
+    uint32_t PermuteY,
+    uint32_t PermuteZ,
+    uint32_t PermuteW
+)
+{
+    assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
+    _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
+
+#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    static const uint32_t ControlElement[ 8 ] =
+    {
+        0x03020100, // XM_PERMUTE_0X
+        0x07060504, // XM_PERMUTE_0Y
+        0x0B0A0908, // XM_PERMUTE_0Z
+        0x0F0E0D0C, // XM_PERMUTE_0W
+        0x13121110, // XM_PERMUTE_1X
+        0x17161514, // XM_PERMUTE_1Y
+        0x1B1A1918, // XM_PERMUTE_1Z
+        0x1F1E1D1C, // XM_PERMUTE_1W
+    };
+
+    int8x8x4_t tbl;
+    tbl.val[0] = vget_low_f32(V1);
+    tbl.val[1] = vget_high_f32(V1);
+    tbl.val[2] = vget_low_f32(V2);
+    tbl.val[3] = vget_high_f32(V2);
+
+    uint32x2_t idx = vcreate_u32( ((uint64_t)ControlElement[PermuteX]) | (((uint64_t)ControlElement[PermuteY]) << 32) );
+    const uint8x8_t rL = vtbl4_u8( tbl, idx );
+
+    idx = vcreate_u32( ((uint64_t)ControlElement[PermuteZ]) | (((uint64_t)ControlElement[PermuteW]) << 32) );
+    const uint8x8_t rH = vtbl4_u8( tbl, idx );
+
+    return vcombine_f32( rL, rH );
+#elif defined(_XM_AVX_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    static const XMVECTORU32 three = { 3, 3, 3, 3 };
+
+    __declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
+    __m128i vControl = _mm_load_si128( reinterpret_cast<const __m128i*>(&elem[0]) );
+
+    __m128i vSelect = _mm_cmpgt_epi32( vControl, three );
+    vControl = _mm_castps_si128( _mm_and_ps( _mm_castsi128_ps( vControl ), three ) );
+
+    __m128 shuffled1 = _mm_permutevar_ps( V1, vControl );
+    __m128 shuffled2 = _mm_permutevar_ps( V2, vControl );
+
+    __m128 masked1 = _mm_andnot_ps( _mm_castsi128_ps( vSelect ), shuffled1 );
+    __m128 masked2 = _mm_and_ps( _mm_castsi128_ps( vSelect ), shuffled2 );
+
+    return _mm_or_ps( masked1, masked2 );
+#else
+
+    const uint32_t *aPtr[2];
+    aPtr[0] = (const uint32_t* )(&V1);
+    aPtr[1] = (const uint32_t* )(&V2);
+
+    XMVECTOR Result;
+    uint32_t *pWork = (uint32_t*)(&Result);
+
+    const uint32_t i0 = PermuteX & 3;
+    const uint32_t vi0 = PermuteX >> 2;
+    pWork[0] = aPtr[vi0][i0];
+
+    const uint32_t i1 = PermuteY & 3;
+    const uint32_t vi1 = PermuteY >> 2;
+    pWork[1] = aPtr[vi1][i1];
+
+    const uint32_t i2 = PermuteZ & 3;
+    const uint32_t vi2 = PermuteZ >> 2;
+    pWork[2] = aPtr[vi2][i2];
+
+    const uint32_t i3 = PermuteW & 3;
+    const uint32_t vi3 = PermuteW >> 2;
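+    // (Editorial note: as with the three elements above, bit 2 of the selector
+    // picks the source vector -- V1 or V2 -- and bits 0-1 pick the lane, so
+    // XM_PERMUTE_1Z == 6 reads lane 2 of V2.)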
+ pWork[3] = aPtr[vi3][i3]; + + return Result; +#endif +} + +//------------------------------------------------------------------------------ +// Define a control vector to be used in XMVectorSelect +// operations. The four integers specified in XMVectorSelectControl +// serve as indices to select between components in two vectors. +// The first index controls selection for the first component of +// the vectors involved in a select operation, the second index +// controls selection for the second component etc. A value of +// zero for an index causes the corresponding component from the first +// vector to be selected whereas a one causes the component from the +// second vector to be selected instead. + +inline XMVECTOR XM_CALLCONV XMVectorSelectControl +( + uint32_t VectorIndex0, + uint32_t VectorIndex1, + uint32_t VectorIndex2, + uint32_t VectorIndex3 +) +{ +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + // x=Index0,y=Index1,z=Index2,w=Index3 + __m128i vTemp = _mm_set_epi32(VectorIndex3,VectorIndex2,VectorIndex1,VectorIndex0); + // Any non-zero entries become 0xFFFFFFFF else 0 + vTemp = _mm_cmpgt_epi32(vTemp,g_XMZero); + return _mm_castsi128_ps(vTemp); +#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + int32x2_t V0 = vcreate_s32(((uint64_t)VectorIndex0) | ((uint64_t)VectorIndex1 << 32)); + int32x2_t V1 = vcreate_s32(((uint64_t)VectorIndex2) | ((uint64_t)VectorIndex3 << 32)); + int32x4_t vTemp = vcombine_s32(V0, V1); + // Any non-zero entries become 0xFFFFFFFF else 0 + return vcgtq_s32(vTemp,g_XMZero); +#else + XMVECTOR ControlVector; + const uint32_t ControlElement[] = + { + XM_SELECT_0, + XM_SELECT_1 + }; + + assert(VectorIndex0 < 2); + assert(VectorIndex1 < 2); + assert(VectorIndex2 < 2); + assert(VectorIndex3 < 2); + _Analysis_assume_(VectorIndex0 < 2); + _Analysis_assume_(VectorIndex1 < 2); + _Analysis_assume_(VectorIndex2 < 2); + _Analysis_assume_(VectorIndex3 < 2); + + ControlVector.vector4_u32[0] = ControlElement[VectorIndex0]; + ControlVector.vector4_u32[1] = ControlElement[VectorIndex1]; + ControlVector.vector4_u32[2] = ControlElement[VectorIndex2]; + ControlVector.vector4_u32[3] = ControlElement[VectorIndex3]; + + return ControlVector; + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSelect +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Control +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_u32[0] = (V1.vector4_u32[0] & ~Control.vector4_u32[0]) | (V2.vector4_u32[0] & Control.vector4_u32[0]); + Result.vector4_u32[1] = (V1.vector4_u32[1] & ~Control.vector4_u32[1]) | (V2.vector4_u32[1] & Control.vector4_u32[1]); + Result.vector4_u32[2] = (V1.vector4_u32[2] & ~Control.vector4_u32[2]) | (V2.vector4_u32[2] & Control.vector4_u32[2]); + Result.vector4_u32[3] = (V1.vector4_u32[3] & ~Control.vector4_u32[3]) | (V2.vector4_u32[3] & Control.vector4_u32[3]); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vbslq_f32( Control, V2, V1 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = _mm_andnot_ps(Control,V1); + XMVECTOR vTemp2 = _mm_and_ps(V2,Control); + return _mm_or_ps(vTemp1,vTemp2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMergeXY +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_u32[0] = V1.vector4_u32[0]; + Result.vector4_u32[1] = 
V2.vector4_u32[0]; + Result.vector4_u32[2] = V1.vector4_u32[1]; + Result.vector4_u32[3] = V2.vector4_u32[1]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vzipq_f32( V1, V2 ).val[0]; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_unpacklo_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMergeZW +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_u32[0] = V1.vector4_u32[2]; + Result.vector4_u32[1] = V2.vector4_u32[2]; + Result.vector4_u32[2] = V1.vector4_u32[3]; + Result.vector4_u32[3] = V2.vector4_u32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vzipq_f32( V1, V2 ).val[1]; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_unpackhi_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3)); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 ); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, uint32_t VSLeftRotateElements, + uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3) +{ + XMVECTOR Control = XMVectorSelectControl(Select0&1, Select1&1, Select2&1, Select3&1); + return XMVectorSelect( VD, XMVectorRotateLeft(VS, VSLeftRotateElements), Control ); +} + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 
0xFFFFFFFF : 0;
+    return Control;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    return vceqq_f32( V1, V2 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_cmpeq_ps( V1, V2 );
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV XMVectorEqualR
+(
+    uint32_t*    pCR,
+    FXMVECTOR    V1,
+    FXMVECTOR    V2
+)
+{
+    assert( pCR != nullptr );
+#if defined(_XM_NO_INTRINSICS_)
+    uint32_t ux = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
+    uint32_t uy = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
+    uint32_t uz = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
+    uint32_t uw = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
+    uint32_t CR = 0;
+    if (ux&uy&uz&uw)
+    {
+        // All elements are equal
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!(ux|uy|uz|uw))
+    {
+        // All elements are not equal
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    *pCR = CR;
+
+    XMVECTOR Control;
+    Control.vector4_u32[0] = ux;
+    Control.vector4_u32[1] = uy;
+    Control.vector4_u32[2] = uz;
+    Control.vector4_u32[3] = uw;
+    return Control;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_f32( V1, V2 );
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
+    uint32_t CR = 0;
+    if ( r == 0xFFFFFFFFU )
+    {
+        // All elements are equal
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if ( !r )
+    {
+        // All elements are not equal
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    *pCR = CR;
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
+    uint32_t CR = 0;
+    int iTest = _mm_movemask_ps(vTemp);
+    if (iTest==0xf)
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!iTest)
+    {
+        // All elements are not equal
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    *pCR = CR;
+    return vTemp;
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Treat the components of the vectors as unsigned integers and
+// compare individual bits between the two.  This is useful for
+// comparing control vectors and result vectors returned from
+// other comparison operations.
+
+inline XMVECTOR XM_CALLCONV XMVectorEqualInt
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Control;
+    Control.vector4_u32[0] = (V1.vector4_u32[0] == V2.vector4_u32[0]) ? 0xFFFFFFFF : 0;
+    Control.vector4_u32[1] = (V1.vector4_u32[1] == V2.vector4_u32[1]) ? 0xFFFFFFFF : 0;
+    Control.vector4_u32[2] = (V1.vector4_u32[2] == V2.vector4_u32[2]) ? 0xFFFFFFFF : 0;
+    Control.vector4_u32[3] = (V1.vector4_u32[3] == V2.vector4_u32[3]) ?
0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vceqq_u32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) ); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorEqualIntR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + assert( pCR != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control = XMVectorEqualInt(V1, V2); + + *pCR = 0; + if (XMVector4EqualInt(Control, XMVectorTrueInt())) + { + // All elements are equal + *pCR |= XM_CRMASK_CR6TRUE; + } + else if (XMVector4EqualInt(Control, XMVectorFalseInt())) + { + // All elements are not equal + *pCR |= XM_CRMASK_CR6FALSE; + } + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + // All elements are equal + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + // All elements are not equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) ); + int iTemp = _mm_movemask_ps(_mm_castsi128_ps(V)); + uint32_t CR = 0; + if (iTemp==0x0F) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTemp) + { + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) +{ +#if defined(_XM_NO_INTRINSICS_) + + float fDeltax = V1.vector4_f32[0]-V2.vector4_f32[0]; + float fDeltay = V1.vector4_f32[1]-V2.vector4_f32[1]; + float fDeltaz = V1.vector4_f32[2]-V2.vector4_f32[2]; + float fDeltaw = V1.vector4_f32[3]-V2.vector4_f32[3]; + + fDeltax = fabsf(fDeltax); + fDeltay = fabsf(fDeltay); + fDeltaz = fabsf(fDeltaz); + fDeltaw = fabsf(fDeltaw); + + XMVECTOR Control; + Control.vector4_u32[0] = (fDeltax <= Epsilon.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[1] = (fDeltay <= Epsilon.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[2] = (fDeltaz <= Epsilon.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[3] = (fDeltaw <= Epsilon.vector4_f32[3]) ? 0xFFFFFFFFU : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vDelta = vsubq_f32(V1,V2); + return vacleq_f32( vDelta, Epsilon ); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1,V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp,vDelta); + vTemp = _mm_max_ps(vTemp,vDelta); + vTemp = _mm_cmple_ps(vTemp,Epsilon); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] != V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] != V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] != V2.vector4_f32[2]) ? 
0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] != V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmvnq_u32(vceqq_f32(V1, V2)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpneq_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_u32[0] != V2.vector4_u32[0]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[1] = (V1.vector4_u32[1] != V2.vector4_u32[1]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[2] = (V1.vector4_u32[2] != V2.vector4_u32[2]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[3] = (V1.vector4_u32[3] != V2.vector4_u32[3]) ? 0xFFFFFFFFU : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmvnq_u32(vceqq_u32(V1, V2)); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) ); + return _mm_xor_ps(_mm_castsi128_ps(V),g_XMNegOneMask); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorGreater +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vcgtq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpgt_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorGreaterR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + assert( pCR != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + + uint32_t ux = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 
0xFFFFFFFFU : 0; + uint32_t CR = 0; + if (ux&uy&uz&uw) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if (!(ux|uy|uz|uw)) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + + XMVECTOR Control; + Control.vector4_u32[0] = ux; + Control.vector4_u32[1] = uy; + Control.vector4_u32[2] = uz; + Control.vector4_u32[3] = uw; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp); + if (iTest==0xf) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vcgeq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpge_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqualR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + assert( pCR != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + + uint32_t ux = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 
0xFFFFFFFFU : 0; + uint32_t CR = 0; + if (ux&uy&uz&uw) + { + // All elements are greater or equal + CR = XM_CRMASK_CR6TRUE; + } + else if (!(ux|uy|uz|uw)) + { + // All elements are not greater or equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + + XMVECTOR Control; + Control.vector4_u32[0] = ux; + Control.vector4_u32[1] = uy; + Control.vector4_u32[2] = uz; + Control.vector4_u32[3] = uw; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + // All elements are greater or equal + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + // All elements are not greater or equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp); + if (iTest==0xf) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + // All elements are not greater or equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLess +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vcltq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmplt_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] <= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] <= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] <= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] <= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vcleq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmple_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorInBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ?
0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = vcleq_f32(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = vnegq_f32(Bounds); + // Test if greater or equal (Reversed) + vTemp2 = vcleq_f32(vTemp2,V); + // Blend answers + vTemp1 = vandq_u32(vTemp1,vTemp2); + return vTemp1; +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + return vTemp1; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV XMVectorInBoundsR +( + uint32_t* pCR, + FXMVECTOR V, + FXMVECTOR Bounds +) +{ + assert( pCR != nullptr ); +#if defined(_XM_NO_INTRINSICS_) + + uint32_t ux = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFFU : 0; + + uint32_t CR = 0; + if (ux&uy&uz&uw) + { + // All elements are in bounds + CR = XM_CRMASK_CR6BOUNDS; + } + *pCR = CR; + + XMVECTOR Control; + Control.vector4_u32[0] = ux; + Control.vector4_u32[1] = uy; + Control.vector4_u32[2] = uz; + Control.vector4_u32[3] = uw; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = vcleq_f32(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = vnegq_f32(Bounds); + // Test if greater or equal (Reversed) + vTemp2 = vcleq_f32(vTemp2,V); + // Blend answers + vTemp1 = vandq_u32(vTemp1,vTemp2); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + // All elements are in bounds + CR = XM_CRMASK_CR6BOUNDS; + } + *pCR = CR; + return vTemp1; +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + + uint32_t CR = 0; + if (_mm_movemask_ps(vTemp1)==0xf) { + // All elements are in bounds + CR = XM_CRMASK_CR6BOUNDS; + } + *pCR = CR; + return vTemp1; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorIsNaN +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = XMISNAN(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[1] = XMISNAN(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[2] = XMISNAN(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[3] = XMISNAN(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test against itself. 
NaN is always not equal + uint32x4_t vTempNan = vceqq_f32( V, V ); + // Flip results + return vmvnq_u32( vTempNan ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. NaN is always not equal + return _mm_cmpneq_ps(V,V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorIsInfinite +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = XMISINF(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[1] = XMISINF(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[2] = XMISINF(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[3] = XMISINF(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x4_t vTemp = vandq_u32(V,g_XMAbsMask); + // Compare to infinity + vTemp = vceqq_f32(vTemp,g_XMInfinity); + // Lanes that are infinity are now all-ones (true) + return vTemp; +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V,g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); + // Lanes that are infinity are now all-ones (true) + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +// Rounding and clamping operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMin +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0]; + Result.vector4_f32[1] = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1]; + Result.vector4_f32[2] = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2]; + Result.vector4_f32[3] = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vminq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_min_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMax +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0]; + Result.vector4_f32[1] = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1]; + Result.vector4_f32[2] = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2]; + Result.vector4_f32[3] = (V1.vector4_f32[3] > V2.vector4_f32[3]) ?
V1.vector4_f32[3] : V2.vector4_f32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmaxq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_max_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +#ifdef _XM_NO_ROUNDF_ + +namespace Internal +{ + inline float round_to_nearest( float x ) + { + float i = floorf(x); + x -= i; + if(x < 0.5f) + return i; + if(x > 0.5f) + return i + 1.f; + + float int_part; + modff( i / 2.f, &int_part ); + if ( (2.f*int_part) == i ) + { + return i; + } + + return i + 1.f; + } +}; + +#endif + +#if !defined(_XM_NO_INTRINSICS_) +#pragma float_control(push) +#pragma float_control(precise, on) +#endif + +inline XMVECTOR XM_CALLCONV XMVectorRound +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + +#ifdef _XM_NO_ROUNDF_ + XMVECTOR Result; + Result.vector4_f32[0] = Internal::round_to_nearest( V.vector4_f32[0] ); + Result.vector4_f32[1] = Internal::round_to_nearest( V.vector4_f32[1] ); + Result.vector4_f32[2] = Internal::round_to_nearest( V.vector4_f32[2] ); + Result.vector4_f32[3] = Internal::round_to_nearest( V.vector4_f32[3] ); + return Result; +#else + XMVECTOR Result; + Result.vector4_f32[0] = roundf( V.vector4_f32[0] ); + Result.vector4_f32[1] = roundf( V.vector4_f32[1] ); + Result.vector4_f32[2] = roundf( V.vector4_f32[2] ); + Result.vector4_f32[3] = roundf( V.vector4_f32[3] ); + return Result; +#endif + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t sign = vandq_u32( V, g_XMNegativeZero ); + uint32x4_t sMagic = vorrq_u32( g_XMNoFraction, sign ); + float32x4_t R1 = vaddq_f32( V, sMagic ); + R1 = vsubq_f32( R1, sMagic ); + float32x4_t R2 = vabsq_f32( V ); + uint32x4_t mask = vcleq_f32( R2, g_XMNoFraction ); + XMVECTOR vResult = vbslq_f32( mask, R1, V ); + return vResult; +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_round_ps( V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 sign = _mm_and_ps( V, g_XMNegativeZero ); + __m128 sMagic = _mm_or_ps( g_XMNoFraction, sign ); + __m128 R1 = _mm_add_ps( V, sMagic ); + R1 = _mm_sub_ps( R1, sMagic ); + __m128 R2 = _mm_and_ps( V, g_XMAbsMask ); + __m128 mask = _mm_cmple_ps( R2, g_XMNoFraction ); + R2 = _mm_andnot_ps(mask,V); + R1 = _mm_and_ps(R1,mask); + XMVECTOR vResult = _mm_xor_ps(R1, R2); + return vResult; +#endif +} + +#if !defined(_XM_NO_INTRINSICS_) +#pragma float_control(pop) +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTruncate +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + uint32_t i; + + // Avoid C4701 + Result.vector4_f32[0] = 0.0f; + + for (i = 0; i < 4; i++) + { + if (XMISNAN(V.vector4_f32[i])) + { + Result.vector4_u32[i] = 0x7FC00000; + } + else if (fabsf(V.vector4_f32[i]) < 8388608.0f) + { + Result.vector4_f32[i] = (float)((int32_t)V.vector4_f32[i]); + } + else + { + Result.vector4_f32[i] = V.vector4_f32[i]; + } + } + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vTest = vabsq_f32( V ); + vTest = vcltq_f32( vTest, g_XMNoFraction ); + + int32x4_t vInt = vcvtq_s32_f32( V ); + XMVECTOR vResult = vcvtq_f32_s32( vInt ); + + // All numbers less than 8388608 will use the round to int + // All others, use the ORIGINAL value + return vbslq_f32( vTest, vResult, V ); +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_round_ps( V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ); +#elif defined(_XM_SSE_INTRINSICS_) + // To handle NAN, 
INF and numbers greater than 8388608, use masking + // Get the abs value + __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask); + // Keep lanes with abs value less than 8388608; lanes holding NAN, INF + // or values too large to have a fraction fail the test + vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction); + // Convert to int and back to float for rounding with truncation + __m128i vInt = _mm_cvttps_epi32(V); + // Convert back to floats + XMVECTOR vResult = _mm_cvtepi32_ps(vInt); + // All numbers less than 8388608 will use the round to int + vResult = _mm_and_ps(vResult,_mm_castsi128_ps(vTest)); + // All others, use the ORIGINAL value + vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V)); + vResult = _mm_or_ps(vResult,_mm_castsi128_ps(vTest)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorFloor +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = floorf( V.vector4_f32[0] ); + Result.vector4_f32[1] = floorf( V.vector4_f32[1] ); + Result.vector4_f32[2] = floorf( V.vector4_f32[2] ); + Result.vector4_f32[3] = floorf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vTest = vabsq_f32( V ); + vTest = vcltq_f32( vTest, g_XMNoFraction ); + // Truncate + int32x4_t vInt = vcvtq_s32_f32( V ); + XMVECTOR vResult = vcvtq_f32_s32( vInt ); + XMVECTOR vLarger = vcgtq_f32( vResult, V ); + // 0 -> 0, 0xffffffff -> -1.0f + vLarger = vcvtq_f32_s32( vLarger ); + vResult = vaddq_f32( vResult, vLarger ); + // All numbers less than 8388608 will use the round to int + // All others, use the ORIGINAL value + return vbslq_f32( vTest, vResult, V ); +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_floor_ps( V ); +#elif defined(_XM_SSE_INTRINSICS_) + // To handle NAN, INF and numbers greater than 8388608, use masking + __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask); + vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction); + // Truncate + __m128i vInt = _mm_cvttps_epi32(V); + XMVECTOR vResult = _mm_cvtepi32_ps(vInt); + __m128 vLarger = _mm_cmpgt_ps( vResult, V ); + // 0 -> 0, 0xffffffff -> -1.0f + vLarger = _mm_cvtepi32_ps( _mm_castps_si128( vLarger ) ); + vResult = _mm_add_ps( vResult, vLarger ); + // All numbers less than 8388608 will use the round to int + vResult = _mm_and_ps(vResult,_mm_castsi128_ps(vTest)); + // All others, use the ORIGINAL value + vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V)); + vResult = _mm_or_ps(vResult,_mm_castsi128_ps(vTest)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCeiling +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = ceilf( V.vector4_f32[0] ); + Result.vector4_f32[1] = ceilf( V.vector4_f32[1] ); + Result.vector4_f32[2] = ceilf( V.vector4_f32[2] ); + Result.vector4_f32[3] = ceilf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vTest = vabsq_f32( V ); + vTest = vcltq_f32( vTest, g_XMNoFraction ); + // Truncate + int32x4_t vInt = vcvtq_s32_f32( V ); + XMVECTOR vResult = vcvtq_f32_s32( vInt ); + XMVECTOR vSmaller = vcltq_f32( vResult, V ); + // 0 -> 0, 0xffffffff -> -1.0f + vSmaller = vcvtq_f32_s32( vSmaller ); + vResult = vsubq_f32( vResult, vSmaller ); + // All numbers less than 8388608 will use the round to int + // All others, use the ORIGINAL value + return vbslq_f32( vTest, vResult, V ); +#elif
defined(_XM_SSE4_INTRINSICS_) + return _mm_ceil_ps( V ); +#elif defined(_XM_SSE_INTRINSICS_) + // To handle NAN, INF and numbers greater than 8388608, use masking + __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask); + vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction); + // Truncate + __m128i vInt = _mm_cvttps_epi32(V); + XMVECTOR vResult = _mm_cvtepi32_ps(vInt); + __m128 vSmaller = _mm_cmplt_ps( vResult, V ); + // 0 -> 0, 0xffffffff -> -1.0f + vSmaller = _mm_cvtepi32_ps( _mm_castps_si128( vSmaller ) ); + vResult = _mm_sub_ps( vResult, vSmaller ); + // All numbers less than 8388608 will use the round to int + vResult = _mm_and_ps(vResult,_mm_castsi128_ps(vTest)); + // All others, use the ORIGINAL value + vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V)); + vResult = _mm_or_ps(vResult,_mm_castsi128_ps(vTest)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorClamp +( + FXMVECTOR V, + FXMVECTOR Min, + FXMVECTOR Max +) +{ + assert(XMVector4LessOrEqual(Min, Max)); + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVectorMax(Min, V); + Result = XMVectorMin(Max, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vResult; + vResult = vmaxq_f32(Min,V); + vResult = vminq_f32(vResult,Max); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult; + vResult = _mm_max_ps(Min,V); + vResult = _mm_min_ps(vResult,Max); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSaturate +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + return XMVectorClamp(V, Zero, g_XMOne.v); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Set <0 to 0 + XMVECTOR vResult = vmaxq_f32(V, vdupq_n_f32(0) ); + // Set>1 to 1 + return vminq_f32(vResult, vdupq_n_f32(1.0f) ); +#elif defined(_XM_SSE_INTRINSICS_) + // Set <0 to 0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Set>1 to 1 + return _mm_min_ps(vResult,g_XMOne); +#endif +} + +//------------------------------------------------------------------------------ +// Bitwise logical operations +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAndInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_u32[0] = V1.vector4_u32[0] & V2.vector4_u32[0]; + Result.vector4_u32[1] = V1.vector4_u32[1] & V2.vector4_u32[1]; + Result.vector4_u32[2] = V1.vector4_u32[2] & V2.vector4_u32[2]; + Result.vector4_u32[3] = V1.vector4_u32[3] & V2.vector4_u32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vandq_u32(V1,V2); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_and_ps(V1,V2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAndCInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_u32[0] = V1.vector4_u32[0] & ~V2.vector4_u32[0]; + Result.vector4_u32[1] = V1.vector4_u32[1] & ~V2.vector4_u32[1]; + Result.vector4_u32[2] = V1.vector4_u32[2] & ~V2.vector4_u32[2]; + Result.vector4_u32[3] = V1.vector4_u32[3] & ~V2.vector4_u32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vbicq_u32(V1,V2); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_andnot_si128( 
_mm_castps_si128(V2), _mm_castps_si128(V1) ); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorOrInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_u32[0] = V1.vector4_u32[0] | V2.vector4_u32[0]; + Result.vector4_u32[1] = V1.vector4_u32[1] | V2.vector4_u32[1]; + Result.vector4_u32[2] = V1.vector4_u32[2] | V2.vector4_u32[2]; + Result.vector4_u32[3] = V1.vector4_u32[3] | V2.vector4_u32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vorrq_u32(V1,V2); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_or_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) ); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNorInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_u32[0] = ~(V1.vector4_u32[0] | V2.vector4_u32[0]); + Result.vector4_u32[1] = ~(V1.vector4_u32[1] | V2.vector4_u32[1]); + Result.vector4_u32[2] = ~(V1.vector4_u32[2] | V2.vector4_u32[2]); + Result.vector4_u32[3] = ~(V1.vector4_u32[3] | V2.vector4_u32[3]); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t Result = vorrq_u32(V1,V2); + return vbicq_u32(g_XMNegOneMask, Result); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i Result; + Result = _mm_or_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) ); + Result = _mm_andnot_si128( Result,g_XMNegOneMask); + return _mm_castsi128_ps(Result); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorXorInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_u32[0] = V1.vector4_u32[0] ^ V2.vector4_u32[0]; + Result.vector4_u32[1] = V1.vector4_u32[1] ^ V2.vector4_u32[1]; + Result.vector4_u32[2] = V1.vector4_u32[2] ^ V2.vector4_u32[2]; + Result.vector4_u32[3] = V1.vector4_u32[3] ^ V2.vector4_u32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return veorq_u32(V1,V2); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_xor_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) ); + return _mm_castsi128_ps(V); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNegate +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = -V.vector4_f32[0]; + Result.vector4_f32[1] = -V.vector4_f32[1]; + Result.vector4_f32[2] = -V.vector4_f32[2]; + Result.vector4_f32[3] = -V.vector4_f32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vnegq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Z; + + Z = _mm_setzero_ps(); + + return _mm_sub_ps( Z, V ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAdd +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = V1.vector4_f32[0] + V2.vector4_f32[0]; + Result.vector4_f32[1] = V1.vector4_f32[1] + V2.vector4_f32[1]; + Result.vector4_f32[2] = 
V1.vector4_f32[2] + V2.vector4_f32[2]; + Result.vector4_f32[3] = V1.vector4_f32[3] + V2.vector4_f32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vaddq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_add_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSum +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = + Result.vector4_f32[1] = + Result.vector4_f32[2] = + Result.vector4_f32[3] = V.vector4_f32[0] + V.vector4_f32[1] + V.vector4_f32[2] + V.vector4_f32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t v1 = vget_low_f32(V); + float32x2_t v2 = vget_high_f32(V); + v1 = vadd_f32(v1, v2); + v1 = vpadd_f32(v1, v1); + return vcombine_f32(v1, v1); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vTemp = _mm_hadd_ps(V, V); + return _mm_hadd_ps(vTemp,vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 3, 0, 1)); + XMVECTOR vTemp2 = _mm_add_ps(V, vTemp); + vTemp = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 0, 3, 2)); + return _mm_add_ps(vTemp, vTemp2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAddAngles +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + // Add the given angles together. If the range of V1 is such + // that -Pi <= V1 < Pi and the range of V2 is such that + // -2Pi <= V2 <= 2Pi, then the range of the resulting angle + // will be -Pi <= Result < Pi. + XMVECTOR Result = XMVectorAdd(V1, V2); + + XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v); + XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask); + + Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v); + Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask); + + Result = XMVectorAdd(Result, Offset); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Adjust the angles + XMVECTOR vResult = vaddq_f32(V1,V2); + // Less than Pi? + uint32x4_t vOffset = vcltq_f32(vResult,g_XMNegativePi); + vOffset = vandq_u32(vOffset,g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = vaddq_f32(vResult,vOffset); + // Greater than or equal to Pi? + vOffset = vcgeq_f32(vResult,g_XMPi); + vOffset = vandq_u32(vOffset,g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = vsubq_f32(vResult,vOffset); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Adjust the angles + XMVECTOR vResult = _mm_add_ps(V1,V2); + // Less than Pi? + XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi); + vOffset = _mm_and_ps(vOffset,g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = _mm_add_ps(vResult,vOffset); + // Greater than or equal to Pi? 
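+ // (A rough sketch of the branchless wrap used here: the compare below
+ // yields an all-ones mask in each lane where vResult >= Pi, so ANDing
+ // the mask with 2*Pi leaves 2*Pi in exactly those lanes and 0 elsewhere;
+ // the final subtract then wraps only the lanes that need it, e.g. a
+ // lane holding 3.5f becomes 3.5f - 2*Pi ~= -2.78f.)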
+ vOffset = _mm_cmpge_ps(vResult,g_XMPi); + vOffset = _mm_and_ps(vOffset,g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = _mm_sub_ps(vResult,vOffset); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSubtract +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = V1.vector4_f32[0] - V2.vector4_f32[0]; + Result.vector4_f32[1] = V1.vector4_f32[1] - V2.vector4_f32[1]; + Result.vector4_f32[2] = V1.vector4_f32[2] - V2.vector4_f32[2]; + Result.vector4_f32[3] = V1.vector4_f32[3] - V2.vector4_f32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsubq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_sub_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSubtractAngles +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + // Subtract the given angles. If the range of V1 is such + // that -Pi <= V1 < Pi and the range of V2 is such that + // -2Pi <= V2 <= 2Pi, then the range of the resulting angle + // will be -Pi <= Result < Pi. + XMVECTOR Result = XMVectorSubtract(V1, V2); + + XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v); + XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask); + + Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v); + Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask); + + Result = XMVectorAdd(Result, Offset); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Adjust the angles + XMVECTOR vResult = vsubq_f32(V1,V2); + // Less than Pi? + uint32x4_t vOffset = vcltq_f32(vResult,g_XMNegativePi); + vOffset = vandq_u32(vOffset,g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = vaddq_f32(vResult,vOffset); + // Greater than or equal to Pi? + vOffset = vcgeq_f32(vResult,g_XMPi); + vOffset = vandq_u32(vOffset,g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = vsubq_f32(vResult,vOffset); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Adjust the angles + XMVECTOR vResult = _mm_sub_ps(V1,V2); + // Less than Pi? + XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi); + vOffset = _mm_and_ps(vOffset,g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = _mm_add_ps(vResult,vOffset); + // Greater than or equal to Pi? 
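+ // (Same lane-masked wrap as in XMVectorAddAngles above: the compare
+ // mask ANDed with 2*Pi selects which lanes have 2*Pi subtracted.)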
+ vOffset = _mm_cmpge_ps(vResult,g_XMPi); + vOffset = _mm_and_ps(vOffset,g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = _mm_sub_ps(vResult,vOffset); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMultiply +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = V1.vector4_f32[0] * V2.vector4_f32[0]; + Result.vector4_f32[1] = V1.vector4_f32[1] * V2.vector4_f32[1]; + Result.vector4_f32[2] = V1.vector4_f32[2] * V2.vector4_f32[2]; + Result.vector4_f32[3] = V1.vector4_f32[3] * V2.vector4_f32[3]; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmulq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_mul_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = V1.vector4_f32[0] * V2.vector4_f32[0] + V3.vector4_f32[0]; + Result.vector4_f32[1] = V1.vector4_f32[1] * V2.vector4_f32[1] + V3.vector4_f32[1]; + Result.vector4_f32[2] = V1.vector4_f32[2] * V2.vector4_f32[2] + V3.vector4_f32[2]; + Result.vector4_f32[3] = V1.vector4_f32[3] * V2.vector4_f32[3] + V3.vector4_f32[3]; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmlaq_f32( V3, V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_mul_ps( V1, V2 ); + return _mm_add_ps(vResult, V3 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorDivide +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = V1.vector4_f32[0] / V2.vector4_f32[0]; + Result.vector4_f32[1] = V1.vector4_f32[1] / V2.vector4_f32[1]; + Result.vector4_f32[2] = V1.vector4_f32[2] / V2.vector4_f32[2]; + Result.vector4_f32[3] = V1.vector4_f32[3] / V2.vector4_f32[3]; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(V2); + float32x4_t S = vrecpsq_f32( Reciprocal, V2 ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, V2 ); + Reciprocal = vmulq_f32( S, Reciprocal ); + return vmulq_f32( V1, Reciprocal ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_div_ps( V1, V2 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = V3.vector4_f32[0] - (V1.vector4_f32[0] * V2.vector4_f32[0]); + Result.vector4_f32[1] = V3.vector4_f32[1] - (V1.vector4_f32[1] * V2.vector4_f32[1]); + Result.vector4_f32[2] = V3.vector4_f32[2] - (V1.vector4_f32[2] * V2.vector4_f32[2]); + Result.vector4_f32[3] = V3.vector4_f32[3] - (V1.vector4_f32[3] * V2.vector4_f32[3]); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmlsq_f32( V3, V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R = _mm_mul_ps( V1, V2 ); + return _mm_sub_ps( V3, R ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorScale +( + FXMVECTOR V, + float 
ScaleFactor +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = V.vector4_f32[0] * ScaleFactor; + Result.vector4_f32[1] = V.vector4_f32[1] * ScaleFactor; + Result.vector4_f32[2] = V.vector4_f32[2] * ScaleFactor; + Result.vector4_f32[3] = V.vector4_f32[3] * ScaleFactor; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmulq_n_f32( V, ScaleFactor ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_set_ps1(ScaleFactor); + return _mm_mul_ps(vResult,V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocalEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = 1.f / V.vector4_f32[0]; + Result.vector4_f32[1] = 1.f / V.vector4_f32[1]; + Result.vector4_f32[2] = 1.f / V.vector4_f32[2]; + Result.vector4_f32[3] = 1.f / V.vector4_f32[3]; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vrecpeq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_rcp_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocal +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = 1.f / V.vector4_f32[0]; + Result.vector4_f32[1] = 1.f / V.vector4_f32[1]; + Result.vector4_f32[2] = 1.f / V.vector4_f32[2]; + Result.vector4_f32[3] = 1.f / V.vector4_f32[3]; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 2 iterations of Newton-Raphson refinement + float32x4_t Reciprocal = vrecpeq_f32(V); + float32x4_t S = vrecpsq_f32( Reciprocal, V ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, V ); + Reciprocal = vmulq_f32( S, Reciprocal ); + return Reciprocal; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_div_ps(g_XMOne,V); +#endif +} + +//------------------------------------------------------------------------------ +// Return an estimated square root +inline XMVECTOR XM_CALLCONV XMVectorSqrtEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = sqrtf( V.vector4_f32[0] ); + Result.vector4_f32[1] = sqrtf( V.vector4_f32[1] ); + Result.vector4_f32[2] = sqrtf( V.vector4_f32[2] ); + Result.vector4_f32[3] = sqrtf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 1 iteration of Newton-Raphson refinement of sqrt + float32x4_t S0 = vrsqrteq_f32(V); + float32x4_t P0 = vmulq_f32( V, S0 ); + float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); + float32x4_t S1 = vmulq_f32( S0, R0 ); + + XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); + XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0) ); + XMVECTOR Result = vmulq_f32( V, S1 ); + XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); + return XMVectorSelect(V, Result, Select); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_sqrt_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSqrt +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = sqrtf( V.vector4_f32[0] ); + Result.vector4_f32[1] = sqrtf( V.vector4_f32[1] ); + Result.vector4_f32[2] = sqrtf( V.vector4_f32[2] ); + Result.vector4_f32[3] = sqrtf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 3 iterations of Newton-Raphson refinement of sqrt + float32x4_t S0 = vrsqrteq_f32(V); + float32x4_t P0 =
vmulq_f32( V, S0 ); + float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); + float32x4_t S1 = vmulq_f32( S0, R0 ); + float32x4_t P1 = vmulq_f32( V, S1 ); + float32x4_t R1 = vrsqrtsq_f32( P1, S1 ); + float32x4_t S2 = vmulq_f32( S1, R1 ); + float32x4_t P2 = vmulq_f32( V, S2 ); + float32x4_t R2 = vrsqrtsq_f32( P2, S2 ); + float32x4_t S3 = vmulq_f32( S2, R2 ); + + XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); + XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0) ); + XMVECTOR Result = vmulq_f32( V, S3 ); + XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); + return XMVectorSelect(V, Result, Select); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_sqrt_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = 1.f / sqrtf( V.vector4_f32[0] ); + Result.vector4_f32[1] = 1.f / sqrtf( V.vector4_f32[1] ); + Result.vector4_f32[2] = 1.f / sqrtf( V.vector4_f32[2] ); + Result.vector4_f32[3] = 1.f / sqrtf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vrsqrteq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_rsqrt_ps(V); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = 1.f / sqrtf( V.vector4_f32[0] ); + Result.vector4_f32[1] = 1.f / sqrtf( V.vector4_f32[1] ); + Result.vector4_f32[2] = 1.f / sqrtf( V.vector4_f32[2] ); + Result.vector4_f32[3] = 1.f / sqrtf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 2 iterations of Newton-Raphson refinement of the reciprocal square root + float32x4_t S0 = vrsqrteq_f32(V); + + float32x4_t P0 = vmulq_f32( V, S0 ); + float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); + + float32x4_t S1 = vmulq_f32( S0, R0 ); + float32x4_t P1 = vmulq_f32( V, S1 ); + float32x4_t R1 = vrsqrtsq_f32( P1, S1 ); + + return vmulq_f32( S1, R1 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_sqrt_ps(V); + vResult = _mm_div_ps(g_XMOne,vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExp2 +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = powf(2.0f, V.vector4_f32[0]); + Result.vector4_f32[1] = powf(2.0f, V.vector4_f32[1]); + Result.vector4_f32[2] = powf(2.0f, V.vector4_f32[2]); + Result.vector4_f32[3] = powf(2.0f, V.vector4_f32[3]); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t itrunc = vcvtq_s32_f32(V); + float32x4_t ftrunc = vcvtq_f32_s32(itrunc); + float32x4_t y = vsubq_f32(V, ftrunc); + + float32x4_t poly = vmlaq_f32( g_XMExpEst6, g_XMExpEst7, y ); + poly = vmlaq_f32( g_XMExpEst5, poly, y ); + poly = vmlaq_f32( g_XMExpEst4, poly, y ); + poly = vmlaq_f32( g_XMExpEst3, poly, y ); + poly = vmlaq_f32( g_XMExpEst2, poly, y ); + poly = vmlaq_f32( g_XMExpEst1, poly, y ); + poly = vmlaq_f32( g_XMOne, poly, y ); + + int32x4_t biased = vaddq_s32(itrunc, g_XMExponentBias); + biased = vshlq_n_s32(biased, 23); + float32x4_t result0 = XMVectorDivide(biased, poly); + + biased = vaddq_s32(itrunc, g_XM253); + biased = vshlq_n_s32(biased, 23); + float32x4_t result1 = XMVectorDivide(biased, poly); + result1 = vmulq_f32(g_XMMinNormal.v,
result1); + + // Use selection to handle the cases + // if (V is NaN) -> QNaN; + // else if (V sign bit set) + // if (V > -150) + // if (V.exponent < -126) -> result1 + // else -> result0 + // else -> +0 + // else + // if (V < 128) -> result0 + // else -> +inf + + int32x4_t comp = vcltq_s32( V, g_XMBin128); + float32x4_t result2 = vbslq_f32( comp, result0, g_XMInfinity ); + + comp = vcltq_s32(itrunc, g_XMSubnormalExponent); + float32x4_t result3 = vbslq_f32( comp, result1, result0 ); + + comp = vcltq_s32(V, g_XMBinNeg150); + float32x4_t result4 = vbslq_f32( comp, result3, g_XMZero ); + + int32x4_t sign = vandq_s32(V, g_XMNegativeZero); + comp = vceqq_s32(sign, g_XMNegativeZero); + float32x4_t result5 = vbslq_f32( comp, result4, result2 ); + + int32x4_t t0 = vandq_s32(V, g_XMQNaNTest); + int32x4_t t1 = vandq_s32(V, g_XMInfinity); + t0 = vceqq_s32(t0, g_XMZero); + t1 = vceqq_s32(t1, g_XMInfinity); + int32x4_t isNaN = vbicq_s32( t1,t0); + + float32x4_t vResult = vbslq_f32( isNaN, g_XMQNaN, result5 ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i itrunc = _mm_cvttps_epi32(V); + __m128 ftrunc = _mm_cvtepi32_ps(itrunc); + __m128 y = _mm_sub_ps(V, ftrunc); + __m128 poly = _mm_mul_ps(g_XMExpEst7, y); + poly = _mm_add_ps(g_XMExpEst6, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst5, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst4, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst3, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst2, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst1, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMOne, poly); + + __m128i biased = _mm_add_epi32(itrunc, g_XMExponentBias); + biased = _mm_slli_epi32(biased, 23); + __m128 result0 = _mm_div_ps(_mm_castsi128_ps(biased), poly); + + biased = _mm_add_epi32(itrunc, g_XM253); + biased = _mm_slli_epi32(biased, 23); + __m128 result1 = _mm_div_ps(_mm_castsi128_ps(biased), poly); + result1 = _mm_mul_ps(g_XMMinNormal.v, result1); + + // Use selection to handle the cases + // if (V is NaN) -> QNaN; + // else if (V sign bit set) + // if (V > -150) + // if (V.exponent < -126) -> result1 + // else -> result0 + // else -> +0 + // else + // if (V < 128) -> result0 + // else -> +inf + + __m128i comp = _mm_cmplt_epi32( _mm_castps_si128(V), g_XMBin128); + __m128i select0 = _mm_and_si128(comp, _mm_castps_si128(result0)); + __m128i select1 = _mm_andnot_si128(comp, g_XMInfinity); + __m128i result2 = _mm_or_si128(select0, select1); + + comp = _mm_cmplt_epi32(itrunc, g_XMSubnormalExponent); + select1 = _mm_and_si128(comp, _mm_castps_si128(result1)); + select0 = _mm_andnot_si128(comp, _mm_castps_si128(result0)); + __m128i result3 = _mm_or_si128(select0, select1); + + comp = _mm_cmplt_epi32(_mm_castps_si128(V), g_XMBinNeg150); + select0 = _mm_and_si128(comp, result3); + select1 = _mm_andnot_si128(comp, g_XMZero); + __m128i result4 = _mm_or_si128(select0, select1); + + __m128i sign = _mm_and_si128(_mm_castps_si128(V), g_XMNegativeZero); + comp = _mm_cmpeq_epi32(sign, g_XMNegativeZero); + select0 = _mm_and_si128(comp, result4); + select1 = _mm_andnot_si128(comp, result2); + __m128i result5 = _mm_or_si128(select0, select1); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + 
select1 = _mm_andnot_si128(isNaN, result5); + __m128i vResult = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(vResult); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExpE +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = expf(V.vector4_f32[0]); + Result.vector4_f32[1] = expf(V.vector4_f32[1]); + Result.vector4_f32[2] = expf(V.vector4_f32[2]); + Result.vector4_f32[3] = expf(V.vector4_f32[3]); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // expE(V) = exp2(V * log2(e)) + float32x4_t Ve = vmulq_f32(g_XMLgE, V); + + int32x4_t itrunc = vcvtq_s32_f32(Ve); + float32x4_t ftrunc = vcvtq_f32_s32(itrunc); + float32x4_t y = vsubq_f32(Ve, ftrunc); + + float32x4_t poly = vmlaq_f32( g_XMExpEst6, g_XMExpEst7, y ); + poly = vmlaq_f32( g_XMExpEst5, poly, y ); + poly = vmlaq_f32( g_XMExpEst4, poly, y ); + poly = vmlaq_f32( g_XMExpEst3, poly, y ); + poly = vmlaq_f32( g_XMExpEst2, poly, y ); + poly = vmlaq_f32( g_XMExpEst1, poly, y ); + poly = vmlaq_f32( g_XMOne, poly, y ); + + int32x4_t biased = vaddq_s32(itrunc, g_XMExponentBias); + biased = vshlq_n_s32(biased, 23); + float32x4_t result0 = XMVectorDivide(biased, poly); + + biased = vaddq_s32(itrunc, g_XM253); + biased = vshlq_n_s32(biased, 23); + float32x4_t result1 = XMVectorDivide(biased, poly); + result1 = vmulq_f32(g_XMMinNormal.v, result1); + + // Use selection to handle the cases + // if (V is NaN) -> QNaN; + // else if (V sign bit set) + // if (V > -150) + // if (V.exponent < -126) -> result1 + // else -> result0 + // else -> +0 + // else + // if (V < 128) -> result0 + // else -> +inf + + int32x4_t comp = vcltq_s32( Ve, g_XMBin128); + float32x4_t result2 = vbslq_f32( comp, result0, g_XMInfinity ); + + comp = vcltq_s32(itrunc, g_XMSubnormalExponent); + float32x4_t result3 = vbslq_f32( comp, result1, result0 ); + + comp = vcltq_s32(Ve, g_XMBinNeg150); + float32x4_t result4 = vbslq_f32( comp, result3, g_XMZero ); + + int32x4_t sign = vandq_s32(Ve, g_XMNegativeZero); + comp = vceqq_s32(sign, g_XMNegativeZero); + float32x4_t result5 = vbslq_f32( comp, result4, result2 ); + + int32x4_t t0 = vandq_s32(Ve, g_XMQNaNTest); + int32x4_t t1 = vandq_s32(Ve, g_XMInfinity); + t0 = vceqq_s32(t0, g_XMZero); + t1 = vceqq_s32(t1, g_XMInfinity); + int32x4_t isNaN = vbicq_s32( t1,t0); + + float32x4_t vResult = vbslq_f32( isNaN, g_XMQNaN, result5 ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // expE(V) = exp2(V * log2(e)) + __m128 Ve = _mm_mul_ps(g_XMLgE, V); + + __m128i itrunc = _mm_cvttps_epi32(Ve); + __m128 ftrunc = _mm_cvtepi32_ps(itrunc); + __m128 y = _mm_sub_ps(Ve, ftrunc); + __m128 poly = _mm_mul_ps(g_XMExpEst7, y); + poly = _mm_add_ps(g_XMExpEst6, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst5, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst4, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst3, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst2, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMExpEst1, poly); + poly = _mm_mul_ps(poly, y); + poly = _mm_add_ps(g_XMOne, poly); + + __m128i biased = _mm_add_epi32(itrunc, g_XMExponentBias); + biased = _mm_slli_epi32(biased, 23); + __m128 result0 = _mm_div_ps(_mm_castsi128_ps(biased), poly); + + biased = _mm_add_epi32(itrunc, g_XM253); + biased = _mm_slli_epi32(biased, 23); + __m128 result1 = _mm_div_ps(_mm_castsi128_ps(biased), poly); + result1 =
_mm_mul_ps(g_XMMinNormal.v, result1); + + // Use selection to handle the cases + // if (V is NaN) -> QNaN; + // else if (V sign bit set) + // if (V > -150) + // if (V.exponent < -126) -> result1 + // else -> result0 + // else -> +0 + // else + // if (V < 128) -> result0 + // else -> +inf + + __m128i comp = _mm_cmplt_epi32( _mm_castps_si128(Ve), g_XMBin128); + __m128i select0 = _mm_and_si128(comp, _mm_castps_si128(result0)); + __m128i select1 = _mm_andnot_si128(comp, g_XMInfinity); + __m128i result2 = _mm_or_si128(select0, select1); + + comp = _mm_cmplt_epi32(itrunc, g_XMSubnormalExponent); + select1 = _mm_and_si128(comp, _mm_castps_si128(result1)); + select0 = _mm_andnot_si128(comp, _mm_castps_si128(result0)); + __m128i result3 = _mm_or_si128(select0, select1); + + comp = _mm_cmplt_epi32(_mm_castps_si128(Ve), g_XMBinNeg150); + select0 = _mm_and_si128(comp, result3); + select1 = _mm_andnot_si128(comp, g_XMZero); + __m128i result4 = _mm_or_si128(select0, select1); + + __m128i sign = _mm_and_si128(_mm_castps_si128(Ve), g_XMNegativeZero); + comp = _mm_cmpeq_epi32(sign, g_XMNegativeZero); + select0 = _mm_and_si128(comp, result4); + select1 = _mm_andnot_si128(comp, result2); + __m128i result5 = _mm_or_si128(select0, select1); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(Ve), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(Ve), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result5); + __m128i vResult = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(vResult); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorExp +( + FXMVECTOR V +) +{ + return XMVectorExp2(V); +} + +//------------------------------------------------------------------------------ + +#if defined(_XM_SSE_INTRINSICS_) + +namespace Internal +{ + inline __m128i multi_sll_epi32(__m128i value, __m128i count) + { + __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0,0,0,0)); + __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0,0,0,0)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r0 = _mm_sll_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1,1,1,1)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1,1,1,1)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r1 = _mm_sll_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2,2,2,2)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2,2,2,2)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r2 = _mm_sll_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(3,3,3,3)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3,3,3,3)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r3 = _mm_sll_epi32(v, c); + + // (r0,r0,r1,r1) + __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1), _MM_SHUFFLE(0,0,0,0)); + // (r2,r2,r3,r3) + __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), _mm_castsi128_ps(r3), _MM_SHUFFLE(0,0,0,0)); + // (r0,r1,r2,r3) + __m128 result = _mm_shuffle_ps(r01, r23, _MM_SHUFFLE(2,0,2,0)); + return _mm_castps_si128(result); + } + + inline __m128i multi_srl_epi32(__m128i value, __m128i count) + { + __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0,0,0,0)); + __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0,0,0,0)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r0 = _mm_srl_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1,1,1,1)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1,1,1,1)); + 
c = _mm_and_si128(c, g_XMMaskX); + __m128i r1 = _mm_srl_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2,2,2,2)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2,2,2,2)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r2 = _mm_srl_epi32(v, c); + + v = _mm_shuffle_epi32(value, _MM_SHUFFLE(3,3,3,3)); + c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3,3,3,3)); + c = _mm_and_si128(c, g_XMMaskX); + __m128i r3 = _mm_srl_epi32(v, c); + + // (r0,r0,r1,r1) + __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1), _MM_SHUFFLE(0,0,0,0)); + // (r2,r2,r3,r3) + __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), _mm_castsi128_ps(r3), _MM_SHUFFLE(0,0,0,0)); + // (r0,r1,r2,r3) + __m128 result = _mm_shuffle_ps(r01, r23, _MM_SHUFFLE(2,0,2,0)); + return _mm_castps_si128(result); + } + + inline __m128i GetLeadingBit(const __m128i value) + { + static const XMVECTORI32 g_XM0000FFFF = {0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF}; + static const XMVECTORI32 g_XM000000FF = {0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF}; + static const XMVECTORI32 g_XM0000000F = {0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F}; + static const XMVECTORI32 g_XM00000003 = {0x00000003, 0x00000003, 0x00000003, 0x00000003}; + + __m128i v = value, r, c, b, s; + + c = _mm_cmpgt_epi32(v, g_XM0000FFFF); // c = (v > 0xFFFF) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + r = _mm_slli_epi32(b, 4); // r = (b << 4) + v = multi_srl_epi32(v, r); // v = (v >> r) + + c = _mm_cmpgt_epi32(v, g_XM000000FF); // c = (v > 0xFF) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + s = _mm_slli_epi32(b, 3); // s = (b << 3) + v = multi_srl_epi32(v, s); // v = (v >> s) + r = _mm_or_si128(r, s); // r = (r | s) + + c = _mm_cmpgt_epi32(v, g_XM0000000F); // c = (v > 0xF) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + s = _mm_slli_epi32(b, 2); // s = (b << 2) + v = multi_srl_epi32(v, s); // v = (v >> s) + r = _mm_or_si128(r, s); // r = (r | s) + + c = _mm_cmpgt_epi32(v, g_XM00000003); // c = (v > 0x3) + b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0) + s = _mm_slli_epi32(b, 1); // s = (b << 1) + v = multi_srl_epi32(v, s); // v = (v >> s) + r = _mm_or_si128(r, s); // r = (r | s) + + s = _mm_srli_epi32(v, 1); + r = _mm_or_si128(r, s); + return r; + } +} // namespace Internal + +#endif // _XM_SSE_INTRINSICS_ + +#if defined(_XM_ARM_NEON_INTRINSICS_) + +namespace Internal +{ + inline int32x4_t GetLeadingBit(const int32x4_t value) + { + static const XMVECTORI32 g_XM0000FFFF = {0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF}; + static const XMVECTORI32 g_XM000000FF = {0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF}; + static const XMVECTORI32 g_XM0000000F = {0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F}; + static const XMVECTORI32 g_XM00000003 = {0x00000003, 0x00000003, 0x00000003, 0x00000003}; + + int32x4_t v = value, r, c, b, s; + + c = vcgtq_s32(v, g_XM0000FFFF); // c = (v > 0xFFFF) + b = vshrq_n_u32(c, 31); // b = (c ? 1 : 0) + r = vshlq_n_s32(b, 4); // r = (b << 4) + r = vnegq_s32( r ); + v = vshlq_u32( v, r ); // v = (v >> r) + + c = vcgtq_s32(v, g_XM000000FF); // c = (v > 0xFF) + b = vshrq_n_u32(c, 31); // b = (c ? 1 : 0) + s = vshlq_n_s32(b, 3); // s = (b << 3) + s = vnegq_s32( s ); + v = vshlq_u32(v, s); // v = (v >> s) + r = vorrq_s32(r, s); // r = (r | s) + + c = vcgtq_s32(v, g_XM0000000F); // c = (v > 0xF) + b = vshrq_n_u32(c, 31); // b = (c ? 
1 : 0) + s = vshlq_n_s32(b, 2); // s = (b << 2) + s = vnegq_s32( s ); + v = vshlq_u32(v, s); // v = (v >> s) + r = vorrq_s32(r, s); // r = (r | s) + + c = vcgtq_s32(v, g_XM00000003); // c = (v > 0x3) + b = vshrq_n_u32(c, 31); // b = (c ? 1 : 0) + s = vshlq_n_s32(b, 1); // s = (b << 1) + s = vnegq_s32( s ); + v = vshlq_u32(v, s); // v = (v >> s) + r = vorrq_s32(r, s); // r = (r | s) + + s = vshrq_n_u32(v, 1); + r = vorrq_s32(r, s); + return r; + } + +} // namespace Internal + +#endif + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLog2 +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + const float fScale = 1.4426950f; // (1.0f / logf(2.0f)); + + XMVECTOR Result; + Result.vector4_f32[0] = logf(V.vector4_f32[0])*fScale; + Result.vector4_f32[1] = logf(V.vector4_f32[1])*fScale; + Result.vector4_f32[2] = logf(V.vector4_f32[2])*fScale; + Result.vector4_f32[3] = logf(V.vector4_f32[3])*fScale; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t rawBiased = vandq_s32(V, g_XMInfinity); + int32x4_t trailing = vandq_s32(V, g_XMQNaNTest); + int32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. + int32x4_t biased = vshrq_n_u32(rawBiased, 23); + int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); + int32x4_t trailingNor = trailing; + + // Compute exponent and significand for subnormals. + int32x4_t leading = Internal::GetLeadingBit(trailing); + int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading); + int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift); + int32x4_t trailingSub = vshlq_u32(trailing, shift); + trailingSub = vandq_s32(trailingSub, g_XMQNaNTest); + int32x4_t e = vbslq_f32( isExponentZero, exponentSub, exponentNor ); + int32x4_t t = vbslq_f32( isExponentZero, trailingSub, trailingNor ); + + // Compute the approximation. 
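+    // The trick below (a sketch of the idea, inferred from the constants
+    // used): OR-ing the trailing bits t with the bit pattern of 1.0f yields
+    // a float m in [1,2), so y = m - 1 lies in [0,1). The g_XMLogEst0..7
+    // constants appear to be minimax coefficients for log2(1+y) on that
+    // interval, evaluated in Horner form, with the unbiased exponent e added
+    // last: log2(V) = e + log2(1+y).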
+ int32x4_t tmp = vorrq_s32(g_XMOne, t); + float32x4_t y = vsubq_f32(tmp, g_XMOne); + + float32x4_t log2 = vmlaq_f32( g_XMLogEst6, g_XMLogEst7, y ); + log2 = vmlaq_f32( g_XMLogEst5, log2, y ); + log2 = vmlaq_f32( g_XMLogEst4, log2, y ); + log2 = vmlaq_f32( g_XMLogEst3, log2, y ); + log2 = vmlaq_f32( g_XMLogEst2, log2, y ); + log2 = vmlaq_f32( g_XMLogEst1, log2, y ); + log2 = vmlaq_f32( g_XMLogEst0, log2, y ); + log2 = vmlaq_f32( vcvtq_f32_s32(e), log2, y ); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + int32x4_t isInfinite = vandq_s32((V), g_XMAbsMask); + isInfinite = vceqq_s32(isInfinite, g_XMInfinity); + + int32x4_t isGreaterZero = vcgtq_s32((V), g_XMZero); + int32x4_t isNotFinite = vcgtq_s32((V), g_XMInfinity); + int32x4_t isPositive = vbicq_s32( isGreaterZero,isNotFinite); + + int32x4_t isZero = vandq_s32((V), g_XMAbsMask); + isZero = vceqq_s32(isZero, g_XMZero); + + int32x4_t t0 = vandq_s32((V), g_XMQNaNTest); + int32x4_t t1 = vandq_s32((V), g_XMInfinity); + t0 = vceqq_s32(t0, g_XMZero); + t1 = vceqq_s32(t1, g_XMInfinity); + int32x4_t isNaN = vbicq_s32( t1,t0); + + float32x4_t result = vbslq_f32( isInfinite, g_XMInfinity, log2 ); + tmp = vbslq_f32( isZero, g_XMNegInfinity, g_XMNegQNaN ); + result = vbslq_f32(isPositive, result, tmp); + result = vbslq_f32(isNaN, g_XMQNaN, result ); + return result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. + __m128i biased = _mm_srli_epi32(rawBiased, 23); + __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias); + __m128i trailingNor = trailing; + + // Compute exponent and significand for subnormals. + __m128i leading = Internal::GetLeadingBit(trailing); + __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading); + __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift); + __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift); + trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest); + + __m128i select0 = _mm_and_si128(isExponentZero, exponentSub); + __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor); + __m128i e = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isExponentZero, trailingSub); + select1 = _mm_andnot_si128(isExponentZero, trailingNor); + __m128i t = _mm_or_si128(select0, select1); + + // Compute the approximation. 
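+    // Same reduction as the NEON path: y = significand - 1 in [0,1), fed to
+    // the g_XMLogEst polynomial. The and/andnot/or triples used throughout
+    // this path are the standard SSE2 branchless select,
+    // (mask & a) | (~mask & b), since blend instructions only arrive with
+    // SSE4.1.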
+ __m128i tmp = _mm_or_si128(g_XMOne, t); + __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne); + + __m128 log2 = _mm_mul_ps(g_XMLogEst7, y); + log2 = _mm_add_ps(g_XMLogEst6, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst5, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst4, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst3, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst2, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst1, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(g_XMLogEst0, log2); + log2 = _mm_mul_ps(log2, y); + log2 = _mm_add_ps(log2, _mm_cvtepi32_ps(e)); + + // if (x is NaN) -> QNaN + // else if (V is positive) + // if (V is infinite) -> +inf + // else -> log2(V) + // else + // if (V is zero) -> -inf + // else -> -QNaN + + __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity); + + __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero); + __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity); + __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero); + + __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask); + isZero = _mm_cmpeq_epi32(isZero, g_XMZero); + + __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest); + __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity); + t0 = _mm_cmpeq_epi32(t0, g_XMZero); + t1 = _mm_cmpeq_epi32(t1, g_XMInfinity); + __m128i isNaN = _mm_andnot_si128(t0, t1); + + select0 = _mm_and_si128(isInfinite, g_XMInfinity); + select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2)); + __m128i result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isZero, g_XMNegInfinity); + select1 = _mm_andnot_si128(isZero, g_XMNegQNaN); + tmp = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isPositive, result); + select1 = _mm_andnot_si128(isPositive, tmp); + result = _mm_or_si128(select0, select1); + + select0 = _mm_and_si128(isNaN, g_XMQNaN); + select1 = _mm_andnot_si128(isNaN, result); + result = _mm_or_si128(select0, select1); + + return _mm_castsi128_ps(result); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLogE +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = logf(V.vector4_f32[0]); + Result.vector4_f32[1] = logf(V.vector4_f32[1]); + Result.vector4_f32[2] = logf(V.vector4_f32[2]); + Result.vector4_f32[3] = logf(V.vector4_f32[3]); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t rawBiased = vandq_s32(V, g_XMInfinity); + int32x4_t trailing = vandq_s32(V, g_XMQNaNTest); + int32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased); + + // Compute exponent and significand for normals. + int32x4_t biased = vshrq_n_u32(rawBiased, 23); + int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias); + int32x4_t trailingNor = trailing; + + // Compute exponent and significand for subnormals. 
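+    // Subnormals have a zero exponent field and no implicit leading 1, so
+    // they are renormalized by hand: find the highest set bit of the
+    // trailing bits, shift it up to the implicit-one position
+    // (g_XMNumTrailing presumably holds 23), and lower the exponent by the
+    // same amount from the subnormal base (g_XMSubnormalExponent,
+    // presumably -126).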
+    int32x4_t leading = Internal::GetLeadingBit(trailing);
+    int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading);
+    int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift);
+    int32x4_t trailingSub = vshlq_u32(trailing, shift);
+    trailingSub = vandq_s32(trailingSub, g_XMQNaNTest);
+    int32x4_t e = vbslq_f32( isExponentZero, exponentSub, exponentNor );
+    int32x4_t t = vbslq_f32( isExponentZero, trailingSub, trailingNor );
+
+    // Compute the approximation.
+    int32x4_t tmp = vorrq_s32(g_XMOne, t);
+    float32x4_t y = vsubq_f32(tmp, g_XMOne);
+
+    float32x4_t log2 = vmlaq_f32( g_XMLogEst6, g_XMLogEst7, y );
+    log2 = vmlaq_f32( g_XMLogEst5, log2, y );
+    log2 = vmlaq_f32( g_XMLogEst4, log2, y );
+    log2 = vmlaq_f32( g_XMLogEst3, log2, y );
+    log2 = vmlaq_f32( g_XMLogEst2, log2, y );
+    log2 = vmlaq_f32( g_XMLogEst1, log2, y );
+    log2 = vmlaq_f32( g_XMLogEst0, log2, y );
+    log2 = vmlaq_f32( vcvtq_f32_s32(e), log2, y );
+
+    log2 = vmulq_f32(g_XMInvLgE, log2);
+
+    // if (V is NaN) -> QNaN
+    // else if (V is positive)
+    //     if (V is infinite) -> +inf
+    //     else -> logE(V)
+    // else
+    //     if (V is zero) -> -inf
+    //     else -> -QNaN
+
+    int32x4_t isInfinite = vandq_s32((V), g_XMAbsMask);
+    isInfinite = vceqq_s32(isInfinite, g_XMInfinity);
+
+    int32x4_t isGreaterZero = vcgtq_s32((V), g_XMZero);
+    int32x4_t isNotFinite = vcgtq_s32((V), g_XMInfinity);
+    int32x4_t isPositive = vbicq_s32(isGreaterZero, isNotFinite);
+
+    int32x4_t isZero = vandq_s32((V), g_XMAbsMask);
+    isZero = vceqq_s32(isZero, g_XMZero);
+
+    int32x4_t t0 = vandq_s32((V), g_XMQNaNTest);
+    int32x4_t t1 = vandq_s32((V), g_XMInfinity);
+    t0 = vceqq_s32(t0, g_XMZero);
+    t1 = vceqq_s32(t1, g_XMInfinity);
+    int32x4_t isNaN = vbicq_s32(t1, t0);
+
+    float32x4_t result = vbslq_f32( isInfinite, g_XMInfinity, log2 );
+    tmp = vbslq_f32( isZero, g_XMNegInfinity, g_XMNegQNaN );
+    result = vbslq_f32(isPositive, result, tmp);
+    result = vbslq_f32(isNaN, g_XMQNaN, result );
+    return result;
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
+    __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
+    __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased);
+
+    // Compute exponent and significand for normals.
+    __m128i biased = _mm_srli_epi32(rawBiased, 23);
+    __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias);
+    __m128i trailingNor = trailing;
+
+    // Compute exponent and significand for subnormals.
+    __m128i leading = Internal::GetLeadingBit(trailing);
+    __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading);
+    __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift);
+    __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift);
+    trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest);
+
+    __m128i select0 = _mm_and_si128(isExponentZero, exponentSub);
+    __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor);
+    __m128i e = _mm_or_si128(select0, select1);
+
+    select0 = _mm_and_si128(isExponentZero, trailingSub);
+    select1 = _mm_andnot_si128(isExponentZero, trailingNor);
+    __m128i t = _mm_or_si128(select0, select1);
+
+    // Compute the approximation.
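+    // This is the same log2 kernel as XMVectorLog2; the extra multiply by
+    // g_XMInvLgE further below converts the result to a natural logarithm,
+    // using ln(V) = log2(V) * ln(2) with 1/log2(e) = ln(2) ~= 0.6931472.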
+    __m128i tmp = _mm_or_si128(g_XMOne, t);
+    __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne);
+
+    __m128 log2 = _mm_mul_ps(g_XMLogEst7, y);
+    log2 = _mm_add_ps(g_XMLogEst6, log2);
+    log2 = _mm_mul_ps(log2, y);
+    log2 = _mm_add_ps(g_XMLogEst5, log2);
+    log2 = _mm_mul_ps(log2, y);
+    log2 = _mm_add_ps(g_XMLogEst4, log2);
+    log2 = _mm_mul_ps(log2, y);
+    log2 = _mm_add_ps(g_XMLogEst3, log2);
+    log2 = _mm_mul_ps(log2, y);
+    log2 = _mm_add_ps(g_XMLogEst2, log2);
+    log2 = _mm_mul_ps(log2, y);
+    log2 = _mm_add_ps(g_XMLogEst1, log2);
+    log2 = _mm_mul_ps(log2, y);
+    log2 = _mm_add_ps(g_XMLogEst0, log2);
+    log2 = _mm_mul_ps(log2, y);
+    log2 = _mm_add_ps(log2, _mm_cvtepi32_ps(e));
+
+    log2 = _mm_mul_ps(g_XMInvLgE, log2);
+
+    // if (V is NaN) -> QNaN
+    // else if (V is positive)
+    //     if (V is infinite) -> +inf
+    //     else -> logE(V)
+    // else
+    //     if (V is zero) -> -inf
+    //     else -> -QNaN
+
+    __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
+    isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity);
+
+    __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero);
+    __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity);
+    __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero);
+
+    __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
+    isZero = _mm_cmpeq_epi32(isZero, g_XMZero);
+
+    __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
+    __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
+    t0 = _mm_cmpeq_epi32(t0, g_XMZero);
+    t1 = _mm_cmpeq_epi32(t1, g_XMInfinity);
+    __m128i isNaN = _mm_andnot_si128(t0, t1);
+
+    select0 = _mm_and_si128(isInfinite, g_XMInfinity);
+    select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2));
+    __m128i result = _mm_or_si128(select0, select1);
+
+    select0 = _mm_and_si128(isZero, g_XMNegInfinity);
+    select1 = _mm_andnot_si128(isZero, g_XMNegQNaN);
+    tmp = _mm_or_si128(select0, select1);
+
+    select0 = _mm_and_si128(isPositive, result);
+    select1 = _mm_andnot_si128(isPositive, tmp);
+    result = _mm_or_si128(select0, select1);
+
+    select0 = _mm_and_si128(isNaN, g_XMQNaN);
+    select1 = _mm_andnot_si128(isNaN, result);
+    result = _mm_or_si128(select0, select1);
+
+    return _mm_castsi128_ps(result);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorLog
+(
+    FXMVECTOR V
+)
+{
+    // For legacy reasons XMVectorLog is base-2, mirroring XMVectorExp above;
+    // use XMVectorLogE for the natural logarithm.
+    return XMVectorLog2(V);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVectorPow
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Result;
+    Result.vector4_f32[0] = powf(V1.vector4_f32[0], V2.vector4_f32[0]);
+    Result.vector4_f32[1] = powf(V1.vector4_f32[1], V2.vector4_f32[1]);
+    Result.vector4_f32[2] = powf(V1.vector4_f32[2], V2.vector4_f32[2]);
+    Result.vector4_f32[3] = powf(V1.vector4_f32[3], V2.vector4_f32[3]);
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    XMVECTORF32 vResult = {
+        powf(vgetq_lane_f32(V1, 0), vgetq_lane_f32(V2, 0)),
+        powf(vgetq_lane_f32(V1, 1), vgetq_lane_f32(V2, 1)),
+        powf(vgetq_lane_f32(V1, 2), vgetq_lane_f32(V2, 2)),
+        powf(vgetq_lane_f32(V1, 3), vgetq_lane_f32(V2, 3))
+    };
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    __declspec(align(16)) float a[4];
+    __declspec(align(16)) float b[4];
+    _mm_store_ps( a, V1 );
+    _mm_store_ps( b, V2 );
+    XMVECTOR vResult = _mm_setr_ps(
+        powf(a[0],b[0]),
+        powf(a[1],b[1]),
+        powf(a[2],b[2]),
+        powf(a[3],b[3]));
+    return vResult;
+#endif
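+
+    // Note: neither SSE nor ARMv7 NEON exposes a vector pow instruction, so
+    // both intrinsic paths above fall back to scalar powf per lane.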
+} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorAbs +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_f32[0] = fabsf(V.vector4_f32[0]); + vResult.vector4_f32[1] = fabsf(V.vector4_f32[1]); + vResult.vector4_f32[2] = fabsf(V.vector4_f32[2]); + vResult.vector4_f32[3] = fabsf(V.vector4_f32[3]); + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vabsq_f32( V ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_setzero_ps(); + vResult = _mm_sub_ps(vResult,V); + vResult = _mm_max_ps(vResult,V); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorMod +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + // V1 % V2 = V1 - V2 * truncate(V1 / V2) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Quotient = XMVectorDivide(V1, V2); + Quotient = XMVectorTruncate(Quotient); + XMVECTOR Result = XMVectorNegativeMultiplySubtract(V2, Quotient, V1); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vResult = XMVectorDivide(V1, V2); + vResult = XMVectorTruncate(vResult); + return vmlsq_f32( V1, vResult, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_div_ps(V1, V2); + vResult = XMVectorTruncate(vResult); + vResult = _mm_mul_ps(vResult,V2); + vResult = _mm_sub_ps(V1,vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorModAngles +( + FXMVECTOR Angles +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + XMVECTOR Result; + + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + V = XMVectorMultiply(Angles, g_XMReciprocalTwoPi.v); + V = XMVectorRound(V); + Result = XMVectorNegativeMultiplySubtract(g_XMTwoPi.v, V, Angles); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + XMVECTOR vResult = vmulq_f32(Angles,g_XMReciprocalTwoPi); + // Use the inline function due to complexity for rounding + vResult = XMVectorRound(vResult); + return vmlsq_f32( Angles, vResult, g_XMTwoPi ); +#elif defined(_XM_SSE_INTRINSICS_) + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + XMVECTOR vResult = _mm_mul_ps(Angles,g_XMReciprocalTwoPi); + // Use the inline function due to complexity for rounding + vResult = XMVectorRound(vResult); + vResult = _mm_mul_ps(vResult,g_XMTwoPi); + vResult = _mm_sub_ps(Angles,vResult); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSin +( + FXMVECTOR V +) +{ + // 11-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = sinf( V.vector4_f32[0] ); + Result.vector4_f32[1] = sinf( V.vector4_f32[1] ); + Result.vector4_f32[2] = sinf( V.vector4_f32[2] ); + Result.vector4_f32[3] = sinf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). 
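+    // After XMVectorModAngles, x lies in [-pi,pi]. If |x| > pi/2 the code
+    // below reflects it, x -> (pi - x) for x >= 0 or (-pi - x) for x < 0,
+    // which leaves the sine unchanged: e.g. sin(2.5) == sin(pi - 2.5).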
+ uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32( x ); + float32x4_t rflx = vsubq_f32(c, x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR SC1 = g_XMSinCoefficients1; + const XMVECTOR SC0 = g_XMSinCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, x); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). + __m128 sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR SC1 = g_XMSinCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( SC1, _MM_SHUFFLE(0, 0, 0, 0) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + const XMVECTOR SC0 = g_XMSinCoefficients0; + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, x); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCos +( + FXMVECTOR V +) +{ + // 10-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = cosf( V.vector4_f32[0] ); + Result.vector4_f32[1] = cosf( V.vector4_f32[1] ); + Result.vector4_f32[2] = cosf( V.vector4_f32[2] ); + Result.vector4_f32[3] = cosf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
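+    // Same reflection as in XMVectorSin, but cosine flips sign under it,
+    // cos(pi - x) = -cos(x), hence the extra 'sign' select below: +1 when x
+    // was already within [-pi/2,pi/2], -1 when it had to be reflected.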
+ uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32( x ); + float32x4_t rflx = vsubq_f32(c, x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR CC1 = g_XMCosCoefficients1; + const XMVECTOR CC0 = g_XMCosCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0 ); + + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, sign); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR CC1 = g_XMCosCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( CC1, _MM_SHUFFLE(0, 0, 0, 0) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + const XMVECTOR CC0 = g_XMCosCoefficients0; + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, sign); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorSinCos +( + XMVECTOR* pSin, + XMVECTOR* pCos, + FXMVECTOR V +) +{ + assert(pSin != nullptr); + assert(pCos != nullptr); + + // 11/10-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Sin; + Sin.vector4_f32[0] = sinf( V.vector4_f32[0] ); + Sin.vector4_f32[1] = sinf( V.vector4_f32[1] ); + Sin.vector4_f32[2] = sinf( V.vector4_f32[2] ); + Sin.vector4_f32[3] = sinf( V.vector4_f32[3] ); + + XMVECTOR Cos; + Cos.vector4_f32[0] = cosf( V.vector4_f32[0] ); + Cos.vector4_f32[1] = cosf( V.vector4_f32[1] ); + Cos.vector4_f32[2] = cosf( V.vector4_f32[2] ); + Cos.vector4_f32[3] = cosf( V.vector4_f32[3] ); + + *pSin = Sin; + *pCos = Cos; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x 
= XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32( x ); + float32x4_t rflx = vsubq_f32(c, x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SC1 = g_XMSinCoefficients1; + const XMVECTOR SC0 = g_XMSinCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pSin = vmulq_f32(Result, x); + + // Compute polynomial approximation for cosine + const XMVECTOR CC1 = g_XMCosCoefficients1; + const XMVECTOR CC0 = g_XMCosCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); + Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0); + + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pCos = vmulq_f32(Result, sign); +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). 
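+    // A single shared range reduction feeds both polynomials: the reduced x
+    // is used as-is for sine (the reflection preserves it), while the cosine
+    // result is multiplied by the reflection sign at the end.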
+ XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation of sine + const XMVECTOR SC1 = g_XMSinCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( SC1, _MM_SHUFFLE(0, 0, 0, 0) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + const XMVECTOR SC0 = g_XMSinCoefficients0; + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, x); + *pSin = Result; + + // Compute polynomial approximation of cosine + const XMVECTOR CC1 = g_XMCosCoefficients1; + vConstants = XM_PERMUTE_PS( CC1, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_mul_ps(vConstants, x2); + + const XMVECTOR CC0 = g_XMCosCoefficients0; + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, sign); + *pCos = Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTan +( + FXMVECTOR V +) +{ + // Cody and Waite algorithm to compute tangent. 
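+    //
+    // The idea (a sketch, as inferred from the constants): the argument is
+    // reduced by a whole number VA of quarter periods, subtracting VA*C0 and
+    // then VA*C1 in two steps, where C0 = 1.570796371f is pi/2 in single
+    // precision and C1 = 6.077100628e-11f a small correction term; the
+    // two-step subtraction loses less accuracy than a single subtraction of
+    // VA*(pi/2) would. The parity of VA (tested with the 0x1 mask) then
+    // picks between tan of the remainder and -cot of it, since
+    // tan(x + k*pi/2) alternates between the two.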
+ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = tanf( V.vector4_f32[0] ); + Result.vector4_f32[1] = tanf( V.vector4_f32[1] ); + Result.vector4_f32[2] = tanf( V.vector4_f32[2] ); + Result.vector4_f32[3] = tanf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + static const XMVECTORF32 TanCoefficients0 = {1.0f, -4.667168334e-1f, 2.566383229e-2f, -3.118153191e-4f}; + static const XMVECTORF32 TanCoefficients1 = {4.981943399e-7f, -1.333835001e-1f, 3.424887824e-3f, -1.786170734e-5f}; + static const XMVECTORF32 TanConstants = {1.570796371f, 6.077100628e-11f, 0.000244140625f, 0.63661977228f /*2 / Pi*/ }; + static const XMVECTORU32 Mask = {0x1, 0x1, 0x1, 0x1}; + + XMVECTOR TwoDivPi = XMVectorSplatW(TanConstants.v); + + XMVECTOR Zero = XMVectorZero(); + + XMVECTOR C0 = XMVectorSplatX(TanConstants.v); + XMVECTOR C1 = XMVectorSplatY(TanConstants.v); + XMVECTOR Epsilon = XMVectorSplatZ(TanConstants.v); + + XMVECTOR VA = XMVectorMultiply(V, TwoDivPi); + + VA = XMVectorRound(VA); + + XMVECTOR VC = XMVectorNegativeMultiplySubtract(VA, C0, V); + + XMVECTOR VB = XMVectorAbs(VA); + + VC = XMVectorNegativeMultiplySubtract(VA, C1, VC); + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + VB = vcvtq_u32_f32( VB ); +#elif defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + reinterpret_cast<__m128i *>(&VB)[0] = _mm_cvttps_epi32(VB); +#else + for (size_t i = 0; i < 4; i++) + { + VB.vector4_u32[i] = (uint32_t)VB.vector4_f32[i]; + } +#endif + + XMVECTOR VC2 = XMVectorMultiply(VC, VC); + + XMVECTOR T7 = XMVectorSplatW(TanCoefficients1.v); + XMVECTOR T6 = XMVectorSplatZ(TanCoefficients1.v); + XMVECTOR T4 = XMVectorSplatX(TanCoefficients1.v); + XMVECTOR T3 = XMVectorSplatW(TanCoefficients0.v); + XMVECTOR T5 = XMVectorSplatY(TanCoefficients1.v); + XMVECTOR T2 = XMVectorSplatZ(TanCoefficients0.v); + XMVECTOR T1 = XMVectorSplatY(TanCoefficients0.v); + XMVECTOR T0 = XMVectorSplatX(TanCoefficients0.v); + + XMVECTOR VBIsEven = XMVectorAndInt(VB, Mask.v); + VBIsEven = XMVectorEqualInt(VBIsEven, Zero); + + XMVECTOR N = XMVectorMultiplyAdd(VC2, T7, T6); + XMVECTOR D = XMVectorMultiplyAdd(VC2, T4, T3); + N = XMVectorMultiplyAdd(VC2, N, T5); + D = XMVectorMultiplyAdd(VC2, D, T2); + N = XMVectorMultiply(VC2, N); + D = XMVectorMultiplyAdd(VC2, D, T1); + N = XMVectorMultiplyAdd(VC, N, VC); + XMVECTOR VCNearZero = XMVectorInBounds(VC, Epsilon); + D = XMVectorMultiplyAdd(VC2, D, T0); + + N = XMVectorSelect(N, VC, VCNearZero); + D = XMVectorSelect(D, g_XMOne.v, VCNearZero); + + XMVECTOR R0 = XMVectorNegate(N); + XMVECTOR R1 = XMVectorDivide(N,D); + R0 = XMVectorDivide(D,R0); + + XMVECTOR VIsZero = XMVectorEqual(V, Zero); + + XMVECTOR Result = XMVectorSelect(R0, R1, VBIsEven); + + Result = XMVectorSelect(Result, Zero, VIsZero); + + return Result; + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSinH +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = sinhf( V.vector4_f32[0] ); + Result.vector4_f32[1] = sinhf( V.vector4_f32[1] ); + Result.vector4_f32[2] = sinhf( V.vector4_f32[2] ); + Result.vector4_f32[3] = sinhf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) + + XMVECTOR V1 = vmlaq_f32( g_XMNegativeOne.v, V, 
Scale.v ); + XMVECTOR V2 = vmlsq_f32( g_XMNegativeOne.v, V, Scale.v ); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + + return vsubq_f32(E1, E2); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) + + XMVECTOR V1 = _mm_mul_ps(V, Scale); + V1 = _mm_add_ps(V1,g_XMNegativeOne); + XMVECTOR V2 = _mm_mul_ps(V, Scale); + V2 = _mm_sub_ps(g_XMNegativeOne,V2); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + + return _mm_sub_ps(E1, E2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCosH +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = coshf( V.vector4_f32[0] ); + Result.vector4_f32[1] = coshf( V.vector4_f32[1] ); + Result.vector4_f32[2] = coshf( V.vector4_f32[2] ); + Result.vector4_f32[3] = coshf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) + + XMVECTOR V1 = vmlaq_f32(g_XMNegativeOne.v, V, Scale.v); + XMVECTOR V2 = vmlsq_f32(g_XMNegativeOne.v, V, Scale.v); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + return vaddq_f32(E1, E2); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) + + XMVECTOR V1 = _mm_mul_ps(V,Scale.v); + V1 = _mm_add_ps(V1,g_XMNegativeOne.v); + XMVECTOR V2 = _mm_mul_ps(V, Scale.v); + V2 = _mm_sub_ps(g_XMNegativeOne.v,V2); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + return _mm_add_ps(E1, E2); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTanH +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = tanhf( V.vector4_f32[0] ); + Result.vector4_f32[1] = tanhf( V.vector4_f32[1] ); + Result.vector4_f32[2] = tanhf( V.vector4_f32[2] ); + Result.vector4_f32[3] = tanhf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f) + + XMVECTOR E = vmulq_f32(V, Scale.v); + E = XMVectorExp(E); + E = vmlaq_f32( g_XMOneHalf.v, E, g_XMOneHalf.v ); + E = XMVectorReciprocal(E); + return vsubq_f32(g_XMOne.v, E); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f) + + XMVECTOR E = _mm_mul_ps(V, Scale.v); + E = XMVectorExp(E); + E = _mm_mul_ps(E,g_XMOneHalf.v); + E = _mm_add_ps(E,g_XMOneHalf.v); + E = _mm_div_ps(g_XMOne.v,E); + return _mm_sub_ps(g_XMOne.v,E); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorASin +( + FXMVECTOR V +) +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = asinf( V.vector4_f32[0] ); + Result.vector4_f32[1] = asinf( V.vector4_f32[1] ); + Result.vector4_f32[2] = asinf( V.vector4_f32[2] ); + Result.vector4_f32[3] = asinf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + 
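+    // The kernel actually evaluates acos: p(|V|) * sqrt(1 - |V|), the sqrt
+    // factor matching acos's square-root shape near |V| = 1. Negative
+    // inputs use acos(-x) = pi - acos(x), and the final (pi/2 - acos(V))
+    // step at the bottom converts the result to asin.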
uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); + XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AC1), 1 ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32( nonnegative, t0, t1 ); + t0 = vsubq_f32(g_XMHalfPi, t0); + return t0; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 t0 = _mm_mul_ps(vConstants, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(3, 3, 3, 3) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0,_MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + t0 = _mm_sub_ps(g_XMHalfPi, t0); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorACos +( + FXMVECTOR V +) +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = acosf( V.vector4_f32[0] ); + Result.vector4_f32[1] = acosf( V.vector4_f32[1] ); + Result.vector4_f32[2] = acosf( V.vector4_f32[2] ); + Result.vector4_f32[3] = acosf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + 
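+    // Identical kernel to XMVectorASin above, minus the closing
+    // (pi/2 - t0) subtraction: t0 is already acos(V).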
uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); + XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AC1), 1 ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32( nonnegative, t0, t1 ); + return t0; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 t0 = _mm_mul_ps(vConstants, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(3, 3, 3, 3) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATan +( + FXMVECTOR V +) +{ + // 17-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = atanf( V.vector4_f32[0] ); + Result.vector4_f32[1] = atanf( V.vector4_f32[1] ); + Result.vector4_f32[2] = atanf( V.vector4_f32[2] ); + Result.vector4_f32[3] = atanf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t absV = vabsq_f32(V); + float32x4_t invV = 
XMVectorReciprocal(V); + uint32x4_t comp = vcgtq_f32(V, g_XMOne); + uint32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + comp = vcleq_f32(absV, g_XMOne); + sign = vbslq_f32(comp, g_XMZero, sign); + uint32x4_t x = vbslq_f32(comp, V, invV); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR TC1 = g_XMATanCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(TC1), 0); + XMVECTOR Result = vmlaq_lane_f32( vConstants, x2, vget_high_f32(TC1), 1 ); + + vConstants = vdupq_lane_f32(vget_low_f32(TC1), 1); + Result = vmlaq_f32( vConstants, Result, x2 ); + + vConstants = vdupq_lane_f32(vget_low_f32(TC1), 0); + Result = vmlaq_f32( vConstants, Result, x2 ); + + const XMVECTOR TC0 = g_XMATanCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(TC0), 1); + Result = vmlaq_f32( vConstants, Result, x2 ); + + vConstants = vdupq_lane_f32(vget_high_f32(TC0), 0); + Result = vmlaq_f32( vConstants, Result, x2 ); + + vConstants = vdupq_lane_f32(vget_low_f32(TC0), 1); + Result = vmlaq_f32( vConstants, Result, x2 ); + + vConstants = vdupq_lane_f32(vget_low_f32(TC0), 0); + Result = vmlaq_f32( vConstants, Result, x2 ); + + Result = vmlaq_f32( g_XMOne, Result, x2 ); + Result = vmulq_f32( Result, x ); + + float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi); + result1 = vsubq_f32(result1, Result); + + comp = vceqq_f32(sign, g_XMZero); + Result = vbslq_f32( comp, Result, result1 ); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 absV = XMVectorAbs(V); + __m128 invV = _mm_div_ps(g_XMOne, V); + __m128 comp = _mm_cmpgt_ps(V, g_XMOne); + __m128 select0 = _mm_and_ps(comp, g_XMOne); + __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + __m128 sign = _mm_or_ps(select0, select1); + comp = _mm_cmple_ps(absV, g_XMOne); + select0 = _mm_and_ps(comp, g_XMZero); + select1 = _mm_andnot_ps(comp, sign); + sign = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, V); + select1 = _mm_andnot_ps(comp, invV); + __m128 x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR TC1 = g_XMATanCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + const XMVECTOR TC0 = g_XMATanCoefficients0; + vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, x); + __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); + result1 = _mm_sub_ps(result1, Result); + + comp = _mm_cmpeq_ps(sign, g_XMZero); + select0 = _mm_and_ps(comp, Result); + select1 
= _mm_andnot_ps(comp, result1); + Result = _mm_or_ps(select0, select1); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATan2 +( + FXMVECTOR Y, + FXMVECTOR X +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = atan2f( Y.vector4_f32[0], X.vector4_f32[0] ); + Result.vector4_f32[1] = atan2f( Y.vector4_f32[1], X.vector4_f32[1] ); + Result.vector4_f32[2] = atan2f( Y.vector4_f32[2], X.vector4_f32[2] ); + Result.vector4_f32[3] = atan2f( Y.vector4_f32[3], X.vector4_f32[3] ); + return Result; +#else + + // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions: + + // Y == 0 and X is Negative -> Pi with the sign of Y + // y == 0 and x is positive -> 0 with the sign of y + // Y != 0 and X == 0 -> Pi / 2 with the sign of Y + // Y != 0 and X is Negative -> atan(y/x) + (PI with the sign of Y) + // X == -Infinity and Finite Y -> Pi with the sign of Y + // X == +Infinity and Finite Y -> 0 with the sign of Y + // Y == Infinity and X is Finite -> Pi / 2 with the sign of Y + // Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y + // Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y + + static const XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f}; + + XMVECTOR Zero = XMVectorZero(); + XMVECTOR ATanResultValid = XMVectorTrueInt(); + + XMVECTOR Pi = XMVectorSplatX(ATan2Constants); + XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants); + XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants); + XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants); + + XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero); + XMVECTOR XEqualsZero = XMVectorEqual(X, Zero); + XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); + XIsPositive = XMVectorEqualInt(XIsPositive, Zero); + XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); + XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X); + + XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v); + Pi = XMVectorOrInt(Pi, YSign); + PiOverTwo = XMVectorOrInt(PiOverTwo, YSign); + PiOverFour = XMVectorOrInt(PiOverFour, YSign); + ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign); + + XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive); + XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero); + XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero); + XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); + XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); + XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity); + ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); + + XMVECTOR V = XMVectorDivide(Y, X); + + XMVECTOR R0 = XMVectorATan(V); + + R1 = XMVectorSelect( Pi, g_XMNegativeZero, XIsPositive ); + R2 = XMVectorAdd(R0, R1); + + return XMVectorSelect(Result, R2, ATanResultValid); + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorSinEst +( + FXMVECTOR V +) +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = sinf( V.vector4_f32[0] ); + Result.vector4_f32[1] = sinf( V.vector4_f32[1] ); + Result.vector4_f32[2] = sinf( V.vector4_f32[2] ); + Result.vector4_f32[3] = sinf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with 
sin(y) = sin(x). + uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32( x ); + float32x4_t rflx = vsubq_f32(c, x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, x); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). + __m128 sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, x); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCosEst +( + FXMVECTOR V +) +{ + // 6-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = cosf( V.vector4_f32[0] ); + Result.vector4_f32[1] = cosf( V.vector4_f32[1] ); + Result.vector4_f32[2] = cosf( V.vector4_f32[2] ); + Result.vector4_f32[3] = cosf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32( x ); + float32x4_t rflx = vsubq_f32(c, x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, sign); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
+ XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, sign); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XM_CALLCONV XMVectorSinCosEst +( + XMVECTOR* pSin, + XMVECTOR* pCos, + FXMVECTOR V +) +{ + assert(pSin != nullptr); + assert(pCos != nullptr); + + // 7/6-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Sin; + Sin.vector4_f32[0] = sinf( V.vector4_f32[0] ); + Sin.vector4_f32[1] = sinf( V.vector4_f32[1] ); + Sin.vector4_f32[2] = sinf( V.vector4_f32[2] ); + Sin.vector4_f32[3] = sinf( V.vector4_f32[3] ); + + XMVECTOR Cos; + Cos.vector4_f32[0] = cosf( V.vector4_f32[0] ); + Cos.vector4_f32[1] = cosf( V.vector4_f32[1] ); + Cos.vector4_f32[2] = cosf( V.vector4_f32[2] ); + Cos.vector4_f32[3] = cosf( V.vector4_f32[3] ); + + *pSin = Sin; + *pCos = Cos; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + uint32x4_t sign = vandq_u32(x, g_XMNegativeZero); + uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + float32x4_t absx = vabsq_f32( x ); + float32x4_t rflx = vsubq_f32(c, x); + uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); + XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pSin = vmulq_f32(Result, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); + Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1); + + vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pCos = vmulq_f32(Result, sign); +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). 
+ XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, x); + *pSin = Result; + + // Compute polynomial approximation for cosine + const XMVECTOR CEC = g_XMCosCoefficients1; + vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, sign); + *pCos = Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorTanEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = tanf( V.vector4_f32[0] ); + Result.vector4_f32[1] = tanf( V.vector4_f32[1] ); + Result.vector4_f32[2] = tanf( V.vector4_f32[2] ); + Result.vector4_f32[3] = tanf( V.vector4_f32[3] ); + return Result; +#else + + XMVECTOR OneOverPi = XMVectorSplatW(g_XMTanEstCoefficients.v); + + XMVECTOR V1 = XMVectorMultiply(V, OneOverPi); + V1 = XMVectorRound(V1); + + V1 = XMVectorNegativeMultiplySubtract(g_XMPi.v, V1, V); + + XMVECTOR T0 = XMVectorSplatX(g_XMTanEstCoefficients.v); + XMVECTOR T1 = XMVectorSplatY(g_XMTanEstCoefficients.v); + XMVECTOR T2 = XMVectorSplatZ(g_XMTanEstCoefficients.v); + + XMVECTOR V2T2 = XMVectorNegativeMultiplySubtract(V1, V1, T2); + XMVECTOR V2 = XMVectorMultiply(V1, V1); + XMVECTOR V1T0 = XMVectorMultiply(V1, T0); + XMVECTOR V1T1 = XMVectorMultiply(V1, T1); + + XMVECTOR D = XMVectorReciprocalEst(V2T2); + XMVECTOR N = XMVectorMultiplyAdd(V2, V1T1, V1T0); + + return XMVectorMultiply(N, D); + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorASinEst +( + FXMVECTOR V +) +{ + // 3-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = asinf( V.vector4_f32[0] ); + Result.vector4_f32[1] = asinf( V.vector4_f32[1] ); + Result.vector4_f32[2] = asinf( V.vector4_f32[2] ); + Result.vector4_f32[3] = asinf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
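+ // The estimate relies on asin(V) = pi/2 - acos(V), where acos(|V|) is + // approximated as sqrt(1 - |V|) * P(|V|) for the degree-3 polynomial P in + // g_XMArcEstCoefficients, reflected for negative inputs via + // acos(-x) = pi - acos(x).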
+ float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AEC), 1 ); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32( nonnegative, t0, t1 ); + t0 = vsubq_f32(g_XMHalfPi, t0); + return t0; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 t0 = _mm_mul_ps(vConstants, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + t0 = _mm_sub_ps(g_XMHalfPi, t0); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorACosEst +( + FXMVECTOR V +) +{ + // 3-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = acosf( V.vector4_f32[0] ); + Result.vector4_f32[1] = acosf( V.vector4_f32[1] ); + Result.vector4_f32[2] = acosf( V.vector4_f32[2] ); + Result.vector4_f32[3] = acosf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero); + float32x4_t x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + float32x4_t oneMValue = vsubq_f32(g_XMOne, x); + float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + float32x4_t root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AEC), 1 ); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + t0 = vmulq_f32(t0, root); + + float32x4_t t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32( nonnegative, t0, t1 ); + return t0; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
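+ // Same square-root-weighted polynomial core as XMVectorASinEst above (the + // classic Abramowitz & Stegun style form): acos(x) ~= sqrt(1 - x) * P(x) on + // [0,1], with acos(-x) = pi - acos(x) for negative inputs. Sanity check: at + // V = 0 the root is 1 and the result is the constant term of + // g_XMArcEstCoefficients, approximately pi/2.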
+ __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 t0 = _mm_mul_ps(vConstants, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + return t0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATanEst +( + FXMVECTOR V +) +{ + // 9-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = atanf( V.vector4_f32[0] ); + Result.vector4_f32[1] = atanf( V.vector4_f32[1] ); + Result.vector4_f32[2] = atanf( V.vector4_f32[2] ); + Result.vector4_f32[3] = atanf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t absV = vabsq_f32(V); + float32x4_t invV = XMVectorReciprocalEst(V); + uint32x4_t comp = vcgtq_f32(V, g_XMOne); + uint32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne ); + comp = vcleq_f32(absV, g_XMOne); + sign = vbslq_f32(comp, g_XMZero, sign ); + uint32x4_t x = vbslq_f32(comp, V, invV ); + + float32x4_t x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMATanEstCoefficients1; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + XMVECTOR Result = vmlaq_lane_f32( vConstants, x2, vget_high_f32(AEC), 1 ); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + Result = vmlaq_f32( vConstants, Result, x2 ); + + vConstants = vdupq_lane_f32(vget_low_f32( AEC), 0); + Result = vmlaq_f32( vConstants, Result, x2 ); + + // ATanEstCoefficients0 is already splatted + Result = vmlaq_f32( g_XMATanEstCoefficients0, Result, x2 ); + Result = vmulq_f32( Result, x ); + + float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi); + result1 = vsubq_f32(result1, Result); + + comp = vceqq_f32(sign, g_XMZero); + Result = vbslq_f32( comp, Result, result1 ); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 absV = XMVectorAbs(V); + __m128 invV = _mm_div_ps(g_XMOne, V); + __m128 comp = _mm_cmpgt_ps(V, g_XMOne); + __m128 select0 = _mm_and_ps(comp, g_XMOne); + __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + __m128 sign = _mm_or_ps(select0, select1); + comp = _mm_cmple_ps(absV, g_XMOne); + select0 = _mm_and_ps(comp, g_XMZero); + select1 = _mm_andnot_ps(comp, sign); + sign = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, V); + select1 = _mm_andnot_ps(comp, invV); + __m128 x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMATanEstCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 
1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + // ATanEstCoefficients0 is already splatted + Result = _mm_add_ps(Result, g_XMATanEstCoefficients0); + Result = _mm_mul_ps(Result, x); + __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); + result1 = _mm_sub_ps(result1, Result); + + comp = _mm_cmpeq_ps(sign, g_XMZero); + select0 = _mm_and_ps(comp, Result); + select1 = _mm_andnot_ps(comp, result1); + Result = _mm_or_ps(select0, select1); + return Result; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorATan2Est +( + FXMVECTOR Y, + FXMVECTOR X +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = atan2f( Y.vector4_f32[0], X.vector4_f32[0] ); + Result.vector4_f32[1] = atan2f( Y.vector4_f32[1], X.vector4_f32[1] ); + Result.vector4_f32[2] = atan2f( Y.vector4_f32[2], X.vector4_f32[2] ); + Result.vector4_f32[3] = atan2f( Y.vector4_f32[3], X.vector4_f32[3] ); + return Result; +#else + + static const XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, 2.3561944905f /* Pi*3/4 */}; + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR ATanResultValid = XMVectorTrueInt(); + + XMVECTOR Pi = XMVectorSplatX(ATan2Constants); + XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants); + XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants); + XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants); + + XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero); + XMVECTOR XEqualsZero = XMVectorEqual(X, Zero); + XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); + XIsPositive = XMVectorEqualInt(XIsPositive, Zero); + XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); + XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X); + + XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v); + Pi = XMVectorOrInt(Pi, YSign); + PiOverTwo = XMVectorOrInt(PiOverTwo, YSign); + PiOverFour = XMVectorOrInt(PiOverFour, YSign); + ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign); + + XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive); + XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero); + XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero); + XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); + XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); + XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity); + ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); + + XMVECTOR Reciprocal = XMVectorReciprocalEst(X); + XMVECTOR V = XMVectorMultiply(Y, Reciprocal); + XMVECTOR R0 = XMVectorATanEst(V); + + R1 = XMVectorSelect( Pi, g_XMNegativeZero, XIsPositive ); + R2 = XMVectorAdd(R0, R1); + + Result = XMVectorSelect(Result, R2, ATanResultValid); + + return Result; + +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLerp +( + FXMVECTOR V0, + FXMVECTOR V1, + float t +) +{ + // V0 + t * (V1 - V0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Scale = XMVectorReplicate(t); + XMVECTOR Length = XMVectorSubtract(V1, V0); + return XMVectorMultiplyAdd(Length, Scale, V0); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR L = vsubq_f32( V1, V0 ); + return vmlaq_n_f32( V0, L, t ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR L = _mm_sub_ps( V1, V0 ); + XMVECTOR S = _mm_set_ps1( t ); + XMVECTOR Result = _mm_mul_ps( 
L, S ); + return _mm_add_ps( Result, V0 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorLerpV +( + FXMVECTOR V0, + FXMVECTOR V1, + FXMVECTOR T +) +{ + // V0 + T * (V1 - V0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Length = XMVectorSubtract(V1, V0); + return XMVectorMultiplyAdd(Length, T, V0); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR L = vsubq_f32( V1, V0 ); + return vmlaq_f32( V0, L, T ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Length = _mm_sub_ps( V1, V0 ); + XMVECTOR Result = _mm_mul_ps( Length, T ); + return _mm_add_ps( Result, V0 ); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorHermite +( + FXMVECTOR Position0, + FXMVECTOR Tangent0, + FXMVECTOR Position1, + GXMVECTOR Tangent1, + float t +) +{ + // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + + // (t^3 - 2 * t^2 + t) * Tangent0 + + // (-2 * t^3 + 3 * t^2) * Position1 + + // (t^3 - t^2) * Tangent1 + +#if defined(_XM_NO_INTRINSICS_) + + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = XMVectorReplicate(2.0f * t3 - 3.0f * t2 + 1.0f); + XMVECTOR T0 = XMVectorReplicate(t3 - 2.0f * t2 + t); + XMVECTOR P1 = XMVectorReplicate(-2.0f * t3 + 3.0f * t2); + XMVECTOR T1 = XMVectorReplicate(t3 - t2); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(T0, Tangent0, Result); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(T1, Tangent1, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + float p0 = 2.0f * t3 - 3.0f * t2 + 1.0f; + float t0 = t3 - 2.0f * t2 + t; + float p1 = -2.0f * t3 + 3.0f * t2; + float t1 = t3 - t2; + + XMVECTOR vResult = vmulq_n_f32(Position0, p0 ); + vResult = vmlaq_n_f32( vResult, Tangent0, t0 ); + vResult = vmlaq_n_f32( vResult, Position1, p1 ); + vResult = vmlaq_n_f32( vResult, Tangent1, t1 ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = _mm_set_ps1(2.0f * t3 - 3.0f * t2 + 1.0f); + XMVECTOR T0 = _mm_set_ps1(t3 - 2.0f * t2 + t); + XMVECTOR P1 = _mm_set_ps1(-2.0f * t3 + 3.0f * t2); + XMVECTOR T1 = _mm_set_ps1(t3 - t2); + + XMVECTOR vResult = _mm_mul_ps(P0, Position0); + XMVECTOR vTemp = _mm_mul_ps(T0, Tangent0); + vResult = _mm_add_ps(vResult,vTemp); + vTemp = _mm_mul_ps(P1, Position1); + vResult = _mm_add_ps(vResult,vTemp); + vTemp = _mm_mul_ps(T1, Tangent1); + vResult = _mm_add_ps(vResult,vTemp); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorHermiteV +( + FXMVECTOR Position0, + FXMVECTOR Tangent0, + FXMVECTOR Position1, + GXMVECTOR Tangent1, + HXMVECTOR T +) +{ + // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + + // (t^3 - 2 * t^2 + t) * Tangent0 + + // (-2 * t^3 + 3 * t^2) * Position1 + + // (t^3 - t^2) * Tangent1 + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR T2 = XMVectorMultiply(T, T); + XMVECTOR T3 = XMVectorMultiply(T , T2); + + XMVECTOR P0 = XMVectorReplicate(2.0f * T3.vector4_f32[0] - 3.0f * T2.vector4_f32[0] + 1.0f); + XMVECTOR T0 = XMVectorReplicate(T3.vector4_f32[1] - 2.0f * T2.vector4_f32[1] + T.vector4_f32[1]); + XMVECTOR P1 = XMVectorReplicate(-2.0f * T3.vector4_f32[2] + 3.0f * T2.vector4_f32[2]); + XMVECTOR T1 = XMVectorReplicate(T3.vector4_f32[3] - T2.vector4_f32[3]); + + XMVECTOR Result = 
XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(T0, Tangent0, Result); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(T1, Tangent1, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 CatMulT2 = {-3.0f,-2.0f,3.0f,-1.0f}; + static const XMVECTORF32 CatMulT3 = {2.0f,1.0f,-2.0f,1.0f}; + + XMVECTOR T2 = vmulq_f32(T,T); + XMVECTOR T3 = vmulq_f32(T,T2); + // Mul by the constants against t^2 + T2 = vmulq_f32(T2,CatMulT2); + // Mul by the constants against t^3 + T3 = vmlaq_f32(T2, T3, CatMulT3 ); + // T3 now has the pre-result. + // I need to add t.y only + T2 = vandq_u32(T,g_XMMaskY); + T3 = vaddq_f32(T3,T2); + // Add 1.0f to x + T3 = vaddq_f32(T3,g_XMIdentityR0); + // Now, I have the constants created + // Mul the x constant to Position0 + XMVECTOR vResult = vmulq_lane_f32( Position0, vget_low_f32( T3 ), 0 ); // T3[0] + // Mul the y constant to Tangent0 + vResult = vmlaq_lane_f32(vResult, Tangent0, vget_low_f32( T3 ), 1 ); // T3[1] + // Mul the z constant to Position1 + vResult = vmlaq_lane_f32(vResult, Position1, vget_high_f32( T3 ), 0 ); // T3[2] + // Mul the w constant to Tangent1 + vResult = vmlaq_lane_f32(vResult, Tangent1, vget_high_f32( T3 ), 1 ); // T3[3] + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 CatMulT2 = {-3.0f,-2.0f,3.0f,-1.0f}; + static const XMVECTORF32 CatMulT3 = {2.0f,1.0f,-2.0f,1.0f}; + + XMVECTOR T2 = _mm_mul_ps(T,T); + XMVECTOR T3 = _mm_mul_ps(T,T2); + // Mul by the constants against t^2 + T2 = _mm_mul_ps(T2,CatMulT2); + // Mul by the constants against t^3 + T3 = _mm_mul_ps(T3,CatMulT3); + // T3 now has the pre-result. + T3 = _mm_add_ps(T3,T2); + // I need to add t.y only + T2 = _mm_and_ps(T,g_XMMaskY); + T3 = _mm_add_ps(T3,T2); + // Add 1.0f to x + T3 = _mm_add_ps(T3,g_XMIdentityR0); + // Now, I have the constants created + // Mul the x constant to Position0 + XMVECTOR vResult = XM_PERMUTE_PS(T3,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,Position0); + // Mul the y constant to Tangent0 + T2 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(1,1,1,1)); + T2 = _mm_mul_ps(T2,Tangent0); + vResult = _mm_add_ps(vResult,T2); + // Mul the z constant to Position1 + T2 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(2,2,2,2)); + T2 = _mm_mul_ps(T2,Position1); + vResult = _mm_add_ps(vResult,T2); + // Mul the w constant to Tangent1 + T3 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(3,3,3,3)); + T3 = _mm_mul_ps(T3,Tangent1); + vResult = _mm_add_ps(vResult,T3); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCatmullRom +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR Position3, + float t +) +{ + // Result = ((-t^3 + 2 * t^2 - t) * Position0 + + // (3 * t^3 - 5 * t^2 + 2) * Position1 + + // (-3 * t^3 + 4 * t^2 + t) * Position2 + + // (t^3 - t^2) * Position3) * 0.5 + +#if defined(_XM_NO_INTRINSICS_) + + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = XMVectorReplicate((-t3 + 2.0f * t2 - t) * 0.5f); + XMVECTOR P1 = XMVectorReplicate((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); + XMVECTOR P2 = XMVectorReplicate((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); + XMVECTOR P3 = XMVectorReplicate((t3 - t2) * 0.5f); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(P2, Position2, Result); + Result = XMVectorMultiplyAdd(P3, Position3, Result); + + return Result; + +#elif 
defined(_XM_ARM_NEON_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + float p0 = (-t3 + 2.0f * t2 - t) * 0.5f; + float p1 = (3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f; + float p2 = (-3.0f * t3 + 4.0f * t2 + t) * 0.5f; + float p3 = (t3 - t2) * 0.5f; + + XMVECTOR P1 = vmulq_n_f32(Position1, p1); + XMVECTOR P0 = vmlaq_n_f32(P1, Position0, p0); + XMVECTOR P3 = vmulq_n_f32(Position3, p3); + XMVECTOR P2 = vmlaq_n_f32(P3, Position2, p2); + P0 = vaddq_f32(P0,P2); + return P0; +#elif defined(_XM_SSE_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = _mm_set_ps1((-t3 + 2.0f * t2 - t) * 0.5f); + XMVECTOR P1 = _mm_set_ps1((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); + XMVECTOR P2 = _mm_set_ps1((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); + XMVECTOR P3 = _mm_set_ps1((t3 - t2) * 0.5f); + + P0 = _mm_mul_ps(P0, Position0); + P1 = _mm_mul_ps(P1, Position1); + P2 = _mm_mul_ps(P2, Position2); + P3 = _mm_mul_ps(P3, Position3); + P0 = _mm_add_ps(P0,P1); + P2 = _mm_add_ps(P2,P3); + P0 = _mm_add_ps(P0,P2); + return P0; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorCatmullRomV +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR Position3, + HXMVECTOR T +) +{ +#if defined(_XM_NO_INTRINSICS_) + float fx = T.vector4_f32[0]; + float fy = T.vector4_f32[1]; + float fz = T.vector4_f32[2]; + float fw = T.vector4_f32[3]; + XMVECTOR vResult; + vResult.vector4_f32[0] = 0.5f*((-fx*fx*fx+2*fx*fx-fx)*Position0.vector4_f32[0] + + (3*fx*fx*fx-5*fx*fx+2)*Position1.vector4_f32[0] + + (-3*fx*fx*fx+4*fx*fx+fx)*Position2.vector4_f32[0] + + (fx*fx*fx-fx*fx)*Position3.vector4_f32[0]); + vResult.vector4_f32[1] = 0.5f*((-fy*fy*fy+2*fy*fy-fy)*Position0.vector4_f32[1] + + (3*fy*fy*fy-5*fy*fy+2)*Position1.vector4_f32[1] + + (-3*fy*fy*fy+4*fy*fy+fy)*Position2.vector4_f32[1] + + (fy*fy*fy-fy*fy)*Position3.vector4_f32[1]); + vResult.vector4_f32[2] = 0.5f*((-fz*fz*fz+2*fz*fz-fz)*Position0.vector4_f32[2] + + (3*fz*fz*fz-5*fz*fz+2)*Position1.vector4_f32[2] + + (-3*fz*fz*fz+4*fz*fz+fz)*Position2.vector4_f32[2] + + (fz*fz*fz-fz*fz)*Position3.vector4_f32[2]); + vResult.vector4_f32[3] = 0.5f*((-fw*fw*fw+2*fw*fw-fw)*Position0.vector4_f32[3] + + (3*fw*fw*fw-5*fw*fw+2)*Position1.vector4_f32[3] + + (-3*fw*fw*fw+4*fw*fw+fw)*Position2.vector4_f32[3] + + (fw*fw*fw-fw*fw)*Position3.vector4_f32[3]); + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Catmul2 = {2.0f,2.0f,2.0f,2.0f}; + static const XMVECTORF32 Catmul3 = {3.0f,3.0f,3.0f,3.0f}; + static const XMVECTORF32 Catmul4 = {4.0f,4.0f,4.0f,4.0f}; + static const XMVECTORF32 Catmul5 = {5.0f,5.0f,5.0f,5.0f}; + // Cache T^2 and T^3 + XMVECTOR T2 = vmulq_f32(T,T); + XMVECTOR T3 = vmulq_f32(T,T2); + // Perform the Position0 term + XMVECTOR vResult = vaddq_f32(T2,T2); + vResult = vsubq_f32(vResult,T); + vResult = vsubq_f32(vResult,T3); + vResult = vmulq_f32(vResult,Position0); + // Perform the Position1 term and add + XMVECTOR vTemp = vmulq_f32(T3,Catmul3); + vTemp = vmlsq_f32(vTemp, T2, Catmul5); + vTemp = vaddq_f32(vTemp,Catmul2); + vResult = vmlaq_f32(vResult, vTemp, Position1); + // Perform the Position2 term and add + vTemp = vmulq_f32(T2,Catmul4); + vTemp = vmlsq_f32(vTemp, T3, Catmul3); + vTemp = vaddq_f32(vTemp,T); + vResult = vmlaq_f32(vResult, vTemp, Position2); + // Position3 is the last term + T3 = vsubq_f32(T3,T2); + vResult = vmlaq_f32(vResult, T3, Position3); + // Multiply by 0.5f and exit + vResult = vmulq_f32(vResult,g_XMOneHalf); 
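+ // vResult now holds 0.5*((-t^3+2t^2-t)*P0 + (3t^3-5t^2+2)*P1 + + // (-3t^3+4t^2+t)*P2 + (t^3-t^2)*P3) per lane -- the same weights as the + // scalar path above, with vmlaq/vmlsq folding each multiply-add pair.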
+ return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Catmul2 = {2.0f,2.0f,2.0f,2.0f}; + static const XMVECTORF32 Catmul3 = {3.0f,3.0f,3.0f,3.0f}; + static const XMVECTORF32 Catmul4 = {4.0f,4.0f,4.0f,4.0f}; + static const XMVECTORF32 Catmul5 = {5.0f,5.0f,5.0f,5.0f}; + // Cache T^2 and T^3 + XMVECTOR T2 = _mm_mul_ps(T,T); + XMVECTOR T3 = _mm_mul_ps(T,T2); + // Perform the Position0 term + XMVECTOR vResult = _mm_add_ps(T2,T2); + vResult = _mm_sub_ps(vResult,T); + vResult = _mm_sub_ps(vResult,T3); + vResult = _mm_mul_ps(vResult,Position0); + // Perform the Position1 term and add + XMVECTOR vTemp = _mm_mul_ps(T3,Catmul3); + XMVECTOR vTemp2 = _mm_mul_ps(T2,Catmul5); + vTemp = _mm_sub_ps(vTemp,vTemp2); + vTemp = _mm_add_ps(vTemp,Catmul2); + vTemp = _mm_mul_ps(vTemp,Position1); + vResult = _mm_add_ps(vResult,vTemp); + // Perform the Position2 term and add + vTemp = _mm_mul_ps(T2,Catmul4); + vTemp2 = _mm_mul_ps(T3,Catmul3); + vTemp = _mm_sub_ps(vTemp,vTemp2); + vTemp = _mm_add_ps(vTemp,T); + vTemp = _mm_mul_ps(vTemp,Position2); + vResult = _mm_add_ps(vResult,vTemp); + // Position3 is the last term + T3 = _mm_sub_ps(T3,T2); + T3 = _mm_mul_ps(T3,Position3); + vResult = _mm_add_ps(vResult,T3); + // Multiply by 0.5f and exit + vResult = _mm_mul_ps(vResult,g_XMOneHalf); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorBaryCentric +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + float f, + float g +) +{ + // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR P10 = XMVectorSubtract(Position1, Position0); + XMVECTOR ScaleF = XMVectorReplicate(f); + + XMVECTOR P20 = XMVectorSubtract(Position2, Position0); + XMVECTOR ScaleG = XMVectorReplicate(g); + + XMVECTOR Result = XMVectorMultiplyAdd(P10, ScaleF, Position0); + Result = XMVectorMultiplyAdd(P20, ScaleG, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR R1 = vsubq_f32(Position1,Position0); + XMVECTOR R2 = vsubq_f32(Position2,Position0); + R1 = vmlaq_n_f32( Position0, R1, f); + return vmlaq_n_f32( R1, R2, g ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R1 = _mm_sub_ps(Position1,Position0); + XMVECTOR SF = _mm_set_ps1(f); + XMVECTOR R2 = _mm_sub_ps(Position2,Position0); + XMVECTOR SG = _mm_set_ps1(g); + R1 = _mm_mul_ps(R1,SF); + R2 = _mm_mul_ps(R2,SG); + R1 = _mm_add_ps(R1,Position0); + R1 = _mm_add_ps(R1,R2); + return R1; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVectorBaryCentricV +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR F, + HXMVECTOR G +) +{ + // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR P10 = XMVectorSubtract(Position1, Position0); + XMVECTOR P20 = XMVectorSubtract(Position2, Position0); + + XMVECTOR Result = XMVectorMultiplyAdd(P10, F, Position0); + Result = XMVectorMultiplyAdd(P20, G, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR R1 = vsubq_f32(Position1,Position0); + XMVECTOR R2 = vsubq_f32(Position2,Position0); + R1 = vmlaq_f32( Position0, R1, F ); + return vmlaq_f32( R1, R2, G); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R1 = _mm_sub_ps(Position1,Position0); + XMVECTOR R2 = _mm_sub_ps(Position2,Position0); + R1 = _mm_mul_ps(R1,F); + R2 
= _mm_mul_ps(R2,G); + R1 = _mm_add_ps(R1,Position0); + R1 = _mm_add_ps(R1,R2); + return R1; +#endif +} + +/**************************************************************************** + * + * 2D Vector + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2Equal +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); +// z and w are don't care + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#endif +} + + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2EqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && + (V1.vector4_f32[1] == V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && + (V1.vector4_f32[1] != V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) ); + uint64_t r = vget_lane_u64( vTemp, 0 ); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); +// z and w are don't care + int iTest = _mm_movemask_ps(vTemp)&3; + uint32_t CR = 0; + if (iTest==3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&3)==3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && + (V1.vector4_u32[1] == V2.vector4_u32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && + (V1.vector4_u32[1] != V2.vector4_u32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) ); + uint64_t r = vget_lane_u64( vTemp, 0 ); + uint32_t 
CR = 0; + if ( r == 0xFFFFFFFFFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp))&3; + uint32_t CR = 0; + if (iTest==3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) +{ +#if defined(_XM_NO_INTRINSICS_) + float dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]); + float dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]); + return ((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t vDelta = vsub_f32(vget_low_u32(V1), vget_low_u32(V2)); + uint32x2_t vTemp = vacle_f32( vDelta, vget_low_u32(Epsilon) ); + uint64_t r = vget_lane_u64( vTemp, 0 ); + return ( r == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1,V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp,vDelta); + vTemp = _mm_max_ps(vTemp,vDelta); + vTemp = _mm_cmple_ps(vTemp,Epsilon); + // z and w are don't care + return (((_mm_movemask_ps(vTemp)&3)==0x3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) != 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); +// z and w are don't care + return (((_mm_movemask_ps(vTemp)&3)!=3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) != 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&3)!=3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcgt_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); +// z and w are don't care + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#endif +} + +//------------------------------------------------------------------------------ 
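+ +// The R-suffixed comparisons below return a comparison record rather than a +// bool: XM_CRMASK_CR6TRUE is set when both x and y compare true, +// XM_CRMASK_CR6FALSE when both compare false, and neither bit when the lanes +// disagree. A minimal usage sketch (hypothetical vectors a and b), decoded +// with the standard XMComparisonAllTrue / XMComparisonAllFalse helpers: +// +// uint32_t cr = XMVector2GreaterR(a, b); +// if (XMComparisonAllTrue(cr)) { /* a.x > b.x && a.y > b.y */ } +// else if (XMComparisonAllFalse(cr)) { /* a.x <= b.x && a.y <= b.y */ }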
+ +inline uint32_t XM_CALLCONV XMVector2GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && + (V1.vector4_f32[1] > V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && + (V1.vector4_f32[1] <= V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcgt_f32( vget_low_f32(V1), vget_low_f32(V2) ); + uint64_t r = vget_lane_u64( vTemp, 0 ); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp)&3; + uint32_t CR = 0; + if (iTest==3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcge_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector2GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcge_f32( vget_low_f32(V1), vget_low_f32(V2) ); + uint64_t r = vget_lane_u64( vTemp, 0 ); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp)&3; + uint32_t CR = 0; + if (iTest == 3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vclt_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && 
(V1.vector4_f32[1] <= V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vTemp = vcle_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( V ); + float32x2_t B = vget_low_f32( Bounds ); + // Test if less than or equal + uint32x2_t ivTemp1 = vcle_f32(VL,B); + // Negate the bounds + float32x2_t vTemp2 = vneg_f32(B); + // Test if greater or equal (Reversed) + uint32x2_t ivTemp2 = vcle_f32(vTemp2,VL); + // Blend answers + ivTemp1 = vand_u32(ivTemp1,ivTemp2); + // x and y in bounds? + return ( vget_lane_u64( ivTemp1, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + // x and y in bounds? (z and w are don't care) + return (((_mm_movemask_ps(vTemp1)&0x3)==0x3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2IsNaN +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( V ); + // Test against itself. NaN is always not equal + uint32x2_t vTempNan = vceq_f32( VL, VL ); + // If x or y are NaN, the mask is zero + return ( vget_lane_u64( vTempNan, 0 ) != 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); + // If x or y are NaN, the mask is non-zero + return ((_mm_movemask_ps(vTempNan)&3) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector2IsInfinite +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x2_t vTemp = vand_u32( vget_low_f32( V ) , vget_low_f32( g_XMAbsMask ) ); + // Compare to infinity + vTemp = vceq_f32(vTemp, vget_low_f32( g_XMInfinity) ); + // If any are infinity, the mask is non-zero + return vget_lane_u64( vTemp, 0 ) != 0; +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V,g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); + // If x or y are infinity, the mask is non-zero
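+ // (g_XMAbsMask clears the sign bit, 0x7FFFFFFF per lane, so both +infinity + // and -infinity match the 0x7F800000 pattern in g_XMInfinity; the movemask + // test below looks only at the x and y lanes.)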
+ return ((_mm_movemask_ps(vTemp)&3) != 0); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = + Result.vector4_f32[1] = + Result.vector4_f32[2] = + Result.vector4_f32[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Perform the dot product on x and y + float32x2_t vTemp = vmul_f32( vget_low_f32(V1), vget_low_f32(V2) ); + vTemp = vpadd_f32( vTemp, vTemp ); + return vcombine_f32( vTemp, vTemp ); +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_dp_ps( V1, V2, 0x3f ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V1, V2); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_moveldup_ps(vDot); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V1,V2); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Cross +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + // [ V1.x*V2.y - V1.y*V2.x, V1.x*V2.y - V1.y*V2.x ] + +#if defined(_XM_NO_INTRINSICS_) + float fCross = (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]); + XMVECTOR vResult; + vResult.vector4_f32[0] = + vResult.vector4_f32[1] = + vResult.vector4_f32[2] = + vResult.vector4_f32[3] = fCross; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Negate = { 1.f, -1.f, 0, 0 }; + + float32x2_t vTemp = vmul_f32( vget_low_f32( V1 ), vrev64_f32( vget_low_f32( V2 ) ) ); + vTemp = vmul_f32( vTemp, vget_low_f32( Negate ) ); + vTemp = vpadd_f32( vTemp, vTemp ); + return vcombine_f32( vTemp, vTemp ); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap x and y + XMVECTOR vResult = XM_PERMUTE_PS(V2,_MM_SHUFFLE(0,1,0,1)); + // Perform the muls + vResult = _mm_mul_ps(vResult,V1); + // Splat y + XMVECTOR vTemp = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1)); + // Sub the values + vResult = _mm_sub_ss(vResult,vTemp); + // Splat the cross product + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,0,0,0)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2LengthSq +( + FXMVECTOR V +) +{ + return XMVector2Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32( VL, VL ); + vTemp = vpadd_f32( vTemp, vTemp ); + // Reciprocal sqrt (estimate) + vTemp = vrsqrte_f32( vTemp ); + return vcombine_f32( vTemp, vTemp ); +#elif defined(_XM_SSE4_INTRINSICS_) + 
XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + return _mm_rsqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ss(vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = _mm_rsqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32( VL, VL ); + vTemp = vpadd_f32( vTemp, vTemp ); + // Reciprocal sqrt + float32x2_t S0 = vrsqrte_f32(vTemp); + float32x2_t P0 = vmul_f32( vTemp, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( vTemp, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + float32x2_t Result = vmul_f32( S1, R1 ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); + return _mm_div_ps( g_XMOne, vLengthSq ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ss(vTemp); + vLengthSq = _mm_div_ss(g_XMOne, vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = _mm_sqrt_ss(vLengthSq); + vLengthSq = _mm_div_ss(g_XMOne,vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2LengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorSqrtEst(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32( VL, VL ); + vTemp = vpadd_f32( vTemp, vTemp ); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32( vTemp, zero ); + // Sqrt (estimate) + float32x2_t Result = vrsqrte_f32( vTemp ); + Result = vmul_f32( vTemp, Result ); + Result = vbsl_f32( VEqualsZero, zero, Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + return _mm_sqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ss(vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return 
vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = _mm_sqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Length +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2LengthSq(V); + Result = XMVectorSqrt(Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32( VL, VL ); + vTemp = vpadd_f32( vTemp, vTemp ); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32( vTemp, zero ); + // Sqrt + float32x2_t S0 = vrsqrte_f32( vTemp ); + float32x2_t P0 = vmul_f32( vTemp, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( vTemp, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + float32x2_t Result = vmul_f32( S1, R1 ); + Result = vmul_f32( vTemp, Result ); + Result = vbsl_f32( VEqualsZero, zero, Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + return _mm_sqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ss(vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ +// XMVector2NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. 
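+// A minimal usage sketch of the tradeoff (hypothetical vector v): the Est +// variant uses a raw reciprocal square-root estimate with no guards, while +// XMVector2Normalize below uses a full-precision square root (Newton-Raphson +// refined on ARM) and handles the zero and infinite length cases explicitly: +// +// XMVECTOR v = XMVectorSet(3.f, 4.f, 0.f, 0.f); +// XMVECTOR fast = XMVector2NormalizeEst(v); // ~(0.6, 0.8), low precision +// XMVECTOR safe = XMVector2Normalize(v); // (0.6, 0.8)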
+ +inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector2ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32( VL, VL ); + vTemp = vpadd_f32( vTemp, vTemp ); + // Reciprocal sqrt (estimate) + vTemp = vrsqrte_f32( vTemp ); + // Normalize + float32x2_t Result = vmul_f32( VL, vTemp ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, V); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0)); + vLengthSq = _mm_mul_ps(vLengthSq, V); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = _mm_rsqrt_ss(vLengthSq); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + vLengthSq = _mm_mul_ps(vLengthSq,V); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Normalize +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR vResult = XMVector2Length( V ); + float fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if (fLength > 0) { + fLength = 1.0f/fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; + vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; + vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; + vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32(V); + // Dot2 + float32x2_t vTemp = vmul_f32( VL, VL ); + vTemp = vpadd_f32( vTemp, vTemp ); + uint32x2_t VEqualsZero = vceq_f32( vTemp, vdup_n_f32(0) ); + uint32x2_t VEqualsInf = vceq_f32( vTemp, vget_low_f32(g_XMInfinity) ); + // Reciprocal sqrt (2 iterations of Newton-Raphson) + float32x2_t S0 = vrsqrte_f32( vTemp ); + float32x2_t P0 = vmul_f32( vTemp, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( vTemp, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + vTemp = vmul_f32( S1, R1 ); + // Normalize + float32x2_t Result = vmul_f32( VL, vTemp ); + Result = vbsl_f32( VEqualsZero, vdup_n_f32(0), Result ); + Result = vbsl_f32( VEqualsInf, vget_low_f32(g_XMQNaN), Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x3f ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = 
_mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +#elif defined(_XM_SSE3_INTRINSICS_) + // Perform the dot product on x and y only + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_moveldup_ps(vLengthSq); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(V, vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult, vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq); + vResult = _mm_or_ps(vTemp1, vTemp2); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y only + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Reciprocal mul to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ClampLength +( + FXMVECTOR V, + float LengthMin, + float LengthMax +) +{ + XMVECTOR ClampMax = XMVectorReplicate(LengthMax); + XMVECTOR ClampMin = XMVectorReplicate(LengthMin); + return XMVector2ClampLengthV(V, ClampMin, ClampMax); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) +{ + assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin))); + assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax))); + assert(XMVector2GreaterOrEqual(LengthMin, g_XMZero)); + assert(XMVector2GreaterOrEqual(LengthMax, g_XMZero)); + assert(XMVector2GreaterOrEqual(LengthMax, LengthMin)); + + XMVECTOR LengthSq = XMVector2LengthSq(V); + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); + + XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); + + XMVECTOR Length = 
XMVectorMultiply(LengthSq, RcpLength); + + XMVECTOR Normal = XMVectorMultiply(V, RcpLength); + + XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); + XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); + + XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result; + Result = XMVector2Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + float RefractionIndex +) +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector2RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +// Return the refraction of a 2D vector +inline XMVECTOR XM_CALLCONV XMVector2RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) +{ + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + +#if defined(_XM_NO_INTRINSICS_) + + float IDotN = (Incident.vector4_f32[0]*Normal.vector4_f32[0])+(Incident.vector4_f32[1]*Normal.vector4_f32[1]); + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + float RY = 1.0f-(IDotN*IDotN); + float RX = 1.0f-(RY*RefractionIndex.vector4_f32[0]*RefractionIndex.vector4_f32[0]); + RY = 1.0f-(RY*RefractionIndex.vector4_f32[1]*RefractionIndex.vector4_f32[1]); + if (RX>=0.0f) { + RX = (RefractionIndex.vector4_f32[0]*Incident.vector4_f32[0])-(Normal.vector4_f32[0]*((RefractionIndex.vector4_f32[0]*IDotN)+sqrtf(RX))); + } else { + RX = 0.0f; + } + if (RY>=0.0f) { + RY = (RefractionIndex.vector4_f32[1]*Incident.vector4_f32[1])-(Normal.vector4_f32[1]*((RefractionIndex.vector4_f32[1]*IDotN)+sqrtf(RY))); + } else { + RY = 0.0f; + } + + XMVECTOR vResult; + vResult.vector4_f32[0] = RX; + vResult.vector4_f32[1] = RY; + vResult.vector4_f32[2] = 0.0f; + vResult.vector4_f32[3] = 0.0f; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t IL = vget_low_f32( Incident ); + float32x2_t NL = vget_low_f32( Normal ); + float32x2_t RIL = vget_low_f32( RefractionIndex ); + // Get the 2D Dot product of Incident-Normal + float32x2_t vTemp = vmul_f32(IL, NL); + float32x2_t IDotN = vpadd_f32( vTemp, vTemp ); + // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + vTemp = vmls_f32( vget_low_f32( g_XMOne ), IDotN, IDotN); + vTemp = vmul_f32(vTemp,RIL); + vTemp = vmls_f32(vget_low_f32( g_XMOne ), vTemp, RIL ); + // If 
any terms are <=0, sqrt() will fail, punt to zero + uint32x2_t vMask = vcgt_f32(vTemp, vget_low_f32(g_XMZero) ); + // Sqrt(vTemp) + float32x2_t S0 = vrsqrte_f32(vTemp); + float32x2_t P0 = vmul_f32( vTemp, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( vTemp, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + float32x2_t S2 = vmul_f32( S1, R1 ); + vTemp = vmul_f32( vTemp, S2 ); + // R = RefractionIndex * IDotN + sqrt(R) + vTemp = vmla_f32( vTemp, RIL, IDotN ); + // Result = RefractionIndex * Incident - Normal * R + float32x2_t vResult = vmul_f32(RIL,IL); + vResult = vmls_f32( vResult, vTemp, NL ); + vResult = vand_u32(vResult,vMask); + return vcombine_f32(vResult, vResult); +#elif defined(_XM_SSE_INTRINSICS_) + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + // Get the 2D Dot product of Incident-Normal + XMVECTOR IDotN = XMVector2Dot(Incident, Normal); + // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR vTemp = _mm_mul_ps(IDotN,IDotN); + vTemp = _mm_sub_ps(g_XMOne,vTemp); + vTemp = _mm_mul_ps(vTemp,RefractionIndex); + vTemp = _mm_mul_ps(vTemp,RefractionIndex); + vTemp = _mm_sub_ps(g_XMOne,vTemp); + // If any terms are <=0, sqrt() will fail, punt to zero + XMVECTOR vMask = _mm_cmpgt_ps(vTemp,g_XMZero); + // R = RefractionIndex * IDotN + sqrt(R) + vTemp = _mm_sqrt_ps(vTemp); + XMVECTOR vResult = _mm_mul_ps(RefractionIndex,IDotN); + vTemp = _mm_add_ps(vTemp,vResult); + // Result = RefractionIndex * Incident - Normal * R + vResult = _mm_mul_ps(RefractionIndex,Incident); + vTemp = _mm_mul_ps(vTemp,Normal); + vResult = _mm_sub_ps(vResult,vTemp); + vResult = _mm_and_ps(vResult,vMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Orthogonal +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = -V.vector4_f32[1]; + Result.vector4_f32[1] = V.vector4_f32[0]; + Result.vector4_f32[2] = 0.f; + Result.vector4_f32[3] = 0.f; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Negate = { -1.f, 1.f, 0, 0 }; + const float32x2_t zero = vdup_n_f32(0); + + float32x2_t VL = vget_low_f32( V ); + float32x2_t Result = vmul_f32( vrev64_f32( VL ), vget_low_f32( Negate ) ); + return vcombine_f32( Result, zero ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); + vResult = _mm_mul_ps(vResult,g_XMNegateX); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ + XMVECTOR Result = XMVector2Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACosEst(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ + XMVECTOR Result = XMVector2Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne, g_XMOne); + Result = XMVectorACos(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV 
XMVector2AngleBetweenVectors +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + XMVECTOR L1 = XMVector2ReciprocalLength(V1); + XMVECTOR L2 = XMVector2ReciprocalLength(V2); + + XMVECTOR Dot = XMVector2Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2LinePointDistance +( + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2, + FXMVECTOR Point +) +{ + // Given a vector PointVector from LinePoint1 to Point and a vector + // LineVector from LinePoint1 to LinePoint2, the scaled distance + // PointProjectionScale from LinePoint1 to the perpendicular projection + // of PointVector onto the line is defined as: + // + // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) + + XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1); + XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1); + + XMVECTOR LengthSq = XMVector2LengthSq(LineVector); + + XMVECTOR PointProjectionScale = XMVector2Dot(PointVector, LineVector); + PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq); + + XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); + DistanceVector = XMVectorSubtract(PointVector, DistanceVector); + + return XMVector2Length(DistanceVector); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2IntersectLine +( + FXMVECTOR Line1Point1, + FXMVECTOR Line1Point2, + FXMVECTOR Line2Point1, + GXMVECTOR Line2Point2 +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR V1 = XMVectorSubtract(Line1Point2, Line1Point1); + XMVECTOR V2 = XMVectorSubtract(Line2Point2, Line2Point1); + XMVECTOR V3 = XMVectorSubtract(Line1Point1, Line2Point1); + + XMVECTOR C1 = XMVector2Cross(V1, V2); + XMVECTOR C2 = XMVector2Cross(V2, V3); + + XMVECTOR Result; + const XMVECTOR Zero = XMVectorZero(); + if (XMVector2NearEqual(C1, Zero, g_XMEpsilon.v)) + { + if (XMVector2NearEqual(C2, Zero, g_XMEpsilon.v)) + { + // Coincident + Result = g_XMInfinity.v; + } + else + { + // Parallel + Result = g_XMQNaN.v; + } + } + else + { + // Intersection point = Line1Point1 + V1 * (C2 / C1) + XMVECTOR Scale = XMVectorReciprocal(C1); + Scale = XMVectorMultiply(C2, Scale); + Result = XMVectorMultiplyAdd(V1, Scale, Line1Point1); + } + + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR V1 = _mm_sub_ps(Line1Point2, Line1Point1); + XMVECTOR V2 = _mm_sub_ps(Line2Point2, Line2Point1); + XMVECTOR V3 = _mm_sub_ps(Line1Point1, Line2Point1); + // Generate the cross products + XMVECTOR C1 = XMVector2Cross(V1, V2); + XMVECTOR C2 = XMVector2Cross(V2, V3); + // If C1 is not close to epsilon, use the calculated value + XMVECTOR vResultMask = _mm_setzero_ps(); + vResultMask = _mm_sub_ps(vResultMask,C1); + vResultMask = _mm_max_ps(vResultMask,C1); + // 0xFFFFFFFF if the calculated value is to be used + vResultMask = _mm_cmpgt_ps(vResultMask,g_XMEpsilon); + // If C1 is close to epsilon, which fail type is it? INFINITY or NAN? 
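+    // (abs(C2) is formed the same way as abs(C1) above: negate, then take the
+    // max against the original. If abs(C2) <= epsilon the lines are coincident
+    // and the failure value is INFINITY; otherwise they are parallel and the
+    // failure value is QNaN, matching the scalar path above.)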
+ XMVECTOR vFailMask = _mm_setzero_ps(); + vFailMask = _mm_sub_ps(vFailMask,C2); + vFailMask = _mm_max_ps(vFailMask,C2); + vFailMask = _mm_cmple_ps(vFailMask,g_XMEpsilon); + XMVECTOR vFail = _mm_and_ps(vFailMask,g_XMInfinity); + vFailMask = _mm_andnot_ps(vFailMask,g_XMQNaN); + // vFail is NAN or INF + vFail = _mm_or_ps(vFail,vFailMask); + // Intersection point = Line1Point1 + V1 * (C2 / C1) + XMVECTOR vResult = _mm_div_ps(C2,C1); + vResult = _mm_mul_ps(vResult,V1); + vResult = _mm_add_ps(vResult,Line1Point1); + // Use result, or failure value + vResult = _mm_and_ps(vResult,vResultMask); + vResultMask = _mm_andnot_ps(vResultMask,vFail); + vResult = _mm_or_ps(vResult,vResultMask); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2Transform +( + FXMVECTOR V, + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( V ); + float32x4_t Result = vmlaq_lane_f32( M.r[3], M.r[1], VL, 1 ); // Y + return vmlaq_lane_f32( Result, M.r[0], VL, 0 ); // X +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,M.r[0]); + XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + vTemp = _mm_mul_ps(vTemp,M.r[1]); + vResult = _mm_add_ps(vResult,vTemp); + vResult = _mm_add_ps(vResult,M.r[3]); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT4* XM_CALLCONV XMVector2TransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT2* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT2)); + + assert(OutputStride >= sizeof(XMFLOAT4)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4)); + +#if defined(_XM_NO_INTRINSICS_) + + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3); + Result = XMVectorMultiplyAdd(X, row0, Result); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat4((XMFLOAT4*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT4))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x2_t V = vld2q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*4; 
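+                // vld2q_f32 de-interleaved four packed XMFLOAT2 values, leaving
+                // x0..x3 in V.val[0] and y0..y3 in V.val[1]; each output component
+                // (Ax+Ey+M, Bx+Fy+N, Cx+Gy+O, Dx+Hy+P) is accumulated below as a
+                // 4-wide vector, one lane per input point.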
+ + float32x2_t r3 = vget_low_f32( row3 ); + float32x2_t r = vget_low_f32( row0 ); + XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N + + __prefetch( pInputVector ); + + r3 = vget_high_f32( row3 ); + r = vget_high_f32( row0 ); + XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O + XMVECTOR vResult3 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P + + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( row1 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( row1 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O + vResult3 = vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + float32x4x4_t R; + R.val[0] = vResult0; + R.val[1] = vResult1; + R.val[2] = vResult2; + R.val[3] = vResult3; + + vst4q_f32( reinterpret_cast(pOutputVector), R ); + pOutputVector += sizeof(XMFLOAT4)*4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t V = vld1_f32( reinterpret_cast(pInputVector) ); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32( row3, row0, V, 0 ); // X + vResult = vmlaq_lane_f32( vResult, row1, V, 1 ); // Y + + vst1q_f32( reinterpret_cast(pOutputVector), vResult ); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t two = VectorCount >> 1; + if ( two > 0 ) + { + if ( InputStride == sizeof(XMFLOAT2) ) + { + if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) ) + { + // Packed input, aligned output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); + X = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + i += 2; + } + } + else + { + // Packed input, unaligned output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); 
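+                    // (the second XMFLOAT2 of the pair sits in lanes 2 and 3 of V)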
+ X = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + i += 2; + } + } + } + } + + if ( !((uintptr_t)pInputVector & 0xF) && !(InputStride & 0xF) ) + { + if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) ) + { + // Aligned input, aligned output + for (; i < VectorCount; i++) + { + XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast(pInputVector) ) ); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + } + } + else + { + // Aligned input, unaligned output + for (; i < VectorCount; i++) + { + XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast(pInputVector) ) ); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + } + } + } + else + { + // Unaligned input + for (; i < VectorCount; i++) + { + __m128 x = _mm_load_ss( reinterpret_cast(pInputVector) ); + __m128 y = _mm_load_ss( reinterpret_cast(pInputVector+4) ); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS(y,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR X = XM_PERMUTE_PS(x,_MM_SHUFFLE(0,0,0,0)); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2TransformCoord +( + FXMVECTOR V, + FXMMATRIX M +) +{ + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + XMVECTOR W = XMVectorSplatW(Result); + return XMVectorDivide( Result, W ); +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream +( + XMFLOAT2* pOutputStream, + size_t OutputStride, + const XMFLOAT2* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT2)); + + assert(OutputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2)); + +#if defined(_XM_NO_INTRINSICS_) + + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < 
VectorCount; i++) + { + XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMVECTOR W = XMVectorSplatW(Result); + + Result = XMVectorDivide(Result, W); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat2((XMFLOAT2*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT2))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x2_t V = vld2q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*4; + + float32x2_t r3 = vget_low_f32( row3 ); + float32x2_t r = vget_low_f32( row0 ); + XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N + + __prefetch( pInputVector ); + + r3 = vget_high_f32( row3 ); + r = vget_high_f32( row0 ); + XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P + + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( row1 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( row1 ); + W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + + V.val[0] = vmulq_f32( vResult0, Reciprocal ); + V.val[1] = vmulq_f32( vResult1, Reciprocal ); + + vst2q_f32( reinterpret_cast(pOutputVector),V ); + pOutputVector += sizeof(XMFLOAT2)*4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t V = vld1_f32( reinterpret_cast(pInputVector) ); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32( row3, row0, V, 0 ); // X + vResult = vmlaq_lane_f32( vResult, row1, V, 1 ); // Y + + V = vget_high_f32( vResult ); + float32x2_t W = vdup_lane_f32( V, 1 ); + + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x2_t Reciprocal = vrecpe_f32( W ); + float32x2_t S = vrecps_f32( Reciprocal, W ); + Reciprocal = vmul_f32( S, Reciprocal ); + S = vrecps_f32( Reciprocal, W ); + Reciprocal = vmul_f32( S, Reciprocal ); + + V = vget_low_f32( vResult ); + V = vmul_f32( V, Reciprocal ); + + vst1_f32( reinterpret_cast(pOutputVector), V ); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t two = VectorCount >> 1; + if ( two > 0 ) + { + if 
( InputStride == sizeof(XMFLOAT2) ) + { + if ( OutputStride == sizeof(XMFLOAT2) ) + { + if ( !((uintptr_t)pOutputStream & 0xF) ) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + XMVECTOR V1 = _mm_div_ps( vTemp, W ); + + // Result 2 + Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); + X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + XMVECTOR V2 = _mm_div_ps( vTemp, W ); + + vTemp = _mm_movelh_ps( V1, V2 ); + + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += sizeof(XMFLOAT2)*2; + + i += 2; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + XMVECTOR V1 = _mm_div_ps( vTemp, W ); + + // Result 2 + Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); + X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + XMVECTOR V2 = _mm_div_ps( vTemp, W ); + + vTemp = _mm_movelh_ps( V1, V2 ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += sizeof(XMFLOAT2)*2; + + i += 2; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + + // Result 2 + Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); + X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + vTemp2 = XM_PERMUTE_PS( 
vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + + i += 2; + } + } + } + } + + if ( !((uintptr_t)pInputVector & 0xF) && !(InputStride & 0xF) ) + { + // Aligned input + for (; i < VectorCount; i++) + { + XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast(pInputVector) ) ); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + } + } + else + { + // Unaligned input + for (; i < VectorCount; i++) + { + __m128 x = _mm_load_ss( reinterpret_cast(pInputVector) ); + __m128 y = _mm_load_ss( reinterpret_cast(pInputVector+4) ); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS( y, _MM_SHUFFLE(0, 0, 0, 0) ); + XMVECTOR X = XM_PERMUTE_PS( x, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector2TransformNormal +( + FXMVECTOR V, + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(Y, M.r[1]); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( V ); + float32x4_t Result = vmulq_lane_f32( M.r[1], VL, 1 ); // Y + return vmlaq_lane_f32( Result, M.r[0], VL, 0 ); // X +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,M.r[0]); + XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + vTemp = _mm_mul_ps(vTemp,M.r[1]); + vResult = _mm_add_ps(vResult,vTemp); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream +( + XMFLOAT2* pOutputStream, + size_t OutputStride, + const XMFLOAT2* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT2)); + + assert(OutputStride >= sizeof(XMFLOAT2)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2)); + +#if defined(_XM_NO_INTRINSICS_) + + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* 
pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(Y, row1); + Result = XMVectorMultiplyAdd(X, row0, Result); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat2((XMFLOAT2*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT2))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x2_t V = vld2q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*4; + + float32x2_t r = vget_low_f32( row0 ); + XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax + XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx + + __prefetch( pInputVector ); + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( row1 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + V.val[0] = vResult0; + V.val[1] = vResult1; + + vst2q_f32( reinterpret_cast(pOutputVector), V ); + pOutputVector += sizeof(XMFLOAT2)*4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t V = vld1_f32( reinterpret_cast(pInputVector) ); + pInputVector += InputStride; + + XMVECTOR vResult = vmulq_lane_f32( row0, V, 0 ); // X + vResult = vmlaq_lane_f32( vResult, row1, V, 1 ); // Y + + V = vget_low_f32( vResult ); + vst1_f32( reinterpret_cast(pOutputVector), V ); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + + size_t i = 0; + size_t two = VectorCount >> 1; + if ( two > 0 ) + { + if ( InputStride == sizeof(XMFLOAT2) ) + { + if ( OutputStride == sizeof(XMFLOAT2) ) + { + if ( !((uintptr_t)pOutputStream & 0xF) ) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + XMVECTOR V1 = _mm_add_ps( vTemp, vTemp2 ); + + // Result 2 + Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); + X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + XMVECTOR V2 = _mm_add_ps( vTemp, vTemp2 ); + + vTemp = _mm_movelh_ps( V1, V2 ); + + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += sizeof(XMFLOAT2)*2; + + i += 2; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < two; ++j) + { + 
XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + XMVECTOR V1 = _mm_add_ps( vTemp, vTemp2 ); + + // Result 2 + Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); + X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + XMVECTOR V2 = _mm_add_ps( vTemp, vTemp2 ); + + vTemp = _mm_movelh_ps( V1, V2 ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += sizeof(XMFLOAT2)*2; + + i += 2; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < two; ++j) + { + XMVECTOR V = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT2)*2; + + // Result 1 + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + + // Result 2 + Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); + X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + + vTemp = _mm_mul_ps( Y, row1 ); + vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + + i += 2; + } + } + } + } + + if ( !((uintptr_t)pInputVector & 0xF) && !(InputStride & 0xF) ) + { + // Aligned input + for (; i < VectorCount; i++) + { + XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast(pInputVector) ) ); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + } + } + else + { + // Unaligned input + for (; i < VectorCount; i++) + { + __m128 x = _mm_load_ss( reinterpret_cast(pInputVector) ); + __m128 y = _mm_load_ss( reinterpret_cast(pInputVector+4) ); + pInputVector += InputStride; + + XMVECTOR Y = XM_PERMUTE_PS( y, _MM_SHUFFLE(0, 0, 0, 0) ); + XMVECTOR X = XM_PERMUTE_PS( x, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp2 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) ); + + _mm_store_ss( reinterpret_cast(pOutputVector), vTemp ); + _mm_store_ss( reinterpret_cast(pOutputVector+4), vTemp2 ); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +/**************************************************************************** + * + * 3D Vector + * + ****************************************************************************/ + 
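+//------------------------------------------------------------------------------
+// Usage sketch (hypothetical application code, not part of the library): the
+// ...R comparison variants defined below return a comparison record built from
+// the XM_CRMASK_CR6* masks rather than a bool, and are meant to be tested with
+// helpers such as XMComparisonAllTrue. ExampleAllComponentsEqual3 is an
+// illustrative name; the block is kept out of compilation.
+#if 0
+inline bool XM_CALLCONV ExampleAllComponentsEqual3( FXMVECTOR A, FXMVECTOR B )
+{
+    uint32_t CR = XMVector3EqualR( A, B );
+    return XMComparisonAllTrue( CR );   // true only when x, y and z all match
+}
+#endif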
+//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3Equal +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3EqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && + (V1.vector4_f32[1] == V2.vector4_f32[1]) && + (V1.vector4_f32[2] == V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && + (V1.vector4_f32[1] != V2.vector4_f32[1]) && + (V1.vector4_f32[2] != V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if ( r == 0xFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp)&7; + uint32_t CR = 0; + if (iTest==7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&7)==7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && + (V1.vector4_u32[1] == V2.vector4_u32[1]) && + (V1.vector4_u32[2] == V2.vector4_u32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && + (V1.vector4_u32[1] != V2.vector4_u32[1]) && + 
(V1.vector4_u32[2] != V2.vector4_u32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if ( r == 0xFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + int iTemp = _mm_movemask_ps(_mm_castsi128_ps(vTemp))&7; + uint32_t CR = 0; + if (iTemp==7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTemp) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) +{ +#if defined(_XM_NO_INTRINSICS_) + float dx, dy, dz; + + dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]); + dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]); + dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]); + return (((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1]) && + (dz <= Epsilon.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vDelta = vsubq_f32( V1, V2 ); + uint32x4_t vResult = vacleq_f32( vDelta, Epsilon ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1,V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp,vDelta); + vTemp = _mm_max_ps(vTemp,vDelta); + vTemp = _mm_cmple_ps(vTemp,Epsilon); + // w is don't care + return (((_mm_movemask_ps(vTemp)&7)==0x7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)!=7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_u32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = 
_mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&7)!=7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && + (V1.vector4_f32[1] > V2.vector4_f32[1]) && + (V1.vector4_f32[2] > V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && + (V1.vector4_f32[1] <= V2.vector4_f32[1]) && + (V1.vector4_f32[2] <= V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if ( r == 0xFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp)&7; + if (iTest==7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector3GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1]) && + (V1.vector4_f32[2] >= V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1]) && + (V1.vector4_f32[2] < V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif 
defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if ( r == 0xFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp)&7; + if (iTest==7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcltq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcleq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && + (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + uint32x4_t ivTemp1 = vcleq_f32(V,Bounds); + // Negate the bounds + float32x4_t vTemp2 = vnegq_f32(Bounds); + // Test if greater or equal (Reversed) + uint32x4_t ivTemp2 = vcleq_f32(vTemp2,V); + // Blend answers + ivTemp1 = vandq_u32(ivTemp1,ivTemp2); + // in bounds? 
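+    // (the two vzips below gather one byte from each lane's mask into a single
+    // 32-bit value whose low 24 bits cover x, y and z, so 0xFFFFFFU means all
+    // three components were in bounds)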
+ int8x8x2_t vTemp = vzip_u8(vget_low_u8(ivTemp1), vget_high_u8(ivTemp1)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + // x,y and z in bounds? (w is don't care) + return (((_mm_movemask_ps(vTemp1)&0x7)==0x7) != 0); +#else + return XMComparisonAllInBounds(XMVector3InBoundsR(V, Bounds)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3IsNaN +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1]) || + XMISNAN(V.vector4_f32[2])); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test against itself. NaN is always not equal + uint32x4_t vTempNan = vceqq_f32( V, V ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempNan), vget_high_u8(vTempNan)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + // If x or y or z are NaN, the mask is zero + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); + // If x or y or z are NaN, the mask is non-zero + return ((_mm_movemask_ps(vTempNan)&7) != 0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector3IsInfinite +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1]) || + XMISINF(V.vector4_f32[2])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x4_t vTempInf = vandq_u32( V, g_XMAbsMask ); + // Compare to infinity + vTempInf = vceqq_f32(vTempInf, g_XMInfinity ); + // If any are infinity, the signs are true. + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempInf), vget_high_u8(vTempInf)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V,g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); + // If x,y or z are infinity, the signs are true. 
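+    // (_mm_movemask_ps collects each lane's sign bit; the compare wrote all-ones
+    // into matching lanes, and the &7 keeps only x, y and z)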
+    return ((_mm_movemask_ps(vTemp)&7) != 0);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Computation operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3Dot
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    float fValue = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2];
+    XMVECTOR vResult;
+    vResult.vector4_f32[0] =
+    vResult.vector4_f32[1] =
+    vResult.vector4_f32[2] =
+    vResult.vector4_f32[3] = fValue;
+    return vResult;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vTemp = vmulq_f32( V1, V2 );
+    float32x2_t v1 = vget_low_f32( vTemp );
+    float32x2_t v2 = vget_high_f32( vTemp );
+    v1 = vpadd_f32( v1, v1 );
+    v2 = vdup_lane_f32( v2, 0 );
+    v1 = vadd_f32( v1, v2 );
+    return vcombine_f32( v1, v1 );
+#elif defined(_XM_SSE4_INTRINSICS_)
+    return _mm_dp_ps( V1, V2, 0x7f );
+#elif defined(_XM_SSE3_INTRINSICS_)
+    XMVECTOR vTemp = _mm_mul_ps(V1,V2);
+    vTemp = _mm_and_ps(vTemp, g_XMMask3);
+    vTemp = _mm_hadd_ps(vTemp,vTemp);
+    return _mm_hadd_ps(vTemp,vTemp);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product
+    XMVECTOR vDot = _mm_mul_ps(V1,V2);
+    // x=Dot.vector4_f32[1], y=Dot.vector4_f32[2]
+    XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1));
+    // Result.vector4_f32[0] = x+y
+    vDot = _mm_add_ss(vDot,vTemp);
+    // x=Dot.vector4_f32[2]
+    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
+    // Result.vector4_f32[0] = (x+y)+z
+    vDot = _mm_add_ss(vDot,vTemp);
+    // Splat x
+    return XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3Cross
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+    // [ V1.y*V2.z - V1.z*V2.y, V1.z*V2.x - V1.x*V2.z, V1.x*V2.y - V1.y*V2.x ]
+
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR vResult = {
+        (V1.vector4_f32[1] * V2.vector4_f32[2]) - (V1.vector4_f32[2] * V2.vector4_f32[1]),
+        (V1.vector4_f32[2] * V2.vector4_f32[0]) - (V1.vector4_f32[0] * V2.vector4_f32[2]),
+        (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]),
+        0.0f
+    };
+    return vResult;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t v1xy = vget_low_f32(V1);
+    float32x2_t v2xy = vget_low_f32(V2);
+
+    float32x2_t v1yx = vrev64_f32( v1xy );
+    float32x2_t v2yx = vrev64_f32( v2xy );
+
+    float32x2_t v1zz = vdup_lane_f32( vget_high_f32(V1), 0 );
+    float32x2_t v2zz = vdup_lane_f32( vget_high_f32(V2), 0 );
+
+    XMVECTOR vResult = vmulq_f32( vcombine_f32(v1yx,v1xy), vcombine_f32(v2zz,v2yx) );
+    vResult = vmlsq_f32( vResult, vcombine_f32(v1zz,v1yx), vcombine_f32(v2yx,v2xy) );
+    vResult = veorq_u32( vResult, g_XMFlipY );
+    return vandq_u32( vResult, g_XMMask3 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // y1,z1,x1,w1
+    XMVECTOR vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(3,0,2,1));
+    // z2,x2,y2,w2
+    XMVECTOR vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(3,1,0,2));
+    // Perform the left operation
+    XMVECTOR vResult = _mm_mul_ps(vTemp1,vTemp2);
+    // z1,x1,y1,w1
+    vTemp1 = XM_PERMUTE_PS(vTemp1,_MM_SHUFFLE(3,0,2,1));
+    // y2,z2,x2,w2
+    vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(3,1,0,2));
+    // Perform the right operation
+    vTemp1 = _mm_mul_ps(vTemp1,vTemp2);
+    // Subtract the right from left, and return answer
+    vResult = _mm_sub_ps(vResult,vTemp1);
+    // Set w to zero
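+    // (g_XMMask3 keeps x, y and z and clears w, so the cross product returns a
+    // true 3-vector with w == 0)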
+ return _mm_and_ps(vResult,g_XMMask3); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3LengthSq +( + FXMVECTOR V +) +{ + return XMVector3Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vdup_lane_f32( v2, 0 ); + v1 = vadd_f32( v1, v2 ); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32( v1 ); + return vcombine_f32(v2, v2); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + return _mm_rsqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // y,y,y,y + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + // x+z+y,??,??,?? + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Get the reciprocal + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vdup_lane_f32( v2, 0 ); + v1 = vadd_f32( v1, v2 ); + // Reciprocal sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32( v1, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( v1, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + float32x2_t Result = vmul_f32( S1, R1 ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); + return _mm_div_ps( g_XMOne, vLengthSq ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V, V); + vDot = _mm_and_ps(vDot, g_XMMask3); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_sqrt_ps(vDot); + vDot = _mm_div_ps(g_XMOne,vDot); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V,V); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot,vTemp); + // x=Dot.z + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + // Result.x = 
(x+y)+z
+    vDot = _mm_add_ss(vDot,vTemp);
+    // Splat x
+    vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0));
+    // Get the length
+    vDot = _mm_sqrt_ps(vDot);
+    // Get the reciprocal
+    vDot = _mm_div_ps(g_XMOne,vDot);
+    return vDot;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3LengthEst
+(
+    FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Result;
+
+    Result = XMVector3LengthSq(V);
+    Result = XMVectorSqrtEst(Result);
+
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Dot3
+    float32x4_t vTemp = vmulq_f32( V, V );
+    float32x2_t v1 = vget_low_f32( vTemp );
+    float32x2_t v2 = vget_high_f32( vTemp );
+    v1 = vpadd_f32( v1, v1 );
+    v2 = vdup_lane_f32( v2, 0 );
+    v1 = vadd_f32( v1, v2 );
+    const float32x2_t zero = vdup_n_f32(0);
+    uint32x2_t VEqualsZero = vceq_f32( v1, zero );
+    // Sqrt (estimate)
+    float32x2_t Result = vrsqrte_f32( v1 );
+    Result = vmul_f32( v1, Result );
+    Result = vbsl_f32( VEqualsZero, zero, Result );
+    return vcombine_f32( Result, Result );
+#elif defined(_XM_SSE4_INTRINSICS_)
+    XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
+    return _mm_sqrt_ps( vTemp );
+#elif defined(_XM_SSE3_INTRINSICS_)
+    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
+    vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3);
+    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
+    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
+    vLengthSq = _mm_sqrt_ps(vLengthSq);
+    return vLengthSq;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product on x,y and z
+    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+    // vTemp has z and y
+    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2));
+    // x+z, y
+    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+    // y,y,y,y
+    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
+    // x+z+y,??,??,??
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Length +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vdup_lane_f32( v2, 0 ); + v1 = vadd_f32( v1, v2 ); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32( v1, zero ); + // Sqrt + float32x2_t S0 = vrsqrte_f32( v1 ); + float32x2_t P0 = vmul_f32( v1, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( v1, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + float32x2_t Result = vmul_f32( S1, R1 ); + Result = vmul_f32( v1, Result ); + Result = vbsl_f32( VEqualsZero, zero, Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f ); + return _mm_sqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // y,y,y,y + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + // x+z+y,??,??,?? + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ +// XMVector3NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. 
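+//
+// The Est paths rely on the hardware reciprocal-sqrt approximations
+// (_mm_rsqrt_ps on SSE, vrsqrte_f32 on ARM), which carry only about 12 bits
+// of mantissa accuracy. An illustrative choice between the two forms
+// (variable names hypothetical):
+//
+//     XMVECTOR nFast  = XMVector3NormalizeEst(v); // fast, approximate
+//     XMVECTOR nExact = XMVector3Normalize(v);    // full precision; maps a
+//                                                 // zero vector to zero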
+
+inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst
+(
+    FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Result;
+    Result = XMVector3ReciprocalLength(V);
+    Result = XMVectorMultiply(V, Result);
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Dot3
+    float32x4_t vTemp = vmulq_f32( V, V );
+    float32x2_t v1 = vget_low_f32( vTemp );
+    float32x2_t v2 = vget_high_f32( vTemp );
+    v1 = vpadd_f32( v1, v1 );
+    v2 = vdup_lane_f32( v2, 0 );
+    v1 = vadd_f32( v1, v2 );
+    // Reciprocal sqrt (estimate)
+    v2 = vrsqrte_f32( v1 );
+    // Normalize
+    return vmulq_f32( V, vcombine_f32(v2,v2) );
+#elif defined(_XM_SSE4_INTRINSICS_)
+    XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
+    XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
+    return _mm_mul_ps(vResult, V);
+#elif defined(_XM_SSE3_INTRINSICS_)
+    XMVECTOR vDot = _mm_mul_ps(V, V);
+    vDot = _mm_and_ps(vDot, g_XMMask3);
+    vDot = _mm_hadd_ps(vDot, vDot);
+    vDot = _mm_hadd_ps(vDot, vDot);
+    vDot = _mm_rsqrt_ps(vDot);
+    vDot = _mm_mul_ps(vDot,V);
+    return vDot;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product
+    XMVECTOR vDot = _mm_mul_ps(V,V);
+    // x=Dot.y, y=Dot.z
+    XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1));
+    // Result.x = x+y
+    vDot = _mm_add_ss(vDot,vTemp);
+    // x=Dot.z
+    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
+    // Result.x = (x+y)+z
+    vDot = _mm_add_ss(vDot,vTemp);
+    // Splat x
+    vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0));
+    // Get the reciprocal
+    vDot = _mm_rsqrt_ps(vDot);
+    // Perform the normalization
+    vDot = _mm_mul_ps(vDot,V);
+    return vDot;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3Normalize
+(
+    FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    float fLength;
+    XMVECTOR vResult;
+
+    vResult = XMVector3Length( V );
+    fLength = vResult.vector4_f32[0];
+
+    // Prevent divide by zero
+    if (fLength > 0) {
+        fLength = 1.0f/fLength;
+    }
+
+    vResult.vector4_f32[0] = V.vector4_f32[0]*fLength;
+    vResult.vector4_f32[1] = V.vector4_f32[1]*fLength;
+    vResult.vector4_f32[2] = V.vector4_f32[2]*fLength;
+    vResult.vector4_f32[3] = V.vector4_f32[3]*fLength;
+    return vResult;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    // Dot3
+    float32x4_t vTemp = vmulq_f32( V, V );
+    float32x2_t v1 = vget_low_f32( vTemp );
+    float32x2_t v2 = vget_high_f32( vTemp );
+    v1 = vpadd_f32( v1, v1 );
+    v2 = vdup_lane_f32( v2, 0 );
+    v1 = vadd_f32( v1, v2 );
+    uint32x2_t VEqualsZero = vceq_f32( v1, vdup_n_f32(0) );
+    uint32x2_t VEqualsInf = vceq_f32( v1, vget_low_f32(g_XMInfinity) );
+    // Reciprocal sqrt (2 iterations of Newton-Raphson)
+    float32x2_t S0 = vrsqrte_f32( v1 );
+    float32x2_t P0 = vmul_f32( v1, S0 );
+    float32x2_t R0 = vrsqrts_f32( P0, S0 );
+    float32x2_t S1 = vmul_f32( S0, R0 );
+    float32x2_t P1 = vmul_f32( v1, S1 );
+    float32x2_t R1 = vrsqrts_f32( P1, S1 );
+    v2 = vmul_f32( S1, R1 );
+    // Normalize
+    XMVECTOR vResult = vmulq_f32( V, vcombine_f32(v2,v2) );
+    vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult );
+    return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult );
+#elif defined(_XM_SSE4_INTRINSICS_)
+    XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x7f );
+    // Prepare for the division
+    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    XMVECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
+    // Failsafe on zero (or epsilon) length vectors
+    // If the length is infinity, set the elements to zero
+    vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
+    // Divide to perform the normalization
+    vResult = _mm_div_ps(V,vResult);
+    // Any that are infinity, set to zero
+    vResult = _mm_and_ps(vResult,vZeroMask);
+    // Select qnan or result based on infinite length
+    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
+    XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
+    vResult = _mm_or_ps(vTemp1,vTemp2);
+    return vResult;
+#elif defined(_XM_SSE3_INTRINSICS_)
+    // Perform the dot product on x,y and z only
+    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
+    vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3);
+    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
+    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
+    // Prepare for the division
+    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    XMVECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
+    // Failsafe on zero (or epsilon) length vectors
+    // If the length is infinity, set the elements to zero
+    vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
+    // Divide to perform the normalization
+    vResult = _mm_div_ps(V,vResult);
+    // Any that are infinity, set to zero
+    vResult = _mm_and_ps(vResult,vZeroMask);
+    // Select qnan or result based on infinite length
+    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
+    XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
+    vResult = _mm_or_ps(vTemp1,vTemp2);
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product on x,y and z only
+    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,1,2,1));
+    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
+    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
+    // Prepare for the division
+    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    XMVECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
+    // Failsafe on zero (or epsilon) length vectors
+    // If the length is infinity, set the elements to zero
+    vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
+    // Divide to perform the normalization
+    vResult = _mm_div_ps(V,vResult);
+    // Any that are infinity, set to zero
+    vResult = _mm_and_ps(vResult,vZeroMask);
+    // Select qnan or result based on infinite length
+    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
+    XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
+    vResult = _mm_or_ps(vTemp1,vTemp2);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3ClampLength
+(
+    FXMVECTOR V,
+    float LengthMin,
+    float LengthMax
+)
+{
+    XMVECTOR ClampMax = XMVectorReplicate(LengthMax);
+    XMVECTOR ClampMin = XMVectorReplicate(LengthMin);
+
+    return XMVector3ClampLengthV(V, ClampMin, ClampMax);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3ClampLengthV
+(
+    FXMVECTOR V,
+    FXMVECTOR LengthMin,
+    FXMVECTOR LengthMax
+)
+{
+    assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)));
+    assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)));
+
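+    // The bounds must be replicated (splatted) scalars - XMVector3ClampLength
+    // above builds them with XMVectorReplicate - and, per the asserts below,
+    // must satisfy 0 <= LengthMin <= LengthMax.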
assert(XMVector3GreaterOrEqual(LengthMin, XMVectorZero())); + assert(XMVector3GreaterOrEqual(LengthMax, XMVectorZero())); + assert(XMVector3GreaterOrEqual(LengthMax, LengthMin)); + + XMVECTOR LengthSq = XMVector3LengthSq(V); + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); + + XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); + + XMVECTOR Normal = XMVectorMultiply(V, RcpLength); + + XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); + + XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); + XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); + + XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result = XMVector3Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + float RefractionIndex +) +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector3RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) +{ + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR IDotN = XMVector3Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); + R = XMVectorMultiply(R, RefractionIndex); + R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); + + if (XMVector4LessOrEqual(R, Zero)) + { + // Total internal reflection + return Zero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = XMVectorSqrt(R); + R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); + + // Result = RefractionIndex * Incident - Normal * R + XMVECTOR Result = XMVectorMultiply(RefractionIndex, Incident); + Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); + + return Result; + } + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR IDotN = XMVector3Dot(Incident,Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + float32x4_t R = vmlsq_f32( g_XMOne, IDotN, IDotN); + R = vmulq_f32(R, RefractionIndex); + R 
= vmlsq_f32(g_XMOne, R, RefractionIndex );
+
+    uint32x4_t vResult = vcleq_f32(R,g_XMZero);
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    if ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU )
+    {
+        // Total internal reflection
+        vResult = g_XMZero;
+    }
+    else
+    {
+        // Sqrt(R)
+        float32x4_t S0 = vrsqrteq_f32(R);
+        float32x4_t P0 = vmulq_f32( R, S0 );
+        float32x4_t R0 = vrsqrtsq_f32( P0, S0 );
+        float32x4_t S1 = vmulq_f32( S0, R0 );
+        float32x4_t P1 = vmulq_f32( R, S1 );
+        float32x4_t R1 = vrsqrtsq_f32( P1, S1 );
+        float32x4_t S2 = vmulq_f32( S1, R1 );
+        R = vmulq_f32( R, S2 );
+        // R = RefractionIndex * IDotN + sqrt(R)
+        R = vmlaq_f32( R, RefractionIndex, IDotN );
+        // Result = RefractionIndex * Incident - Normal * R
+        vResult = vmulq_f32(RefractionIndex, Incident);
+        vResult = vmlsq_f32( vResult, R, Normal );
+    }
+    return vResult;
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
+    // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
+    XMVECTOR IDotN = XMVector3Dot(Incident, Normal);
+    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
+    XMVECTOR R = _mm_mul_ps(IDotN, IDotN);
+    R = _mm_sub_ps(g_XMOne,R);
+    R = _mm_mul_ps(R, RefractionIndex);
+    R = _mm_mul_ps(R, RefractionIndex);
+    R = _mm_sub_ps(g_XMOne,R);
+
+    XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero);
+    if (_mm_movemask_ps(vResult)==0x0f)
+    {
+        // Total internal reflection
+        vResult = g_XMZero;
+    }
+    else
+    {
+        // R = RefractionIndex * IDotN + sqrt(R)
+        R = _mm_sqrt_ps(R);
+        vResult = _mm_mul_ps(RefractionIndex,IDotN);
+        R = _mm_add_ps(R,vResult);
+        // Result = RefractionIndex * Incident - Normal * R
+        vResult = _mm_mul_ps(RefractionIndex, Incident);
+        R = _mm_mul_ps(R,Normal);
+        vResult = _mm_sub_ps(vResult,R);
+    }
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3Orthogonal
+(
+    FXMVECTOR V
+)
+{
+    XMVECTOR Zero = XMVectorZero();
+    XMVECTOR Z = XMVectorSplatZ(V);
+    XMVECTOR YZYY = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(V);
+
+    XMVECTOR NegativeV = XMVectorSubtract(Zero, V);
+
+    XMVECTOR ZIsNegative = XMVectorLess(Z, Zero);
+    XMVECTOR YZYYIsNegative = XMVectorLess(YZYY, Zero);
+
+    XMVECTOR S = XMVectorAdd(YZYY, Z);
+    XMVECTOR D = XMVectorSubtract(YZYY, Z);
+
+    XMVECTOR Select = XMVectorEqualInt(ZIsNegative, YZYYIsNegative);
+
+    XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(NegativeV, S);
+    XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(V, D);
+
+    return XMVectorSelect(R1, R0, Select);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst
+(
+    FXMVECTOR N1,
+    FXMVECTOR N2
+)
+{
+    XMVECTOR Result = XMVector3Dot(N1, N2);
+    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
+    Result = XMVectorACosEst(Result);
+    return Result;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals
+(
+    FXMVECTOR N1,
+    FXMVECTOR N2
+)
+{
+    XMVECTOR Result = XMVector3Dot(N1, N2);
+    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
+    Result = XMVectorACos(Result);
+    return Result;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+    XMVECTOR L1 =
XMVector3ReciprocalLength(V1); + XMVECTOR L2 = XMVector3ReciprocalLength(V2); + + XMVECTOR Dot = XMVector3Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3LinePointDistance +( + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2, + FXMVECTOR Point +) +{ + // Given a vector PointVector from LinePoint1 to Point and a vector + // LineVector from LinePoint1 to LinePoint2, the scaled distance + // PointProjectionScale from LinePoint1 to the perpendicular projection + // of PointVector onto the line is defined as: + // + // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) + + XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1); + XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1); + + XMVECTOR LengthSq = XMVector3LengthSq(LineVector); + + XMVECTOR PointProjectionScale = XMVector3Dot(PointVector, LineVector); + PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq); + + XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); + DistanceVector = XMVectorSubtract(PointVector, DistanceVector); + + return XMVector3Length(DistanceVector); +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XM_CALLCONV XMVector3ComponentsFromNormal +( + XMVECTOR* pParallel, + XMVECTOR* pPerpendicular, + FXMVECTOR V, + FXMVECTOR Normal +) +{ + assert(pParallel != nullptr); + assert(pPerpendicular != nullptr); + + XMVECTOR Scale = XMVector3Dot(V, Normal); + + XMVECTOR Parallel = XMVectorMultiply(Normal, Scale); + + *pParallel = Parallel; + *pPerpendicular = XMVectorSubtract(V, Parallel); +} + +//------------------------------------------------------------------------------ +// Transform a vector using a rotation expressed as a unit quaternion + +inline XMVECTOR XM_CALLCONV XMVector3Rotate +( + FXMVECTOR V, + FXMVECTOR RotationQuaternion +) +{ + XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); + XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); + XMVECTOR Result = XMQuaternionMultiply(Q, A); + return XMQuaternionMultiply(Result, RotationQuaternion); +} + +//------------------------------------------------------------------------------ +// Transform a vector using the inverse of a rotation expressed as a unit quaternion + +inline XMVECTOR XM_CALLCONV XMVector3InverseRotate +( + FXMVECTOR V, + FXMVECTOR RotationQuaternion +) +{ + XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); + XMVECTOR Result = XMQuaternionMultiply(RotationQuaternion, A); + XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); + return XMQuaternionMultiply(Result, Q); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Transform +( + FXMVECTOR V, + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( V ); + XMVECTOR vResult = vmlaq_lane_f32( M.r[3], M.r[0], VL, 
0 ); // X + vResult = vmlaq_lane_f32( vResult, M.r[1], VL, 1 ); // Y + return vmlaq_lane_f32( vResult, M.r[2], vget_high_f32( V ), 0 ); // Z +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,M.r[0]); + XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + vTemp = _mm_mul_ps(vTemp,M.r[1]); + vResult = _mm_add_ps(vResult,vTemp); + vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + vTemp = _mm_mul_ps(vTemp,M.r[2]); + vResult = _mm_add_ps(vResult,vTemp); + vResult = _mm_add_ps(vResult,M.r[3]); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT4* XM_CALLCONV XMVector3TransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT4)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4)); + +#if defined(_XM_NO_INTRINSICS_) + + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat4((XMFLOAT4*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT4))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT3)*4; + + float32x2_t r3 = vget_low_f32( row3 ); + float32x2_t r = vget_low_f32( row0 ); + XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N + + __prefetch( pInputVector ); + + r3 = vget_high_f32( row3 ); + r = vget_high_f32( row0 ); + XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O + XMVECTOR vResult3 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P + + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( row1 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( row1 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O + vResult3 = 
vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + r = vget_low_f32( row2 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); + + r = vget_high_f32( row2 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O + vResult3 = vmlaq_lane_f32( vResult3, V.val[2], r, 1 ); // Dx+Hy+Lz+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); + + float32x4x4_t R; + R.val[0] = vResult0; + R.val[1] = vResult1; + R.val[2] = vResult2; + R.val[3] = vResult3; + + vst4q_f32( reinterpret_cast(pOutputVector), R ); + pOutputVector += sizeof(XMFLOAT4)*4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32( row3, row0, VL, 0 ); // X + vResult = vmlaq_lane_f32( vResult, row1, VL, 1); // Y + vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z + + vst1q_f32( reinterpret_cast(pOutputVector), vResult ); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) ) + { + // Packed input, aligned output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, 
vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + i += 4; + } + } + else + { + // Packed input, unaligned output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) ) + { + // Aligned output + for (; i < VectorCount; ++i) + { + #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, 
row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XM_STREAM_PS( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + } + } + else + { + // Unaligned output + for (; i < VectorCount; ++i) + { + #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + _mm_storeu_ps( reinterpret_cast(pOutputVector), vTemp ); + pOutputVector += OutputStride; + } + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3TransformCoord +( + FXMVECTOR V, + FXMMATRIX M +) +{ + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + XMVECTOR W = XMVectorSplatW(Result); + return XMVectorDivide( Result, W ); +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMVECTOR W = XMVectorSplatW(Result); + + Result = XMVectorDivide(Result, W); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if 
((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT3)*4; + + float32x2_t r3 = vget_low_f32( row3 ); + float32x2_t r = vget_low_f32( row0 ); + XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N + + __prefetch( pInputVector ); + + r3 = vget_high_f32( row3 ); + r = vget_high_f32( row0 ); + XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O + XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P + + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( row1 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( row1 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O + W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + r = vget_low_f32( row2 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); + + r = vget_high_f32( row2 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O + W = vmlaq_lane_f32( W, V.val[2], r, 1 ); // Dx+Hy+Lz+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); + + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + + V.val[0] = vmulq_f32( vResult0, Reciprocal ); + V.val[1] = vmulq_f32( vResult1, Reciprocal ); + V.val[2] = vmulq_f32( vResult2, Reciprocal ); + + vst3q_f32( reinterpret_cast(pOutputVector),V ); + pOutputVector += sizeof(XMFLOAT3)*4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32( row3, row0, VL, 0 ); // X + vResult = vmlaq_lane_f32( vResult, row1, VL, 1 ); // Y + vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z + + VH = vget_high_f32(vResult); + XMVECTOR W = vdupq_lane_f32( VH, 1 ); + + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x4_t Reciprocal = vrecpeq_f32( W ); + float32x4_t S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + + vResult = vmulq_f32( vResult, Reciprocal ); + + VL = vget_low_f32( vResult ); + vst1_f32( reinterpret_cast(pOutputVector), VL ); + vst1q_lane_f32( reinterpret_cast(pOutputVector)+2, vResult, 2 ); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + size_t i = 0; + 
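+    // Process the stream four vectors at a time when the input is packed;
+    // any remainder (or strided input) is handled by the scalar loop below.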
size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if ( !((uintptr_t)pOutputStream & 0xF) ) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V1 = _mm_div_ps( vTemp, W ); + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V2 = _mm_div_ps( vTemp, W ); + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V3 = _mm_div_ps( vTemp, W ); + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V4 = _mm_div_ps( vTemp, W ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS( reinterpret_cast(pOutputVector), V1 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+16), vTemp ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + 
XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V1 = _mm_div_ps( vTemp, W ); + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V2 = _mm_div_ps( vTemp, W ); + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V3 = _mm_div_ps( vTemp, W ); + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + V4 = _mm_div_ps( vTemp, W ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps( reinterpret_cast(pOutputVector), V1 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+16), vTemp ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = 
_mm_div_ps( vTemp, W ); + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, row3 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + + vTemp = _mm_div_ps( vTemp, W ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3TransformNormal +( + FXMVECTOR V, + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(Z, M.r[2]); + Result = XMVectorMultiplyAdd(Y, M.r[1], Result); + Result = XMVectorMultiplyAdd(X, M.r[0], Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x2_t VL = vget_low_f32( V ); + XMVECTOR vResult = vmulq_lane_f32( M.r[0], VL, 0 ); // X + vResult = vmlaq_lane_f32( vResult, M.r[1], VL, 1 ); // Y + return vmlaq_lane_f32( vResult, M.r[2], vget_high_f32( V ), 0 ); // Z +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,M.r[0]); + XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + vTemp = _mm_mul_ps(vTemp,M.r[1]); + vResult = _mm_add_ps(vResult,vTemp); + vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + vTemp = _mm_mul_ps(vTemp,M.r[2]); + vResult = 
_mm_add_ps(vResult,vTemp); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + FXMMATRIX M +) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(Z, row2); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT3)*4; + + float32x2_t r = vget_low_f32( row0 ); + XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax + XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx + + __prefetch( pInputVector ); + + r = vget_high_f32( row0 ); + XMVECTOR vResult2 = vmulq_lane_f32( V.val[0], r, 0 ); // Cx + + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( row1 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( row1 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + r = vget_low_f32( row2 ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz + vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); + + r = vget_high_f32( row2 ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); + + V.val[0] = vResult0; + V.val[1] = vResult1; + V.val[2] = vResult2; + + vst3q_f32( reinterpret_cast(pOutputVector), V ); + pOutputVector += sizeof(XMFLOAT3)*4; + + i += 4; + } + } + } + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); + pInputVector += InputStride; + + 
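+        // Normals use only the rotational part of M (rows 0-2); unlike
+        // XMVector3TransformStream there is no row3 translation term.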
XMVECTOR vResult = vmulq_lane_f32( row0, VL, 0 ); // X + vResult = vmlaq_lane_f32( vResult, row1, VL, 1 ); // Y + vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z + + VL = vget_low_f32( vResult ); + vst1_f32( reinterpret_cast(pOutputVector), VL ); + vst1q_lane_f32( reinterpret_cast(pOutputVector)+2, vResult, 2 ); + pOutputVector += OutputStride; + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if ( !((uintptr_t)pOutputStream & 0xF) ) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V1 = _mm_add_ps( vTemp, vTemp3 ); + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V2 = _mm_add_ps( vTemp, vTemp3 ); + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V3 = _mm_add_ps( vTemp, vTemp3 ); + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V4 = _mm_add_ps( vTemp, vTemp3 ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS( reinterpret_cast(pOutputVector), V1 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+16), vTemp ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( 
V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V1 = _mm_add_ps( vTemp, vTemp3 ); + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V2 = _mm_add_ps( vTemp, vTemp3 ); + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V3 = _mm_add_ps( vTemp, vTemp3 ); + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + V4 = _mm_add_ps( vTemp, vTemp3 ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps( reinterpret_cast(pOutputVector), V1 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+16), vTemp ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += 
OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, row2 ); + vTemp2 = _mm_mul_ps( Y, row1 ); + vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, row2 ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 ); + XMVECTOR vTemp3 = _mm_mul_ps( X, row0 ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Project +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + XMVECTOR Result = XMVector3TransformCoord(V, Transform); + + Result = XMVectorMultiplyAdd(Result, Scale, Offset); + + return Result; +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + const uint8_t* 
pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); + + XMVECTOR Result = XMVector3TransformCoord(V, Transform); + Result = XMVectorMultiplyAdd(Result, Scale, Offset); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + XMVECTOR ScaleX = vdupq_n_f32(HalfViewportWidth); + XMVECTOR ScaleY = vdupq_n_f32(-HalfViewportHeight); + XMVECTOR ScaleZ = vdupq_n_f32(ViewportMaxZ - ViewportMinZ); + + XMVECTOR OffsetX = vdupq_n_f32(ViewportX + HalfViewportWidth); + XMVECTOR OffsetY = vdupq_n_f32(ViewportY + HalfViewportHeight); + XMVECTOR OffsetZ = vdupq_n_f32(ViewportMinZ); + + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT3)*4; + + float32x2_t r3 = vget_low_f32( Transform.r[3] ); + float32x2_t r = vget_low_f32( Transform.r[0] ); + XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N + + __prefetch( pInputVector ); + + r3 = vget_high_f32( Transform.r[3] ); + r = vget_high_f32( Transform.r[0] ); + XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O + XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P + + __prefetch( pInputVector+XM_CACHE_LINE_SIZE ); + + r = vget_low_f32( Transform.r[1] ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( Transform.r[1] ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O + W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + r = vget_low_f32( Transform.r[2] ); + vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); + + r = vget_high_f32( Transform.r[2] ); + vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O + W = vmlaq_lane_f32( W, V.val[2], r, 1 ); // Dx+Hy+Lz+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); + + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + + vResult0 = vmulq_f32( vResult0, Reciprocal ); + vResult1 = vmulq_f32( vResult1, Reciprocal ); + vResult2 = vmulq_f32( vResult2, Reciprocal ); + + V.val[0] = vmlaq_f32( OffsetX, 
vResult0, ScaleX ); + V.val[1] = vmlaq_f32( OffsetY, vResult1, ScaleY ); + V.val[2] = vmlaq_f32( OffsetZ, vResult2, ScaleZ ); + + vst3q_f32( reinterpret_cast(pOutputVector),V ); + pOutputVector += sizeof(XMFLOAT3)*4; + + i += 4; + } + } + } + + if ( i < VectorCount) + { + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); + pInputVector += InputStride; + + XMVECTOR vResult = vmlaq_lane_f32( Transform.r[3], Transform.r[0], VL, 0 ); // X + vResult = vmlaq_lane_f32( vResult, Transform.r[1], VL, 1 ); // Y + vResult = vmlaq_lane_f32( vResult, Transform.r[2], VH, 0 ); // Z + + VH = vget_high_f32(vResult); + XMVECTOR W = vdupq_lane_f32( VH, 1 ); + + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x4_t Reciprocal = vrecpeq_f32( W ); + float32x4_t S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + + vResult = vmulq_f32( vResult, Reciprocal ); + + vResult = vmlaq_f32( Offset, vResult, Scale ); + + VL = vget_low_f32( vResult ); + vst1_f32( reinterpret_cast(pOutputVector), VL ); + vst1q_lane_f32( reinterpret_cast(pOutputVector)+2, vResult, 2 ); + pOutputVector += OutputStride; + } + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + const float HalfViewportWidth = ViewportWidth * 0.5f; + const float HalfViewportHeight = ViewportHeight * 0.5f; + + XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); + XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if ( !((uintptr_t)pOutputStream & 0xF) ) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V1 = _mm_add_ps( vTemp, Offset ); + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 
2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V2 = _mm_add_ps( vTemp, Offset ); + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V3 = _mm_add_ps( vTemp, Offset ); + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V4 = _mm_add_ps( vTemp, Offset ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS( reinterpret_cast(pOutputVector), V1 ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+16), vTemp ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V1 = _mm_add_ps( vTemp, Offset ); + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, 
_MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V2 = _mm_add_ps( vTemp, Offset ); + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V3 = _mm_add_ps( vTemp, Offset ); + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + V4 = _mm_add_ps( vTemp, Offset ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps( reinterpret_cast(pOutputVector), V1 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+16), vTemp ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + vTemp = _mm_add_ps( vTemp, Offset ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + vTemp = _mm_add_ps( vTemp, Offset ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + 
XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + vTemp = _mm_add_ps( vTemp, Offset ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + vTemp = _mm_add_ps( vTemp, Offset ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + vTemp = _mm_mul_ps( vTemp, Scale ); + vTemp = _mm_add_ps( vTemp, Offset ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector3Unproject +( + FXMVECTOR V, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World +) +{ + static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + XMVECTOR 
Result = XMVectorMultiplyAdd(V, Scale, Offset); + + return XMVector3TransformCoord(Result, Transform); +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream +( + XMFLOAT3* pOutputStream, + size_t OutputStride, + const XMFLOAT3* pInputStream, + size_t InputStride, + size_t VectorCount, + float ViewportX, + float ViewportY, + float ViewportWidth, + float ViewportHeight, + float ViewportMinZ, + float ViewportMaxZ, + FXMMATRIX Projection, + CXMMATRIX View, + CXMMATRIX World) +{ + assert(pOutputStream != nullptr); + assert(pInputStream != nullptr); + + assert(InputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(InputStride >= sizeof(XMFLOAT3)); + + assert(OutputStride >= sizeof(XMFLOAT3)); + _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3)); + +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = XMVectorMultiplyAdd(Scale, Offset, D.v); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); + + XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset); + + Result = XMVector3TransformCoord(Result, Transform); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + float sx = 1.f / (ViewportWidth * 0.5f); + float sy = 1.f / (-ViewportHeight * 0.5f); + float sz = 1.f / (ViewportMaxZ - ViewportMinZ); + + float ox = (-ViewportX * sx) - 1.f; + float oy = (-ViewportY * sy) + 1.f; + float oz = (-ViewportMinZ * sz); + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3))) + { + for (size_t j = 0; j < four; ++j) + { + float32x4x3_t V = vld3q_f32( reinterpret_cast(pInputVector) ); + pInputVector += sizeof(XMFLOAT3)*4; + + XMVECTOR ScaleX = vdupq_n_f32(sx); + XMVECTOR OffsetX = vdupq_n_f32(ox); + XMVECTOR VX = vmlaq_f32( OffsetX, ScaleX, V.val[0] ); + + float32x2_t r3 = vget_low_f32( Transform.r[3] ); + float32x2_t r = vget_low_f32( Transform.r[0] ); + XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), VX, r, 0 ); // Ax+M + XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), VX, r, 1 ); // Bx+N + + __prefetch( pInputVector ); + + r3 = vget_high_f32( Transform.r[3] ); + r = vget_high_f32( Transform.r[0] ); + XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), VX, r, 0 ); // Cx+O + XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), VX, r, 1 ); // Dx+P + + __prefetch( 
pInputVector+XM_CACHE_LINE_SIZE ); + + XMVECTOR ScaleY = vdupq_n_f32(sy); + XMVECTOR OffsetY = vdupq_n_f32(oy); + XMVECTOR VY = vmlaq_f32( OffsetY, ScaleY, V.val[1] ); + + r = vget_low_f32( Transform.r[1] ); + vResult0 = vmlaq_lane_f32( vResult0, VY, r, 0 ); // Ax+Ey+M + vResult1 = vmlaq_lane_f32( vResult1, VY, r, 1 ); // Bx+Fy+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) ); + + r = vget_high_f32( Transform.r[1] ); + vResult2 = vmlaq_lane_f32( vResult2, VY, r, 0 ); // Cx+Gy+O + W = vmlaq_lane_f32( W, VY, r, 1 ); // Dx+Hy+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) ); + + XMVECTOR ScaleZ = vdupq_n_f32(sz); + XMVECTOR OffsetZ = vdupq_n_f32(oz); + XMVECTOR VZ = vmlaq_f32( OffsetZ, ScaleZ, V.val[2] ); + + r = vget_low_f32( Transform.r[2] ); + vResult0 = vmlaq_lane_f32( vResult0, VZ, r, 0 ); // Ax+Ey+Iz+M + vResult1 = vmlaq_lane_f32( vResult1, VZ, r, 1 ); // Bx+Fy+Jz+N + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) ); + + r = vget_high_f32( Transform.r[2] ); + vResult2 = vmlaq_lane_f32( vResult2, VZ, r, 0 ); // Cx+Gy+Kz+O + W = vmlaq_lane_f32( W, VZ, r, 1 ); // Dx+Hy+Lz+P + + __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) ); + + // 2 iterations of Newton-Raphson refinement of reciprocal + float32x4_t Reciprocal = vrecpeq_f32(W); + float32x4_t S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + + V.val[0] = vmulq_f32( vResult0, Reciprocal ); + V.val[1] = vmulq_f32( vResult1, Reciprocal ); + V.val[2] = vmulq_f32( vResult2, Reciprocal ); + + vst3q_f32( reinterpret_cast(pOutputVector),V ); + pOutputVector += sizeof(XMFLOAT3)*4; + + i += 4; + } + } + } + + if (i < VectorCount) + { + float32x2_t ScaleL = vcreate_f32(((uint64_t)*(const uint32_t *)&sx) | ((uint64_t)(*(const uint32_t *)&sy) << 32)); + float32x2_t ScaleH = vcreate_f32((uint64_t)*(const uint32_t *)&sz); + + float32x2_t OffsetL = vcreate_f32(((uint64_t)*(const uint32_t *)&ox) | ((uint64_t)(*(const uint32_t *)&oy) << 32)); + float32x2_t OffsetH = vcreate_f32((uint64_t)*(const uint32_t *)&oz); + + for (; i < VectorCount; i++) + { + float32x2_t VL = vld1_f32( reinterpret_cast(pInputVector) ); + float32x2_t zero = vdup_n_f32(0); + float32x2_t VH = vld1_lane_f32( reinterpret_cast(pInputVector)+2, zero, 0 ); + pInputVector += InputStride; + + VL = vmla_f32( OffsetL, VL, ScaleL ); + VH = vmla_f32( OffsetH, VH, ScaleH ); + + XMVECTOR vResult = vmlaq_lane_f32( Transform.r[3], Transform.r[0], VL, 0 ); // X + vResult = vmlaq_lane_f32( vResult, Transform.r[1], VL, 1 ); // Y + vResult = vmlaq_lane_f32( vResult, Transform.r[2], VH, 0 ); // Z + + VH = vget_high_f32(vResult); + XMVECTOR W = vdupq_lane_f32( VH, 1 ); + + // 2 iterations of Newton-Raphson refinement of reciprocal for W + float32x4_t Reciprocal = vrecpeq_f32( W ); + float32x4_t S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, W ); + Reciprocal = vmulq_f32( S, Reciprocal ); + + vResult = vmulq_f32( vResult, Reciprocal ); + + VL = vget_low_f32( vResult ); + vst1_f32( reinterpret_cast(pOutputVector), VL ); + vst1q_lane_f32( reinterpret_cast(pOutputVector)+2, vResult, 2 ); + pOutputVector += OutputStride; + } + } + + return pOutputStream; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; + + XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); + Scale = XMVectorReciprocal(Scale); + + XMVECTOR Offset = 
XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); + Offset = _mm_mul_ps(Scale, Offset); + Offset = _mm_add_ps(Offset, D); + + XMMATRIX Transform = XMMatrixMultiply(World, View); + Transform = XMMatrixMultiply(Transform, Projection); + Transform = XMMatrixInverse(nullptr, Transform); + + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + size_t i = 0; + size_t four = VectorCount >> 2; + if ( four > 0 ) + { + if (InputStride == sizeof(XMFLOAT3)) + { + if (OutputStride == sizeof(XMFLOAT3)) + { + if ( !((uintptr_t)pOutputStream & 0xF) ) + { + // Packed input, aligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + V1 = _mm_mul_ps( V1, Scale ); + V1 = _mm_add_ps( V1, Offset ); + + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V1 = _mm_div_ps( vTemp, W ); + + // Result 2 + V2 = _mm_mul_ps( V2, Scale ); + V2 = _mm_add_ps( V2, Offset ); + + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V2 = _mm_div_ps( vTemp, W ); + + // Result 3 + V3 = _mm_mul_ps( V3, Scale ); + V3 = _mm_add_ps( V3, Offset ); + + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V3 = _mm_div_ps( vTemp, W ); + + // Result 4 + V4 = _mm_mul_ps( V4, Scale ); + V4 = _mm_add_ps( V4, Offset ); + + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V4 = _mm_div_ps( vTemp, W ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + XM_STREAM_PS( reinterpret_cast(pOutputVector), V1 ); + XM_STREAM_PS( 
reinterpret_cast(pOutputVector+16), vTemp ); + XM_STREAM_PS( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + else + { + // Packed input, unaligned & packed output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + V1 = _mm_mul_ps( V1, Scale ); + V1 = _mm_add_ps( V1, Offset ); + + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V1 = _mm_div_ps( vTemp, W ); + + // Result 2 + V2 = _mm_mul_ps( V2, Scale ); + V2 = _mm_add_ps( V2, Offset ); + + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V2 = _mm_div_ps( vTemp, W ); + + // Result 3 + V3 = _mm_mul_ps( V3, Scale ); + V3 = _mm_add_ps( V3, Offset ); + + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V3 = _mm_div_ps( vTemp, W ); + + // Result 4 + V4 = _mm_mul_ps( V4, Scale ); + V4 = _mm_add_ps( V4, Offset ); + + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + V4 = _mm_div_ps( vTemp, W ); + + // Pack and store the vectors + XM3PACK4INTO3(vTemp); + _mm_storeu_ps( reinterpret_cast(pOutputVector), V1 ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+16), vTemp ); + _mm_storeu_ps( reinterpret_cast(pOutputVector+32), V3 ); + pOutputVector += sizeof(XMFLOAT3)*4; + i += 4; + } + } + } + else + { + // Packed input, unpacked output + for (size_t j = 0; j < four; ++j) + { + __m128 V1 = _mm_loadu_ps( reinterpret_cast(pInputVector) ); + __m128 L2 = _mm_loadu_ps( reinterpret_cast(pInputVector+16) ); + __m128 L3 = _mm_loadu_ps( reinterpret_cast(pInputVector+32) ); + 
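+                    // Three unaligned 16-byte loads fetch 12 floats (four packed
+                    // XMFLOAT3 values); XM3UNPACK3INTO4 below deinterleaves them into
+                    // V1..V4, whose .w lanes are undefined but never stored back.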
pInputVector += sizeof(XMFLOAT3)*4; + + // Unpack the 4 vectors (.w components are junk) + XM3UNPACK3INTO4(V1,L2,L3); + + // Result 1 + V1 = _mm_mul_ps( V1, Scale ); + V1 = _mm_add_ps( V1, Offset ); + + XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 2 + V2 = _mm_mul_ps( V2, Scale ); + V2 = _mm_add_ps( V2, Offset ); + + Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 3 + V3 = _mm_mul_ps( V3, Scale ); + V3 = _mm_add_ps( V3, Offset ); + + Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + // Result 4 + V4 = _mm_mul_ps( V4, Scale ); + V4 = _mm_add_ps( V4, Offset ); + + Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) ); + Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) ); + X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) ); + + vTemp = _mm_mul_ps( Z, Transform.r[2] ); + vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + + i += 4; + } + } + } + } + + for (; i < VectorCount; i++) + { + #pragma prefast( suppress : 26019, "PREfast noise: Esp:1307" ) + XMVECTOR V = XMLoadFloat3(reinterpret_cast(pInputVector)); + pInputVector += InputStride; + + V = _mm_mul_ps( V, Scale ); + V = _mm_add_ps( V, Offset ); + + XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); + XMVECTOR Y = XM_PERMUTE_PS( V, 
_MM_SHUFFLE(1, 1, 1, 1) ); + XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); + + XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] ); + XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] ); + XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] ); + vTemp = _mm_add_ps( vTemp, Transform.r[3] ); + vTemp = _mm_add_ps( vTemp, vTemp2 ); + vTemp = _mm_add_ps( vTemp, vTemp3 ); + + XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) ); + vTemp = _mm_div_ps( vTemp, W ); + + #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" ) + XMStoreFloat3(reinterpret_cast(pOutputVector), vTemp); + pOutputVector += OutputStride; + } + + XM_SFENCE(); + + return pOutputStream; +#endif +} + +/**************************************************************************** + * + * 4D Vector + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4Equal +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2]) && (V1.vector4_f32[3] == V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)==0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4EqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector4EqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + + if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && + (V1.vector4_f32[1] == V2.vector4_f32[1]) && + (V1.vector4_f32[2] == V2.vector4_f32[2]) && + (V1.vector4_f32[3] == V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && + (V1.vector4_f32[1] != V2.vector4_f32[1]) && + (V1.vector4_f32[2] != V2.vector4_f32[2]) && + (V1.vector4_f32[3] != V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vceqq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp); + uint32_t CR = 0; + if (iTest==0xf) // All equal? + { + CR = XM_CRMASK_CR6TRUE; + } + else if (iTest==0) // All not equal? 
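+    // A mixed result (some lanes equal, some not) leaves CR == 0, so neither
+    // CR6 flag is set.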
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector4EqualInt
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2]) && (V1.vector4_u32[3] == V2.vector4_u32[3])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_u32( V1, V2 );
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
+    return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp))==0xf) != 0);
+#else
+    return XMComparisonAllTrue(XMVector4EqualIntR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline uint32_t XM_CALLCONV XMVector4EqualIntR
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    uint32_t CR = 0;
+    if (V1.vector4_u32[0] == V2.vector4_u32[0] &&
+        V1.vector4_u32[1] == V2.vector4_u32[1] &&
+        V1.vector4_u32[2] == V2.vector4_u32[2] &&
+        V1.vector4_u32[3] == V2.vector4_u32[3])
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (V1.vector4_u32[0] != V2.vector4_u32[0] &&
+        V1.vector4_u32[1] != V2.vector4_u32[1] &&
+        V1.vector4_u32[2] != V2.vector4_u32[2] &&
+        V1.vector4_u32[3] != V2.vector4_u32[3])
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_u32( V1, V2 );
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
+
+    uint32_t CR = 0;
+    if ( r == 0xFFFFFFFFU )
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if ( !r )
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
+    int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp));
+    uint32_t CR = 0;
+    if (iTest==0xf) // All equal?
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (iTest==0) // All not equal?
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#endif
+}
+
+inline bool XM_CALLCONV XMVector4NearEqual
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR Epsilon
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    float dx, dy, dz, dw;
+
+    dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]);
+    dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]);
+    dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]);
+    dw = fabsf(V1.vector4_f32[3]-V2.vector4_f32[3]);
+    return (((dx <= Epsilon.vector4_f32[0]) &&
+            (dy <= Epsilon.vector4_f32[1]) &&
+            (dz <= Epsilon.vector4_f32[2]) &&
+            (dw <= Epsilon.vector4_f32[3])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vDelta = vsubq_f32( V1, V2 );
+    uint32x4_t vResult = vacleq_f32( vDelta, Epsilon );
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Get the difference
+    XMVECTOR vDelta = _mm_sub_ps(V1,V2);
+    // Get the absolute value of the difference
+    XMVECTOR vTemp = _mm_setzero_ps();
+    vTemp = _mm_sub_ps(vTemp,vDelta);
+    vTemp = _mm_max_ps(vTemp,vDelta);
+    vTemp = _mm_cmple_ps(vTemp,Epsilon);
+    return ((_mm_movemask_ps(vTemp)==0xf) != 0);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector4NotEqual
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2]) || (V1.vector4_f32[3] != V2.vector4_f32[3])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_f32( V1, V2 );
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpneq_ps(V1,V2);
+    return ((_mm_movemask_ps(vTemp)) != 0);
+#else
+    return XMComparisonAnyFalse(XMVector4EqualR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector4NotEqualInt
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2]) || (V1.vector4_u32[3] != V2.vector4_u32[3])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vceqq_u32( V1, V2 );
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
+    return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp))!=0xF) != 0);
+#else
+    return XMComparisonAnyFalse(XMVector4EqualIntR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline bool XM_CALLCONV XMVector4Greater
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2]) && (V1.vector4_f32[3] > V2.vector4_f32[3])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vResult = vcgtq_f32( V1, V2 );
+    int8x8x2_t vTemp =
vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)==0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector4GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if (V1.vector4_f32[0] > V2.vector4_f32[0] && + V1.vector4_f32[1] > V2.vector4_f32[1] && + V1.vector4_f32[2] > V2.vector4_f32[2] && + V1.vector4_f32[3] > V2.vector4_f32[3]) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (V1.vector4_f32[0] <= V2.vector4_f32[0] && + V1.vector4_f32[1] <= V2.vector4_f32[1] && + V1.vector4_f32[2] <= V2.vector4_f32[2] && + V1.vector4_f32[3] <= V2.vector4_f32[3]) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgtq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + uint32_t CR = 0; + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp); + if (iTest==0xf) { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2]) && (V1.vector4_f32[3] >= V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)==0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XM_CALLCONV XMVector4GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1]) && + (V1.vector4_f32[2] >= V2.vector4_f32[2]) && + (V1.vector4_f32[3] >= V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1]) && + (V1.vector4_f32[2] < V2.vector4_f32[2]) && + (V1.vector4_f32[3] < V2.vector4_f32[3])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcgeq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) 
+ { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + uint32_t CR = 0; + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp); + if (iTest==0x0f) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2]) && (V1.vector4_f32[3] < V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcltq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)==0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterR(V2, V1)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2]) && (V1.vector4_f32[3] <= V2.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x4_t vResult = vcleq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1,V2); + return ((_mm_movemask_ps(vTemp)==0x0f) != 0); +#else + return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V2, V1)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && + (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) && + (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + uint32x4_t ivTemp1 = vcleq_f32(V,Bounds); + // Negate the bounds + float32x4_t vTemp2 = vnegq_f32(Bounds); + // Test if greater or equal (Reversed) + uint32x4_t ivTemp2 = vcleq_f32(vTemp2,V); + // Blend answers + ivTemp1 = vandq_u32(ivTemp1,ivTemp2); + // in bounds? + int8x8x2_t vTemp = vzip_u8(vget_low_u8(ivTemp1), vget_high_u8(ivTemp1)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + // All in bounds? 
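+    // (_mm_movemask_ps packs the sign bit of each of the four lanes into the
+    // low four bits of an int, so 0x0f means every component passed both the
+    // upper and the lower bound test)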
+ return ((_mm_movemask_ps(vTemp1)==0x0f) != 0); +#else + return XMComparisonAllInBounds(XMVector4InBoundsR(V, Bounds)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4IsNaN +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1]) || + XMISNAN(V.vector4_f32[2]) || + XMISNAN(V.vector4_f32[3])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test against itself. NaN is always not equal + uint32x4_t vTempNan = vceqq_f32( V, V ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempNan), vget_high_u8(vTempNan)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + // If any are NaN, the mask is zero + return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); + // If any are NaN, the mask is non-zero + return (_mm_movemask_ps(vTempNan)!=0); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XM_CALLCONV XMVector4IsInfinite +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1]) || + XMISINF(V.vector4_f32[2]) || + XMISINF(V.vector4_f32[3])); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + uint32x4_t vTempInf = vandq_u32( V, g_XMAbsMask ); + // Compare to infinity + vTempInf = vceqq_f32(vTempInf, g_XMInfinity ); + // If any are infinity, the signs are true. + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempInf), vget_high_u8(vTempInf)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( vget_lane_u32(vTemp.val[1], 1) != 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + XMVECTOR vTemp = _mm_and_ps(V,g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); + // If any are infinity, the signs are true. 
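+    // (masking off the sign bit first makes -INF compare equal to +INF, so a
+    // single equality test catches infinities of either sign)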
+ return (_mm_movemask_ps(vTemp) != 0); +#endif +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = + Result.vector4_f32[1] = + Result.vector4_f32[2] = + Result.vector4_f32[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2] + V1.vector4_f32[3] * V2.vector4_f32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4_t vTemp = vmulq_f32( V1, V2 ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vadd_f32( v1, v2 ); + v1 = vpadd_f32( v1, v1 ); + return vcombine_f32( v1, v1 ); +#elif defined(_XM_SSE4_INTRINSICS_) + return _mm_dp_ps( V1, V2, 0xff ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vTemp = _mm_mul_ps(V1, V2); + vTemp = _mm_hadd_ps(vTemp, vTemp); + return _mm_hadd_ps(vTemp, vTemp); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp2 = V2; + XMVECTOR vTemp = _mm_mul_ps(V1,vTemp2); + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position + vTemp2 = _mm_add_ps(vTemp2,vTemp); // Add Z = X+Z; W = Y+W; + vTemp = _mm_shuffle_ps(vTemp,vTemp2,_MM_SHUFFLE(0,3,0,0)); // Copy W to the Z position + vTemp = _mm_add_ps(vTemp,vTemp2); // Add Z and W together + return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(2,2,2,2)); // Splat Z and return +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Cross +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ + // [ ((v2.z*v3.w-v2.w*v3.z)*v1.y)-((v2.y*v3.w-v2.w*v3.y)*v1.z)+((v2.y*v3.z-v2.z*v3.y)*v1.w), + // ((v2.w*v3.z-v2.z*v3.w)*v1.x)-((v2.w*v3.x-v2.x*v3.w)*v1.z)+((v2.z*v3.x-v2.x*v3.z)*v1.w), + // ((v2.y*v3.w-v2.w*v3.y)*v1.x)-((v2.x*v3.w-v2.w*v3.x)*v1.y)+((v2.x*v3.y-v2.y*v3.x)*v1.w), + // ((v2.z*v3.y-v2.y*v3.z)*v1.x)-((v2.z*v3.x-v2.x*v3.z)*v1.y)+((v2.y*v3.x-v2.x*v3.y)*v1.z) ] + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + + Result.vector4_f32[0] = (((V2.vector4_f32[2]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[2]))*V1.vector4_f32[1])-(((V2.vector4_f32[1]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[1]))*V1.vector4_f32[2])+(((V2.vector4_f32[1]*V3.vector4_f32[2])-(V2.vector4_f32[2]*V3.vector4_f32[1]))*V1.vector4_f32[3]); + Result.vector4_f32[1] = (((V2.vector4_f32[3]*V3.vector4_f32[2])-(V2.vector4_f32[2]*V3.vector4_f32[3]))*V1.vector4_f32[0])-(((V2.vector4_f32[3]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[3]))*V1.vector4_f32[2])+(((V2.vector4_f32[2]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[2]))*V1.vector4_f32[3]); + Result.vector4_f32[2] = (((V2.vector4_f32[1]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[1]))*V1.vector4_f32[0])-(((V2.vector4_f32[0]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[0]))*V1.vector4_f32[1])+(((V2.vector4_f32[0]*V3.vector4_f32[1])-(V2.vector4_f32[1]*V3.vector4_f32[0]))*V1.vector4_f32[3]); + Result.vector4_f32[3] = 
(((V2.vector4_f32[2]*V3.vector4_f32[1])-(V2.vector4_f32[1]*V3.vector4_f32[2]))*V1.vector4_f32[0])-(((V2.vector4_f32[2]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[2]))*V1.vector4_f32[1])+(((V2.vector4_f32[1]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[1]))*V1.vector4_f32[2]); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + const float32x2_t select = vget_low_f32( g_XMMaskX ); + + // Term1: V2zwyz * V3wzwy + const float32x2_t v2xy = vget_low_f32(V2); + const float32x2_t v2zw = vget_high_f32(V2); + const float32x2_t v2yx = vrev64_f32(v2xy); + const float32x2_t v2wz = vrev64_f32(v2zw); + const float32x2_t v2yz = vbsl_f32( select, v2yx, v2wz ); + + const float32x2_t v3zw = vget_high_f32(V3); + const float32x2_t v3wz = vrev64_f32(v3zw); + const float32x2_t v3xy = vget_low_f32(V3); + const float32x2_t v3wy = vbsl_f32( select, v3wz, v3xy ); + + float32x4_t vTemp1 = vcombine_f32(v2zw,v2yz); + float32x4_t vTemp2 = vcombine_f32(v3wz,v3wy); + XMVECTOR vResult = vmulq_f32( vTemp1, vTemp2 ); + + // - V2wzwy * V3zwyz + const float32x2_t v2wy = vbsl_f32( select, v2wz, v2xy ); + + const float32x2_t v3yx = vrev64_f32(v3xy); + const float32x2_t v3yz = vbsl_f32( select, v3yx, v3wz ); + + vTemp1 = vcombine_f32(v2wz,v2wy); + vTemp2 = vcombine_f32(v3zw,v3yz); + vResult = vmlsq_f32( vResult, vTemp1, vTemp2 ); + + // term1 * V1yxxx + const float32x2_t v1xy = vget_low_f32(V1); + const float32x2_t v1yx = vrev64_f32(v1xy); + + vTemp1 = vcombine_f32( v1yx, vdup_lane_f32( v1yx, 1 ) ); + vResult = vmulq_f32( vResult, vTemp1 ); + + // Term2: V2ywxz * V3wxwx + const float32x2_t v2yw = vrev64_f32(v2wy); + const float32x2_t v2xz = vbsl_f32( select, v2xy, v2wz ); + + const float32x2_t v3wx = vbsl_f32( select, v3wz, v3yx ); + + vTemp1 = vcombine_f32(v2yw,v2xz); + vTemp2 = vcombine_f32(v3wx,v3wx); + float32x4_t vTerm = vmulq_f32( vTemp1, vTemp2 ); + + // - V2wxwx * V3ywxz + const float32x2_t v2wx = vbsl_f32( select, v2wz, v2yx ); + + const float32x2_t v3yw = vrev64_f32(v3wy); + const float32x2_t v3xz = vbsl_f32( select, v3xy, v3wz ); + + vTemp1 = vcombine_f32(v2wx,v2wx); + vTemp2 = vcombine_f32(v3yw,v3xz); + vTerm = vmlsq_f32( vTerm, vTemp1, vTemp2 ); + + // vResult - term2 * V1zzyy + const float32x2_t v1zw = vget_high_f32(V1); + + vTemp1 = vcombine_f32( vdup_lane_f32(v1zw, 0), vdup_lane_f32(v1yx, 0) ); + vResult = vmlsq_f32( vResult, vTerm, vTemp1 ); + + // Term3: V2yzxy * V3zxyx + const float32x2_t v3zx = vrev64_f32(v3xz); + + vTemp1 = vcombine_f32(v2yz,v2xy); + vTemp2 = vcombine_f32(v3zx,v3yx); + vTerm = vmulq_f32( vTemp1, vTemp2 ); + + // - V2zxyx * V3yzxy + const float32x2_t v2zx = vrev64_f32(v2xz); + + vTemp1 = vcombine_f32(v2zx,v2yx); + vTemp2 = vcombine_f32(v3yz,v3xy); + vTerm = vmlsq_f32( vTerm, vTemp1, vTemp2 ); + + // vResult + term3 * V1wwwz + const float32x2_t v1wz = vrev64_f32(v1zw); + + vTemp1 = vcombine_f32( vdup_lane_f32( v1wz, 0 ), v1wz ); + return vmlaq_f32( vResult, vTerm, vTemp1 ); +#elif defined(_XM_SSE_INTRINSICS_) + // V2zwyz * V3wzwy + XMVECTOR vResult = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,1,3,2)); + XMVECTOR vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(1,3,2,3)); + vResult = _mm_mul_ps(vResult,vTemp3); + // - V2wzwy * V3zwyz + XMVECTOR vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,3,2,3)); + vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(1,3,0,1)); + vTemp2 = _mm_mul_ps(vTemp2,vTemp3); + vResult = _mm_sub_ps(vResult,vTemp2); + // term1 * V1yxxx + XMVECTOR vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(0,0,0,1)); + vResult = _mm_mul_ps(vResult,vTemp1); + + // V2ywxz * V3wxwx + vTemp2 = 
XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,0,3,1)); + vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(0,3,0,3)); + vTemp3 = _mm_mul_ps(vTemp3,vTemp2); + // - V2wxwx * V3ywxz + vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(2,1,2,1)); + vTemp1 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(2,0,3,1)); + vTemp2 = _mm_mul_ps(vTemp2,vTemp1); + vTemp3 = _mm_sub_ps(vTemp3,vTemp2); + // vResult - temp * V1zzyy + vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(1,1,2,2)); + vTemp1 = _mm_mul_ps(vTemp1,vTemp3); + vResult = _mm_sub_ps(vResult,vTemp1); + + // V2yzxy * V3zxyx + vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,0,2,1)); + vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(0,1,0,2)); + vTemp3 = _mm_mul_ps(vTemp3,vTemp2); + // - V2zxyx * V3yzxy + vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(2,0,2,1)); + vTemp1 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(1,0,2,1)); + vTemp1 = _mm_mul_ps(vTemp1,vTemp2); + vTemp3 = _mm_sub_ps(vTemp3,vTemp1); + // vResult + term * V1wwwz + vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(2,3,3,3)); + vTemp3 = _mm_mul_ps(vTemp3,vTemp1); + vResult = _mm_add_ps(vResult,vTemp3); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4LengthSq +( + FXMVECTOR V +) +{ + return XMVector4Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vadd_f32( v1, v2 ); + v1 = vpadd_f32( v1, v1 ); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32( v1 ); + return vcombine_f32(v2, v2); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + return _mm_rsqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? 
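+    // (after the add below, the z lane holds the full sum x+y+z+w, which is
+    // then splatted to all four lanes)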
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Get the reciprocal + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vadd_f32( v1, v2 ); + v1 = vpadd_f32( v1, v1 ); + // Reciprocal sqrt + float32x2_t S0 = vrsqrte_f32(v1); + float32x2_t P0 = vmul_f32( v1, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( v1, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + float32x2_t Result = vmul_f32( S1, R1 ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp ); + return _mm_div_ps( g_XMOne, vLengthSq ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + vLengthSq = _mm_div_ps(g_XMOne, vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Get the reciprocal + vLengthSq = _mm_sqrt_ps(vLengthSq); + // Accurate! 
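+    // (a true square root and divide rather than _mm_rsqrt_ps: slower than
+    // the Est variant above, but full precision)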
+ vLengthSq = _mm_div_ps(g_XMOne,vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4LengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vadd_f32( v1, v2 ); + v1 = vpadd_f32( v1, v1 ); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32( v1, zero ); + // Sqrt (estimate) + float32x2_t Result = vrsqrte_f32( v1 ); + Result = vmul_f32( v1, Result ); + Result = vbsl_f32( VEqualsZero, zero, Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + return _mm_sqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Length +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vadd_f32( v1, v2 ); + v1 = vpadd_f32( v1, v1 ); + const float32x2_t zero = vdup_n_f32(0); + uint32x2_t VEqualsZero = vceq_f32( v1, zero ); + // Sqrt + float32x2_t S0 = vrsqrte_f32( v1 ); + float32x2_t P0 = vmul_f32( v1, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( v1, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + float32x2_t Result = vmul_f32( S1, R1 ); + Result = vmul_f32( v1, Result ); + Result = vbsl_f32( VEqualsZero, zero, Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + return _mm_sqrt_ps( vTemp ); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); 
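+    // (this horizontal reduction is written with plain SSE shuffles and adds
+    // because this path cannot assume SSE3 haddps or SSE4.1 dpps)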
+ // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#endif +} + +//------------------------------------------------------------------------------ +// XMVector4NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. + +inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector4ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vadd_f32( v1, v2 ); + v1 = vpadd_f32( v1, v1 ); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32( v1 ); + // Normalize + return vmulq_f32( V, vcombine_f32(v2,v2) ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff ); + XMVECTOR vResult = _mm_rsqrt_ps( vTemp ); + return _mm_mul_ps(vResult, V); +#elif defined(_XM_SSE3_INTRINSICS_) + XMVECTOR vDot = _mm_mul_ps(V, V); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_hadd_ps(vDot, vDot); + vDot = _mm_rsqrt_ps(vDot); + vDot = _mm_mul_ps(vDot, V); + return vDot; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? 
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Get the reciprocal + XMVECTOR vResult = _mm_rsqrt_ps(vLengthSq); + // Reciprocal mul to perform the normalization + vResult = _mm_mul_ps(vResult,V); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Normalize +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + float fLength; + XMVECTOR vResult; + + vResult = XMVector4Length( V ); + fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if (fLength > 0) { + fLength = 1.0f/fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; + vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; + vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; + vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + float32x4_t vTemp = vmulq_f32( V, V ); + float32x2_t v1 = vget_low_f32( vTemp ); + float32x2_t v2 = vget_high_f32( vTemp ); + v1 = vadd_f32( v1, v2 ); + v1 = vpadd_f32( v1, v1 ); + uint32x2_t VEqualsZero = vceq_f32( v1, vdup_n_f32(0) ); + uint32x2_t VEqualsInf = vceq_f32( v1, vget_low_f32(g_XMInfinity) ); + // Reciprocal sqrt (2 iterations of Newton-Raphson) + float32x2_t S0 = vrsqrte_f32( v1 ); + float32x2_t P0 = vmul_f32( v1, S0 ); + float32x2_t R0 = vrsqrts_f32( P0, S0 ); + float32x2_t S1 = vmul_f32( S0, R0 ); + float32x2_t P1 = vmul_f32( v1, S1 ); + float32x2_t R1 = vrsqrts_f32( P1, S1 ); + v2 = vmul_f32( S1, R1 ); + // Normalize + XMVECTOR vResult = vmulq_f32( V, vcombine_f32(v2,v2) ); + vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult ); + return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult ); +#elif defined(_XM_SSE4_INTRINSICS_) + XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0xff ); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +#elif defined(_XM_SSE3_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V, V); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or 
result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ClampLength +( + FXMVECTOR V, + float LengthMin, + float LengthMax +) +{ + XMVECTOR ClampMax = XMVectorReplicate(LengthMax); + XMVECTOR ClampMin = XMVectorReplicate(LengthMin); + + return XMVector4ClampLengthV(V, ClampMin, ClampMax); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) +{ + assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetW(LengthMin) == XMVectorGetX(LengthMin))); + assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetW(LengthMax) == XMVectorGetX(LengthMax))); + assert(XMVector4GreaterOrEqual(LengthMin, XMVectorZero())); + assert(XMVector4GreaterOrEqual(LengthMax, XMVectorZero())); + assert(XMVector4GreaterOrEqual(LengthMax, LengthMin)); + + XMVECTOR LengthSq = XMVector4LengthSq(V); + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); + + XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); + + XMVECTOR Normal = XMVectorMultiply(V, RcpLength); + + XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); + + XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); + XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); + + XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + XMVECTOR Result = XMVectorMultiply(Normal, 
ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result = XMVector4Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + float RefractionIndex +) +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector4RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR IDotN; + XMVECTOR R; + const XMVECTOR Zero = XMVectorZero(); + + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + + IDotN = XMVector4Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); + R = XMVectorMultiply(R, RefractionIndex); + R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); + + if (XMVector4LessOrEqual(R, Zero)) + { + // Total internal reflection + return Zero; + } + else + { + XMVECTOR Result; + + // R = RefractionIndex * IDotN + sqrt(R) + R = XMVectorSqrt(R); + R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); + + // Result = RefractionIndex * Incident - Normal * R + Result = XMVectorMultiply(RefractionIndex, Incident); + Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); + + return Result; + } + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR IDotN = XMVector4Dot(Incident,Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + float32x4_t R = vmlsq_f32( g_XMOne, IDotN, IDotN); + R = vmulq_f32(R, RefractionIndex); + R = vmlsq_f32(g_XMOne, R, RefractionIndex ); + + uint32x4_t vResult = vcleq_f32(R,g_XMZero); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + if ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // Sqrt(R) + float32x4_t S0 = vrsqrteq_f32(R); + float32x4_t P0 = vmulq_f32( R, S0 ); + float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); + float32x4_t S1 = vmulq_f32( S0, R0 ); + float32x4_t P1 = vmulq_f32( R, S1 ); + float32x4_t R1 = vrsqrtsq_f32( P1, S1 ); + float32x4_t S2 = vmulq_f32( S1, R1 ); + R = vmulq_f32( R, S2 ); + // R = RefractionIndex * IDotN + sqrt(R) + R = vmlaq_f32( R, RefractionIndex, IDotN ); + // Result = RefractionIndex * Incident - Normal * R + vResult = vmulq_f32(RefractionIndex, Incident); + vResult = vmlsq_f32( vResult, R, Normal ); + } + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR IDotN = XMVector4Dot(Incident,Normal); + + // R = 1.0f - 
RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR R = _mm_mul_ps(IDotN,IDotN); + R = _mm_sub_ps(g_XMOne,R); + R = _mm_mul_ps(R, RefractionIndex); + R = _mm_mul_ps(R, RefractionIndex); + R = _mm_sub_ps(g_XMOne,R); + + XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero); + if (_mm_movemask_ps(vResult)==0x0f) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = _mm_sqrt_ps(R); + vResult = _mm_mul_ps(RefractionIndex, IDotN); + R = _mm_add_ps(R,vResult); + // Result = RefractionIndex * Incident - Normal * R + vResult = _mm_mul_ps(RefractionIndex, Incident); + R = _mm_mul_ps(R,Normal); + vResult = _mm_sub_ps(vResult,R); + } + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Orthogonal +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = V.vector4_f32[2]; + Result.vector4_f32[1] = V.vector4_f32[3]; + Result.vector4_f32[2] = -V.vector4_f32[0]; + Result.vector4_f32[3] = -V.vector4_f32[1]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Negate = { 1.f, 1.f, -1.f, -1.f }; + + float32x4_t Result = vcombine_f32( vget_high_f32( V ), vget_low_f32( V ) ); + return vmulq_f32( Result, Negate ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FlipZW = {1.0f,1.0f,-1.0f,-1.0f}; + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,0,3,2)); + vResult = _mm_mul_ps(vResult,FlipZW); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ + XMVECTOR Result = XMVector4Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACosEst(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormals +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ + XMVECTOR Result = XMVector4Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACos(Result); + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenVectors +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + XMVECTOR L1 = XMVector4ReciprocalLength(V1); + XMVECTOR L2 = XMVector4ReciprocalLength(V2); + + XMVECTOR Dot = XMVector4Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV XMVector4Transform +( + FXMVECTOR V, + FXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + float fX = (M.m[0][0]*V.vector4_f32[0])+(M.m[1][0]*V.vector4_f32[1])+(M.m[2][0]*V.vector4_f32[2])+(M.m[3][0]*V.vector4_f32[3]); + float fY = (M.m[0][1]*V.vector4_f32[0])+(M.m[1][1]*V.vector4_f32[1])+(M.m[2][1]*V.vector4_f32[2])+(M.m[3][1]*V.vector4_f32[3]); + float fZ = (M.m[0][2]*V.vector4_f32[0])+(M.m[1][2]*V.vector4_f32[1])+(M.m[2][2]*V.vector4_f32[2])+(M.m[3][2]*V.vector4_f32[3]); + float fW = (M.m[0][3]*V.vector4_f32[0])+(M.m[1][3]*V.vector4_f32[1])+(M.m[2][3]*V.vector4_f32[2])+(M.m[3][3]*V.vector4_f32[3]); + XMVECTOR vResult; + 
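+    // (row-vector convention: each output component is the dot product of V
+    // with a column of M, i.e. the result is V*M)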
+    vResult.vector4_f32[0] = fX;
+    vResult.vector4_f32[1] = fY;
+    vResult.vector4_f32[2] = fZ;
+    vResult.vector4_f32[3] = fW;
+    return vResult;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x2_t VL = vget_low_f32( V );
+    XMVECTOR vResult = vmulq_lane_f32( M.r[0], VL, 0 ); // X
+    vResult = vmlaq_lane_f32( vResult, M.r[1], VL, 1 ); // Y
+    float32x2_t VH = vget_high_f32( V );
+    vResult = vmlaq_lane_f32( vResult, M.r[2], VH, 0 ); // Z
+    return vmlaq_lane_f32( vResult, M.r[3], VH, 1 ); // W
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Splat x,y,z and w
+    XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
+    XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
+    XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
+    XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
+    // Mul by the matrix
+    vTempX = _mm_mul_ps(vTempX,M.r[0]);
+    vTempY = _mm_mul_ps(vTempY,M.r[1]);
+    vTempZ = _mm_mul_ps(vTempZ,M.r[2]);
+    vTempW = _mm_mul_ps(vTempW,M.r[3]);
+    // Add them all together
+    vTempX = _mm_add_ps(vTempX,vTempY);
+    vTempZ = _mm_add_ps(vTempZ,vTempW);
+    vTempX = _mm_add_ps(vTempX,vTempZ);
+    return vTempX;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMFLOAT4* XM_CALLCONV XMVector4TransformStream
+(
+    XMFLOAT4*       pOutputStream,
+    size_t          OutputStride,
+    const XMFLOAT4* pInputStream,
+    size_t          InputStride,
+    size_t          VectorCount,
+    FXMMATRIX       M
+)
+{
+    assert(pOutputStream != nullptr);
+    assert(pInputStream != nullptr);
+
+    assert(InputStride >= sizeof(XMFLOAT4));
+    _Analysis_assume_(InputStride >= sizeof(XMFLOAT4));
+
+    assert(OutputStride >= sizeof(XMFLOAT4));
+    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4));
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
+    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row2 = M.r[2];
+    const XMVECTOR row3 = M.r[3];
+
+    for (size_t i = 0; i < VectorCount; i++)
+    {
+        XMVECTOR V = XMLoadFloat4((const XMFLOAT4*)pInputVector);
+        XMVECTOR W = XMVectorSplatW(V);
+        XMVECTOR Z = XMVectorSplatZ(V);
+        XMVECTOR Y = XMVectorSplatY(V);
+        XMVECTOR X = XMVectorSplatX(V);
+
+        XMVECTOR Result = XMVectorMultiply(W, row3);
+        Result = XMVectorMultiplyAdd(Z, row2, Result);
+        Result = XMVectorMultiplyAdd(Y, row1, Result);
+        Result = XMVectorMultiplyAdd(X, row0, Result);
+
+        #pragma prefast( suppress : 26015, "PREfast noise: Esp:1307" )
+        XMStoreFloat4((XMFLOAT4*)pOutputVector, Result);
+
+        pInputVector += InputStride;
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
+    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row2 = M.r[2];
+    const XMVECTOR row3 = M.r[3];
+
+    size_t i = 0;
+    size_t four = VectorCount >> 2;
+    if ( four > 0 )
+    {
+        if ((InputStride == sizeof(XMFLOAT4)) && (OutputStride == sizeof(XMFLOAT4)))
+        {
+            for (size_t j = 0; j < four; ++j)
+            {
+                float32x4x4_t V = vld4q_f32( reinterpret_cast<const float*>(pInputVector) );
+                pInputVector += sizeof(XMFLOAT4)*4;
+
+                float32x2_t r = vget_low_f32( row0 );
+                XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax
+                XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx
+
+                __prefetch( pInputVector );
+
+                r = vget_high_f32( row0 );
+                XMVECTOR vResult2 = vmulq_lane_f32( V.val[0], r, 0 ); // Cx
+                XMVECTOR vResult3 = vmulq_lane_f32( V.val[0], r, 1 ); // Dx
+
+                __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
+
+                r = vget_low_f32( row1 );
+                vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey
+                vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy
+
+                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
+
+                r = vget_high_f32( row1 );
+                vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy
+                vResult3 = vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy
+
+                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
+
+                r = vget_low_f32( row2 );
+                vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz
+                vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz
+
+                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) );
+
+                r = vget_high_f32( row2 );
+                vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz
+                vResult3 = vmlaq_lane_f32( vResult3, V.val[2], r, 1 ); // Dx+Hy+Lz
+
+                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
+
+                r = vget_low_f32( row3 );
+                vResult0 = vmlaq_lane_f32( vResult0, V.val[3], r, 0 ); // Ax+Ey+Iz+Mw
+                vResult1 = vmlaq_lane_f32( vResult1, V.val[3], r, 1 ); // Bx+Fy+Jz+Nw
+
+                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*6) );
+
+                r = vget_high_f32( row3 );
+                vResult2 = vmlaq_lane_f32( vResult2, V.val[3], r, 0 ); // Cx+Gy+Kz+Ow
+                vResult3 = vmlaq_lane_f32( vResult3, V.val[3], r, 1 ); // Dx+Hy+Lz+Pw
+
+                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*7) );
+
+                V.val[0] = vResult0;
+                V.val[1] = vResult1;
+                V.val[2] = vResult2;
+                V.val[3] = vResult3;
+
+                vst4q_f32( reinterpret_cast<float*>(pOutputVector), V );
+                pOutputVector += sizeof(XMFLOAT4)*4;
+
+                i += 4;
+            }
+        }
+    }
+
+    for (; i < VectorCount; i++)
+    {
+        XMVECTOR V = vld1q_f32( reinterpret_cast<const float*>(pInputVector) );
+        pInputVector += InputStride;
+
+        float32x2_t VL = vget_low_f32( V );
+        XMVECTOR vResult = vmulq_lane_f32( row0, VL, 0 ); // X
+        vResult = vmlaq_lane_f32( vResult, row1, VL, 1 ); // Y
+        float32x2_t VH = vget_high_f32( V );
+        vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z
+        vResult = vmlaq_lane_f32( vResult, row3, VH, 1 ); // W
+
+        vst1q_f32( reinterpret_cast<float*>(pOutputVector), vResult );
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+#elif defined(_XM_SSE_INTRINSICS_)
+    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
+    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row2 = M.r[2];
+    const XMVECTOR row3 = M.r[3];
+
+    if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) )
+    {
+        if ( !((uintptr_t)pInputStream & 0xF) && !(InputStride & 0xF) )
+        {
+            // Aligned input, aligned output
+            for (size_t i = 0; i < VectorCount; i++)
+            {
+                __m128 V = _mm_load_ps( reinterpret_cast<const float*>(pInputVector) );
+                pInputVector += InputStride;
+
+                XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
+                XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
+                XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
+                XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
+
+                vTempX = _mm_mul_ps(vTempX,row0);
+                vTempY = _mm_mul_ps(vTempY,row1);
+                vTempZ = _mm_mul_ps(vTempZ,row2);
+                vTempW = _mm_mul_ps(vTempW,row3);
+
+                vTempX = _mm_add_ps(vTempX,vTempY);
+                vTempZ = _mm_add_ps(vTempZ,vTempW);
+                vTempX = _mm_add_ps(vTempX,vTempZ);
+
+                XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTempX );
+                pOutputVector += OutputStride;
+            }
+        }
+        else
+        {
+            // Unaligned input, aligned output
+            for (size_t i = 0; i < VectorCount; i++)
+            {
+                __m128 V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
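+                // (the movups load above tolerates any input address; the
+                // streaming store below still relies on the 16-byte output
+                // alignment this branch verified)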
+                pInputVector += InputStride;
+
+                XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
+                XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
+                XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
+                XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
+
+                vTempX = _mm_mul_ps(vTempX,row0);
+                vTempY = _mm_mul_ps(vTempY,row1);
+                vTempZ = _mm_mul_ps(vTempZ,row2);
+                vTempW = _mm_mul_ps(vTempW,row3);
+
+                vTempX = _mm_add_ps(vTempX,vTempY);
+                vTempZ = _mm_add_ps(vTempZ,vTempW);
+                vTempX = _mm_add_ps(vTempX,vTempZ);
+
+                XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTempX );
+                pOutputVector += OutputStride;
+            }
+        }
+    }
+    else
+    {
+        if ( !((uintptr_t)pInputStream & 0xF) && !(InputStride & 0xF) )
+        {
+            // Aligned input, unaligned output
+            for (size_t i = 0; i < VectorCount; i++)
+            {
+                __m128 V = _mm_load_ps( reinterpret_cast<const float*>(pInputVector) );
+                pInputVector += InputStride;
+
+                XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
+                XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
+                XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
+                XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
+
+                vTempX = _mm_mul_ps(vTempX,row0);
+                vTempY = _mm_mul_ps(vTempY,row1);
+                vTempZ = _mm_mul_ps(vTempZ,row2);
+                vTempW = _mm_mul_ps(vTempW,row3);
+
+                vTempX = _mm_add_ps(vTempX,vTempY);
+                vTempZ = _mm_add_ps(vTempZ,vTempW);
+                vTempX = _mm_add_ps(vTempX,vTempZ);
+
+                _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTempX );
+                pOutputVector += OutputStride;
+            }
+        }
+        else
+        {
+            // Unaligned input, unaligned output
+            for (size_t i = 0; i < VectorCount; i++)
+            {
+                __m128 V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
+                pInputVector += InputStride;
+
+                XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
+                XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
+                XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
+                XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
+
+                vTempX = _mm_mul_ps(vTempX,row0);
+                vTempY = _mm_mul_ps(vTempY,row1);
+                vTempZ = _mm_mul_ps(vTempZ,row2);
+                vTempW = _mm_mul_ps(vTempW,row3);
+
+                vTempX = _mm_add_ps(vTempX,vTempY);
+                vTempZ = _mm_add_ps(vTempZ,vTempW);
+                vTempX = _mm_add_ps(vTempX,vTempZ);
+
+                _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTempX );
+                pOutputVector += OutputStride;
+            }
+        }
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#endif
+}
+
+/****************************************************************************
+ *
+ * XMVECTOR operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V)
+{
+    return V;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XM_CALLCONV operator- (FXMVECTOR V)
+{
+    return XMVectorNegate(V);
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR& XM_CALLCONV operator+=
+(
+    XMVECTOR& V1,
+    FXMVECTOR V2
+)
+{
+    V1 = XMVectorAdd(V1, V2);
+    return V1;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR& XM_CALLCONV operator-=
+(
+    XMVECTOR& V1,
+    FXMVECTOR V2
+)
+{
+    V1 = XMVectorSubtract(V1, V2);
+    return V1;
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR& XM_CALLCONV operator*=
+(
+    XMVECTOR& V1,
+    FXMVECTOR V2
+)
+{
+    V1 = XMVectorMultiply(V1, V2);
+    return V1;
+}
+
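+// Example usage (illustrative sketch only): the overloads in this section let
+// XMVECTOR expressions read like scalar math.
+//
+//     XMVECTOR a = XMVectorSet(1.f, 2.f, 3.f, 4.f);
+//     XMVECTOR b = XMVectorSet(4.f, 3.f, 2.f, 1.f);
+//     XMVECTOR c = (a + b) * 0.5f;   // XMVectorAdd, then XMVectorScale
+//     c += b;                        // XMVectorAdd through operator+=
+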
+//------------------------------------------------------------------------------ + +inline XMVECTOR& XM_CALLCONV operator/= +( + XMVECTOR& V1, + FXMVECTOR V2 +) +{ + V1 = XMVectorDivide(V1,V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& operator*= +( + XMVECTOR& V, + const float S +) +{ + V = XMVectorScale(V, S); + return V; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& operator/= +( + XMVECTOR& V, + const float S +) +{ + XMVECTOR vS = XMVectorReplicate( S ); + V = XMVectorDivide(V, vS); + return V; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator+ +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + return XMVectorAdd(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator- +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + return XMVectorSubtract(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator* +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + return XMVectorMultiply(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator/ +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + return XMVectorDivide(V1,V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator* +( + FXMVECTOR V, + const float S +) +{ + return XMVectorScale(V, S); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator/ +( + FXMVECTOR V, + const float S +) +{ + XMVECTOR vS = XMVectorReplicate( S ); + return XMVectorDivide(V, vS); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XM_CALLCONV operator* +( + float S, + FXMVECTOR V +) +{ + return XMVectorScale(V, S); +} + +#if defined(_XM_NO_INTRINSICS_) +#undef XMISNAN +#undef XMISINF +#endif + +#if defined(_XM_SSE_INTRINSICS_) +#undef XM3UNPACK3INTO4 +#undef XM3PACK4INTO3 +#endif diff --git a/Inc/DirectXPackedVector.h b/Inc/DirectXPackedVector.h index 635dd8a..cc092fb 100644 --- a/Inc/DirectXPackedVector.h +++ b/Inc/DirectXPackedVector.h @@ -1,1003 +1,1003 @@ -//------------------------------------------------------------------------------------- -// DirectXPackedVector.h -- SIMD C++ Math library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#pragma once - -#include "DirectXMath.h" - -namespace DirectX -{ - -namespace PackedVector -{ - -#pragma warning(push) -#pragma warning(disable:4201 4365 4324) -// C4201: nonstandard extension used -// C4365: Off by default noise -// C4324: alignment padding warnings - -//------------------------------------------------------------------------------ -// ARGB Color; 8-8-8-8 bit unsigned normalized integer components packed into -// a 32 bit integer. 
The normalized color is packed into 32 bits using 8 bit -// unsigned, normalized integers for the alpha, red, green, and blue components. -// The alpha component is stored in the most significant bits and the blue -// component in the least significant bits (A8R8G8B8): -// [32] aaaaaaaa rrrrrrrr gggggggg bbbbbbbb [0] -struct XMCOLOR -{ - union - { - struct - { - uint8_t b; // Blue: 0/255 to 255/255 - uint8_t g; // Green: 0/255 to 255/255 - uint8_t r; // Red: 0/255 to 255/255 - uint8_t a; // Alpha: 0/255 to 255/255 - }; - uint32_t c; - }; - - XMCOLOR() XM_CTOR_DEFAULT - XM_CONSTEXPR XMCOLOR(uint32_t Color) : c(Color) {} - XMCOLOR(float _r, float _g, float _b, float _a); - explicit XMCOLOR(_In_reads_(4) const float *pArray); - - operator uint32_t () const { return c; } - - XMCOLOR& operator= (const XMCOLOR& Color) { c = Color.c; return *this; } - XMCOLOR& operator= (const uint32_t Color) { c = Color; return *this; } -}; - -//------------------------------------------------------------------------------ -// 16 bit floating point number consisting of a sign bit, a 5 bit biased -// exponent, and a 10 bit mantissa -typedef uint16_t HALF; - -//------------------------------------------------------------------------------ -// 2D Vector; 16 bit floating point components -struct XMHALF2 -{ - union - { - struct - { - HALF x; - HALF y; - }; - uint32_t v; - }; - - XMHALF2() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMHALF2(uint32_t Packed) : v(Packed) {} - XM_CONSTEXPR XMHALF2(HALF _x, HALF _y) : x(_x), y(_y) {} - explicit XMHALF2(_In_reads_(2) const HALF *pArray) : x(pArray[0]), y(pArray[1]) {} - XMHALF2(float _x, float _y); - explicit XMHALF2(_In_reads_(2) const float *pArray); - - XMHALF2& operator= (const XMHALF2& Half2) { x = Half2.x; y = Half2.y; return *this; } - XMHALF2& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 2D Vector; 16 bit signed normalized integer components -struct XMSHORTN2 -{ - union - { - struct - { - int16_t x; - int16_t y; - }; - uint32_t v; - }; - - XMSHORTN2() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMSHORTN2(uint32_t Packed) : v(Packed) {} - XM_CONSTEXPR XMSHORTN2(int16_t _x, int16_t _y) : x(_x), y(_y) {} - explicit XMSHORTN2(_In_reads_(2) const int16_t *pArray) : x(pArray[0]), y(pArray[1]) {} - XMSHORTN2(float _x, float _y); - explicit XMSHORTN2(_In_reads_(2) const float *pArray); - - XMSHORTN2& operator= (const XMSHORTN2& ShortN2) { x = ShortN2.x; y = ShortN2.y; return *this; } - XMSHORTN2& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 2D Vector; 16 bit signed integer components -struct XMSHORT2 -{ - union - { - struct - { - int16_t x; - int16_t y; - }; - uint32_t v; - }; - - XMSHORT2() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMSHORT2(uint32_t Packed) : v(Packed) {} - XM_CONSTEXPR XMSHORT2(int16_t _x, int16_t _y) : x(_x), y(_y) {} - explicit XMSHORT2(_In_reads_(2) const int16_t *pArray) : x(pArray[0]), y(pArray[1]) {} - XMSHORT2(float _x, float _y); - explicit XMSHORT2(_In_reads_(2) const float *pArray); - - XMSHORT2& operator= (const XMSHORT2& Short2) { x = Short2.x; y = Short2.y; return *this; } - XMSHORT2& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 2D Vector; 16 bit unsigned normalized integer components -struct XMUSHORTN2 -{ - union - { - struct - { - uint16_t x; - uint16_t y; - }; - uint32_t v; - }; - - XMUSHORTN2() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMUSHORTN2(uint32_t Packed) : v(Packed) {} - XM_CONSTEXPR 
XMUSHORTN2(uint16_t _x, uint16_t _y) : x(_x), y(_y) {} - explicit XMUSHORTN2(_In_reads_(2) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]) {} - XMUSHORTN2(float _x, float _y); - explicit XMUSHORTN2(_In_reads_(2) const float *pArray); - - XMUSHORTN2& operator= (const XMUSHORTN2& UShortN2) { x = UShortN2.x; y = UShortN2.y; return *this; } - XMUSHORTN2& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 2D Vector; 16 bit unsigned integer components -struct XMUSHORT2 -{ - union - { - struct - { - uint16_t x; - uint16_t y; - }; - uint32_t v; - }; - - XMUSHORT2() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMUSHORT2(uint32_t Packed) : v(Packed) {} - XM_CONSTEXPR XMUSHORT2(uint16_t _x, uint16_t _y) : x(_x), y(_y) {} - explicit XMUSHORT2(_In_reads_(2) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]) {} - XMUSHORT2(float _x, float _y); - explicit XMUSHORT2(_In_reads_(2) const float *pArray); - - XMUSHORT2& operator= (const XMUSHORT2& UShort2) { x = UShort2.x; y = UShort2.y; return *this; } - XMUSHORT2& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 2D Vector; 8 bit signed normalized integer components -struct XMBYTEN2 -{ - union - { - struct - { - int8_t x; - int8_t y; - }; - uint16_t v; - }; - - XMBYTEN2() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMBYTEN2(uint16_t Packed) : v(Packed) {} - XM_CONSTEXPR XMBYTEN2(int8_t _x, int8_t _y) : x(_x), y(_y) {} - explicit XMBYTEN2(_In_reads_(2) const int8_t *pArray) : x(pArray[0]), y(pArray[1]) {} - XMBYTEN2(float _x, float _y); - explicit XMBYTEN2(_In_reads_(2) const float *pArray); - - XMBYTEN2& operator= (const XMBYTEN2& ByteN2) { x = ByteN2.x; y = ByteN2.y; return *this; } - XMBYTEN2& operator= (uint16_t Packed) { v = Packed; return *this; } -}; - -// 2D Vector; 8 bit signed integer components -struct XMBYTE2 -{ - union - { - struct - { - int8_t x; - int8_t y; - }; - uint16_t v; - }; - - XMBYTE2() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMBYTE2(uint16_t Packed) : v(Packed) {} - XM_CONSTEXPR XMBYTE2(int8_t _x, int8_t _y) : x(_x), y(_y) {} - explicit XMBYTE2(_In_reads_(2) const int8_t *pArray) : x(pArray[0]), y(pArray[1]) {} - XMBYTE2(float _x, float _y); - explicit XMBYTE2(_In_reads_(2) const float *pArray); - - XMBYTE2& operator= (const XMBYTE2& Byte2) { x = Byte2.x; y = Byte2.y; return *this; } - XMBYTE2& operator= (uint16_t Packed) { v = Packed; return *this; } -}; - -// 2D Vector; 8 bit unsigned normalized integer components -struct XMUBYTEN2 -{ - union - { - struct - { - uint8_t x; - uint8_t y; - }; - uint16_t v; - }; - - XMUBYTEN2() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMUBYTEN2(uint16_t Packed) : v(Packed) {} - XM_CONSTEXPR XMUBYTEN2(uint8_t _x, uint8_t _y) : x(_x), y(_y) {} - explicit XMUBYTEN2(_In_reads_(2) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]) {} - XMUBYTEN2(float _x, float _y); - explicit XMUBYTEN2(_In_reads_(2) const float *pArray); - - XMUBYTEN2& operator= (const XMUBYTEN2& UByteN2) { x = UByteN2.x; y = UByteN2.y; return *this; } - XMUBYTEN2& operator= (uint16_t Packed) { v = Packed; return *this; } -}; - -// 2D Vector; 8 bit unsigned integer components -struct XMUBYTE2 -{ - union - { - struct - { - uint8_t x; - uint8_t y; - }; - uint16_t v; - }; - - XMUBYTE2() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMUBYTE2(uint16_t Packed) : v(Packed) {} - XM_CONSTEXPR XMUBYTE2(uint8_t _x, uint8_t _y) : x(_x), y(_y) {} - explicit XMUBYTE2(_In_reads_(2) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]) {} - 
XMUBYTE2(float _x, float _y); - explicit XMUBYTE2(_In_reads_(2) const float *pArray); - - XMUBYTE2& operator= (const XMUBYTE2& UByte2) { x = UByte2.x; y = UByte2.y; return *this; } - XMUBYTE2& operator= (uint16_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 3D vector: 5/6/5 unsigned integer components -struct XMU565 -{ - union - { - struct - { - uint16_t x : 5; // 0 to 31 - uint16_t y : 6; // 0 to 63 - uint16_t z : 5; // 0 to 31 - }; - uint16_t v; - }; - - XMU565() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMU565(uint16_t Packed) : v(Packed) {} - XM_CONSTEXPR XMU565(uint8_t _x, uint8_t _y, uint8_t _z) : x(_x), y(_y), z(_z) {} - explicit XMU565(_In_reads_(3) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} - XMU565(float _x, float _y, float _z); - explicit XMU565(_In_reads_(3) const float *pArray); - - operator uint16_t () const { return v; } - - XMU565& operator= (const XMU565& U565) { v = U565.v; return *this; } - XMU565& operator= (uint16_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 3D vector: 11/11/10 floating-point components -// The 3D vector is packed into 32 bits as follows: a 5-bit biased exponent -// and 6-bit mantissa for x component, a 5-bit biased exponent and -// 6-bit mantissa for y component, a 5-bit biased exponent and a 5-bit -// mantissa for z. The z component is stored in the most significant bits -// and the x component in the least significant bits. No sign bits so -// all partial-precision numbers are positive. -// (Z10Y11X11): [32] ZZZZZzzz zzzYYYYY yyyyyyXX XXXxxxxx [0] -struct XMFLOAT3PK -{ - union - { - struct - { - uint32_t xm : 6; // x-mantissa - uint32_t xe : 5; // x-exponent - uint32_t ym : 6; // y-mantissa - uint32_t ye : 5; // y-exponent - uint32_t zm : 5; // z-mantissa - uint32_t ze : 5; // z-exponent - }; - uint32_t v; - }; - - XMFLOAT3PK() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMFLOAT3PK(uint32_t Packed) : v(Packed) {} - XMFLOAT3PK(float _x, float _y, float _z); - explicit XMFLOAT3PK(_In_reads_(3) const float *pArray); - - operator uint32_t () const { return v; } - - XMFLOAT3PK& operator= (const XMFLOAT3PK& float3pk) { v = float3pk.v; return *this; } - XMFLOAT3PK& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 3D vector: 9/9/9 floating-point components with shared 5-bit exponent -// The 3D vector is packed into 32 bits as follows: a 5-bit biased exponent -// with 9-bit mantissa for the x, y, and z component. The shared exponent -// is stored in the most significant bits and the x component mantissa is in -// the least significant bits. No sign bits so all partial-precision numbers -// are positive. 
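The 11/11/10 format defined above trades precision for storage: each component keeps only a 5- or 6-bit mantissa and there is no sign bit. A small sketch of the lossy round trip (function name and values are illustrative only):

#include "DirectXPackedVector.h"
using namespace DirectX;
using namespace DirectX::PackedVector;

XMVECTOR Float3PKRoundTrip()
{
    XMFLOAT3PK pk;
    XMStoreFloat3PK(&pk, XMVectorSet(1.5f, 0.25f, 8.0f, 0.f)); // w is not stored
    return XMLoadFloat3PK(&pk); // inputs needing more mantissa bits come back rounded
}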
-// (E5Z9Y9X9): [32] EEEEEzzz zzzzzzyy yyyyyyyx xxxxxxxx [0] -struct XMFLOAT3SE -{ - union - { - struct - { - uint32_t xm : 9; // x-mantissa - uint32_t ym : 9; // y-mantissa - uint32_t zm : 9; // z-mantissa - uint32_t e : 5; // shared exponent - }; - uint32_t v; - }; - - XMFLOAT3SE() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMFLOAT3SE(uint32_t Packed) : v(Packed) {} - XMFLOAT3SE(float _x, float _y, float _z); - explicit XMFLOAT3SE(_In_reads_(3) const float *pArray); - - operator uint32_t () const { return v; } - - XMFLOAT3SE& operator= (const XMFLOAT3SE& float3se) { v = float3se.v; return *this; } - XMFLOAT3SE& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 4D Vector; 16 bit floating point components -struct XMHALF4 -{ - union - { - struct - { - HALF x; - HALF y; - HALF z; - HALF w; - }; - uint64_t v; - }; - - XMHALF4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMHALF4(uint64_t Packed) : v(Packed) {} - XM_CONSTEXPR XMHALF4(HALF _x, HALF _y, HALF _z, HALF _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XMHALF4(_In_reads_(4) const HALF *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - XMHALF4(float _x, float _y, float _z, float _w); - explicit XMHALF4(_In_reads_(4) const float *pArray); - - XMHALF4& operator= (const XMHALF4& Half4) { x = Half4.x; y = Half4.y; z = Half4.z; w = Half4.w; return *this; } - XMHALF4& operator= (uint64_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 4D Vector; 16 bit signed normalized integer components -struct XMSHORTN4 -{ - union - { - struct - { - int16_t x; - int16_t y; - int16_t z; - int16_t w; - }; - uint64_t v; - }; - - XMSHORTN4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMSHORTN4(uint64_t Packed) : v(Packed) {} - XM_CONSTEXPR XMSHORTN4(int16_t _x, int16_t _y, int16_t _z, int16_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XMSHORTN4(_In_reads_(4) const int16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - XMSHORTN4(float _x, float _y, float _z, float _w); - explicit XMSHORTN4(_In_reads_(4) const float *pArray); - - XMSHORTN4& operator= (const XMSHORTN4& ShortN4) { x = ShortN4.x; y = ShortN4.y; z = ShortN4.z; w = ShortN4.w; return *this; } - XMSHORTN4& operator= (uint64_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 16 bit signed integer components -struct XMSHORT4 -{ - union - { - struct - { - int16_t x; - int16_t y; - int16_t z; - int16_t w; - }; - uint64_t v; - }; - - XMSHORT4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMSHORT4(uint64_t Packed) : v(Packed) {} - XM_CONSTEXPR XMSHORT4(int16_t _x, int16_t _y, int16_t _z, int16_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XMSHORT4(_In_reads_(4) const int16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - XMSHORT4(float _x, float _y, float _z, float _w); - explicit XMSHORT4(_In_reads_(4) const float *pArray); - - XMSHORT4& operator= (const XMSHORT4& Short4) { x = Short4.x; y = Short4.y; z = Short4.z; w = Short4.w; return *this; } - XMSHORT4& operator= (uint64_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 16 bit unsigned normalized integer components -struct XMUSHORTN4 -{ - union - { - struct - { - uint16_t x; - uint16_t y; - uint16_t z; - uint16_t w; - }; - uint64_t v; - }; - - XMUSHORTN4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMUSHORTN4(uint64_t Packed) : v(Packed) {} - XM_CONSTEXPR XMUSHORTN4(uint16_t _x, uint16_t _y, 
uint16_t _z, uint16_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XMUSHORTN4(_In_reads_(4) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - XMUSHORTN4(float _x, float _y, float _z, float _w); - explicit XMUSHORTN4(_In_reads_(4) const float *pArray); - - XMUSHORTN4& operator= (const XMUSHORTN4& UShortN4) { x = UShortN4.x; y = UShortN4.y; z = UShortN4.z; w = UShortN4.w; return *this; } - XMUSHORTN4& operator= (uint64_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 16 bit unsigned integer components -struct XMUSHORT4 -{ - union - { - struct - { - uint16_t x; - uint16_t y; - uint16_t z; - uint16_t w; - }; - uint64_t v; - }; - - XMUSHORT4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMUSHORT4(uint64_t Packed) : v(Packed) {} - XM_CONSTEXPR XMUSHORT4(uint16_t _x, uint16_t _y, uint16_t _z, uint16_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XMUSHORT4(_In_reads_(4) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - XMUSHORT4(float _x, float _y, float _z, float _w); - explicit XMUSHORT4(_In_reads_(4) const float *pArray); - - XMUSHORT4& operator= (const XMUSHORT4& UShort4) { x = UShort4.x; y = UShort4.y; z = UShort4.z; w = UShort4.w; return *this; } - XMUSHORT4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer -// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, -// normalized integer for the w component and 10 bit signed, normalized -// integers for the z, y, and x components. The w component is stored in the -// most significant bits and the x component in the least significant bits -// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] -struct XMXDECN4 -{ - union - { - struct - { - int32_t x : 10; // -511/511 to 511/511 - int32_t y : 10; // -511/511 to 511/511 - int32_t z : 10; // -511/511 to 511/511 - uint32_t w : 2; // 0/3 to 3/3 - }; - uint32_t v; - }; - - XMXDECN4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMXDECN4(uint32_t Packed) : v(Packed) {} - XMXDECN4(float _x, float _y, float _z, float _w); - explicit XMXDECN4(_In_reads_(4) const float *pArray); - - operator uint32_t () const { return v; } - - XMXDECN4& operator= (const XMXDECN4& XDecN4) { v = XDecN4.v; return *this; } - XMXDECN4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer -// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned -// integer for the w component and 10 bit signed integers for the -// z, y, and x components. 
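XMXDECN4 above (10-bit signed-normalized x, y, z plus a 2-bit unsigned w) is a common choice for quantized tangent-space data; a sketch of packing a unit vector and a flag through the helpers declared below (function name and values are illustrative only):

#include "DirectXPackedVector.h"
using namespace DirectX;
using namespace DirectX::PackedVector;

XMVECTOR XDecN4RoundTrip()
{
    XMXDECN4 n;
    // xyz in [-1,1] quantize to the 10-bit SNORM fields; w in [0,1] to the 2-bit field.
    XMStoreXDecN4(&n, XMVectorSet(0.f, 0.707f, 0.707f, 1.f));
    return XMLoadXDecN4(&n);
}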
The w component is stored in the -// most significant bits and the x component in the least significant bits -// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] -struct XM_DEPRECATED XMXDEC4 -{ - union - { - struct - { - int32_t x : 10; // -511 to 511 - int32_t y : 10; // -511 to 511 - int32_t z : 10; // -511 to 511 - uint32_t w : 2; // 0 to 3 - }; - uint32_t v; - }; - - XMXDEC4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMXDEC4(uint32_t Packed) : v(Packed) {} - XMXDEC4(float _x, float _y, float _z, float _w); - explicit XMXDEC4(_In_reads_(4) const float *pArray); - - operator uint32_t () const { return v; } - - XMXDEC4& operator= (const XMXDEC4& XDec4) { v = XDec4.v; return *this; } - XMXDEC4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer -// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit signed, -// normalized integer for the w component and 10 bit signed, normalized -// integers for the z, y, and x components. The w component is stored in the -// most significant bits and the x component in the least significant bits -// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] -struct XM_DEPRECATED XMDECN4 -{ - union - { - struct - { - int32_t x : 10; // -511/511 to 511/511 - int32_t y : 10; // -511/511 to 511/511 - int32_t z : 10; // -511/511 to 511/511 - int32_t w : 2; // -1/1 to 1/1 - }; - uint32_t v; - }; - - XMDECN4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMDECN4(uint32_t Packed) : v(Packed) {} - XMDECN4(float _x, float _y, float _z, float _w); - explicit XMDECN4(_In_reads_(4) const float *pArray); - - operator uint32_t () const { return v; } - - XMDECN4& operator= (const XMDECN4& DecN4) { v = DecN4.v; return *this; } - XMDECN4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer -// The 4D Vector is packed into 32 bits as follows: a 2 bit signed, -// integer for the w component and 10 bit signed integers for the -// z, y, and x components. The w component is stored in the -// most significant bits and the x component in the least significant bits -// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] -struct XM_DEPRECATED XMDEC4 -{ - union - { - struct - { - int32_t x : 10; // -511 to 511 - int32_t y : 10; // -511 to 511 - int32_t z : 10; // -511 to 511 - int32_t w : 2; // -1 to 1 - }; - uint32_t v; - }; - - XMDEC4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMDEC4(uint32_t Packed) : v(Packed) {} - XMDEC4(float _x, float _y, float _z, float _w); - explicit XMDEC4(_In_reads_(4) const float *pArray); - - operator uint32_t () const { return v; } - - XMDEC4& operator= (const XMDEC4& Dec4) { v = Dec4.v; return *this; } - XMDEC4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer -// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, -// normalized integer for the w component and 10 bit unsigned, normalized -// integers for the z, y, and x components. 
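The XM_DEPRECATED 10:10:10:2 signed types are retained only for legacy vertex data; they appear to have no DXGI format equivalent, which is why their load/store declarations below sit inside C4996 suppression. Code that still has to read such data can borrow the same pragma pattern (sketch; the function name and packed value are illustrative only):

#include "DirectXPackedVector.h"
using namespace DirectX;
using namespace DirectX::PackedVector;

XMVECTOR ReadLegacyDecN4()
{
#pragma warning(push)
#pragma warning(disable : 4996) // deliberately consuming a deprecated type
    XMDECN4 legacy(0x3FFFFFFFu);
    XMVECTOR v = XMLoadDecN4(&legacy);
#pragma warning(pop)
    return v;
}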
The w component is stored in the -// most significant bits and the x component in the least significant bits -// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] -struct XMUDECN4 -{ - union - { - struct - { - uint32_t x : 10; // 0/1023 to 1023/1023 - uint32_t y : 10; // 0/1023 to 1023/1023 - uint32_t z : 10; // 0/1023 to 1023/1023 - uint32_t w : 2; // 0/3 to 3/3 - }; - uint32_t v; - }; - - XMUDECN4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMUDECN4(uint32_t Packed) : v(Packed) {} - XMUDECN4(float _x, float _y, float _z, float _w); - explicit XMUDECN4(_In_reads_(4) const float *pArray); - - operator uint32_t () const { return v; } - - XMUDECN4& operator= (const XMUDECN4& UDecN4) { v = UDecN4.v; return *this; } - XMUDECN4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer -// The 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, -// integer for the w component and 10 bit unsigned integers -// for the z, y, and x components. The w component is stored in the -// most significant bits and the x component in the least significant bits -// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] -struct XMUDEC4 -{ - union - { - struct - { - uint32_t x : 10; // 0 to 1023 - uint32_t y : 10; // 0 to 1023 - uint32_t z : 10; // 0 to 1023 - uint32_t w : 2; // 0 to 3 - }; - uint32_t v; - }; - - XMUDEC4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMUDEC4(uint32_t Packed) : v(Packed) {} - XMUDEC4(float _x, float _y, float _z, float _w); - explicit XMUDEC4(_In_reads_(4) const float *pArray); - - operator uint32_t () const { return v; } - - XMUDEC4& operator= (const XMUDEC4& UDec4) { v = UDec4.v; return *this; } - XMUDEC4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 4D Vector; 8 bit signed normalized integer components -struct XMBYTEN4 -{ - union - { - struct - { - int8_t x; - int8_t y; - int8_t z; - int8_t w; - }; - uint32_t v; - }; - - XMBYTEN4() XM_CTOR_DEFAULT - XM_CONSTEXPR XMBYTEN4(int8_t _x, int8_t _y, int8_t _z, int8_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XM_CONSTEXPR XMBYTEN4(uint32_t Packed) : v(Packed) {} - explicit XMBYTEN4(_In_reads_(4) const int8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - XMBYTEN4(float _x, float _y, float _z, float _w); - explicit XMBYTEN4(_In_reads_(4) const float *pArray); - - XMBYTEN4& operator= (const XMBYTEN4& ByteN4) { x = ByteN4.x; y = ByteN4.y; z = ByteN4.z; w = ByteN4.w; return *this; } - XMBYTEN4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 8 bit signed integer components -struct XMBYTE4 -{ - union - { - struct - { - int8_t x; - int8_t y; - int8_t z; - int8_t w; - }; - uint32_t v; - }; - - XMBYTE4() XM_CTOR_DEFAULT - XM_CONSTEXPR XMBYTE4(int8_t _x, int8_t _y, int8_t _z, int8_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XM_CONSTEXPR XMBYTE4(uint32_t Packed) : v(Packed) {} - explicit XMBYTE4(_In_reads_(4) const int8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - XMBYTE4(float _x, float _y, float _z, float _w); - explicit XMBYTE4(_In_reads_(4) const float *pArray); - - XMBYTE4& operator= (const XMBYTE4& Byte4) { x = Byte4.x; y = Byte4.y; z = Byte4.z; w = Byte4.w; return *this; } - XMBYTE4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 8 bit unsigned normalized integer components -struct XMUBYTEN4 -{ - union - { - struct - 
{ - uint8_t x; - uint8_t y; - uint8_t z; - uint8_t w; - }; - uint32_t v; - }; - - XMUBYTEN4() XM_CTOR_DEFAULT - XM_CONSTEXPR XMUBYTEN4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XM_CONSTEXPR XMUBYTEN4(uint32_t Packed) : v(Packed) {} - explicit XMUBYTEN4(_In_reads_(4) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - XMUBYTEN4(float _x, float _y, float _z, float _w); - explicit XMUBYTEN4(_In_reads_(4) const float *pArray); - - XMUBYTEN4& operator= (const XMUBYTEN4& UByteN4) { x = UByteN4.x; y = UByteN4.y; z = UByteN4.z; w = UByteN4.w; return *this; } - XMUBYTEN4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -// 4D Vector; 8 bit unsigned integer components -struct XMUBYTE4 -{ - union - { - struct - { - uint8_t x; - uint8_t y; - uint8_t z; - uint8_t w; - }; - uint32_t v; - }; - - XMUBYTE4() XM_CTOR_DEFAULT - XM_CONSTEXPR XMUBYTE4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XM_CONSTEXPR XMUBYTE4(uint32_t Packed) : v(Packed) {} - explicit XMUBYTE4(_In_reads_(4) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - XMUBYTE4(float _x, float _y, float _z, float _w); - explicit XMUBYTE4(_In_reads_(4) const float *pArray); - - XMUBYTE4& operator= (const XMUBYTE4& UByte4) { x = UByte4.x; y = UByte4.y; z = UByte4.z; w = UByte4.w; return *this; } - XMUBYTE4& operator= (uint32_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 4D vector; 4 bit unsigned integer components -struct XMUNIBBLE4 -{ - union - { - struct - { - uint16_t x : 4; // 0 to 15 - uint16_t y : 4; // 0 to 15 - uint16_t z : 4; // 0 to 15 - uint16_t w : 4; // 0 to 15 - }; - uint16_t v; - }; - - XMUNIBBLE4() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMUNIBBLE4(uint16_t Packed) : v(Packed) {} - XM_CONSTEXPR XMUNIBBLE4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) : x(_x), y(_y), z(_z), w(_w) {} - explicit XMUNIBBLE4(_In_reads_(4) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} - XMUNIBBLE4(float _x, float _y, float _z, float _w); - explicit XMUNIBBLE4(_In_reads_(4) const float *pArray); - - operator uint16_t () const { return v; } - - XMUNIBBLE4& operator= (const XMUNIBBLE4& UNibble4) { v = UNibble4.v; return *this; } - XMUNIBBLE4& operator= (uint16_t Packed) { v = Packed; return *this; } -}; - -//------------------------------------------------------------------------------ -// 4D vector: 5/5/5/1 unsigned integer components -struct XMU555 -{ - union - { - struct - { - uint16_t x : 5; // 0 to 31 - uint16_t y : 5; // 0 to 31 - uint16_t z : 5; // 0 to 31 - uint16_t w : 1; // 0 or 1 - }; - uint16_t v; - }; - - XMU555() XM_CTOR_DEFAULT - explicit XM_CONSTEXPR XMU555(uint16_t Packed) : v(Packed) {} - XM_CONSTEXPR XMU555(uint8_t _x, uint8_t _y, uint8_t _z, bool _w) : x(_x), y(_y), z(_z), w(_w ? 0x1 : 0) {} - XMU555(_In_reads_(3) const uint8_t *pArray, _In_ bool _w) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(_w ? 
0x1 : 0) {} - XMU555(float _x, float _y, float _z, bool _w); - XMU555(_In_reads_(3) const float *pArray, _In_ bool _w); - - operator uint16_t () const { return v; } - - XMU555& operator= (const XMU555& U555) { v = U555.v; return *this; } - XMU555& operator= (uint16_t Packed) { v = Packed; return *this; } -}; - -#pragma warning(pop) - - -/**************************************************************************** - * - * Data conversion operations - * - ****************************************************************************/ - -float XMConvertHalfToFloat(HALF Value); -float* XMConvertHalfToFloatStream(_Out_writes_bytes_(sizeof(float)+OutputStride*(HalfCount-1)) float* pOutputStream, - _In_ size_t OutputStride, - _In_reads_bytes_(sizeof(HALF)+InputStride*(HalfCount-1)) const HALF* pInputStream, - _In_ size_t InputStride, _In_ size_t HalfCount); -HALF XMConvertFloatToHalf(float Value); -HALF* XMConvertFloatToHalfStream(_Out_writes_bytes_(sizeof(HALF)+OutputStride*(FloatCount-1)) HALF* pOutputStream, - _In_ size_t OutputStride, - _In_reads_bytes_(sizeof(float)+InputStride*(FloatCount-1)) const float* pInputStream, - _In_ size_t InputStride, _In_ size_t FloatCount); - -/**************************************************************************** - * - * Load operations - * - ****************************************************************************/ - -XMVECTOR XM_CALLCONV XMLoadColor(_In_ const XMCOLOR* pSource); - -XMVECTOR XM_CALLCONV XMLoadHalf2(_In_ const XMHALF2* pSource); -XMVECTOR XM_CALLCONV XMLoadShortN2(_In_ const XMSHORTN2* pSource); -XMVECTOR XM_CALLCONV XMLoadShort2(_In_ const XMSHORT2* pSource); -XMVECTOR XM_CALLCONV XMLoadUShortN2(_In_ const XMUSHORTN2* pSource); -XMVECTOR XM_CALLCONV XMLoadUShort2(_In_ const XMUSHORT2* pSource); -XMVECTOR XM_CALLCONV XMLoadByteN2(_In_ const XMBYTEN2* pSource); -XMVECTOR XM_CALLCONV XMLoadByte2(_In_ const XMBYTE2* pSource); -XMVECTOR XM_CALLCONV XMLoadUByteN2(_In_ const XMUBYTEN2* pSource); -XMVECTOR XM_CALLCONV XMLoadUByte2(_In_ const XMUBYTE2* pSource); - -XMVECTOR XM_CALLCONV XMLoadU565(_In_ const XMU565* pSource); -XMVECTOR XM_CALLCONV XMLoadFloat3PK(_In_ const XMFLOAT3PK* pSource); -XMVECTOR XM_CALLCONV XMLoadFloat3SE(_In_ const XMFLOAT3SE* pSource); - -XMVECTOR XM_CALLCONV XMLoadHalf4(_In_ const XMHALF4* pSource); -XMVECTOR XM_CALLCONV XMLoadShortN4(_In_ const XMSHORTN4* pSource); -XMVECTOR XM_CALLCONV XMLoadShort4(_In_ const XMSHORT4* pSource); -XMVECTOR XM_CALLCONV XMLoadUShortN4(_In_ const XMUSHORTN4* pSource); -XMVECTOR XM_CALLCONV XMLoadUShort4(_In_ const XMUSHORT4* pSource); -XMVECTOR XM_CALLCONV XMLoadXDecN4(_In_ const XMXDECN4* pSource); -XMVECTOR XM_CALLCONV XMLoadUDecN4(_In_ const XMUDECN4* pSource); -XMVECTOR XM_CALLCONV XMLoadUDecN4_XR(_In_ const XMUDECN4* pSource); -XMVECTOR XM_CALLCONV XMLoadUDec4(_In_ const XMUDEC4* pSource); -XMVECTOR XM_CALLCONV XMLoadByteN4(_In_ const XMBYTEN4* pSource); -XMVECTOR XM_CALLCONV XMLoadByte4(_In_ const XMBYTE4* pSource); -XMVECTOR XM_CALLCONV XMLoadUByteN4(_In_ const XMUBYTEN4* pSource); -XMVECTOR XM_CALLCONV XMLoadUByte4(_In_ const XMUBYTE4* pSource); -XMVECTOR XM_CALLCONV XMLoadUNibble4(_In_ const XMUNIBBLE4* pSource); -XMVECTOR XM_CALLCONV XMLoadU555(_In_ const XMU555* pSource); - -#pragma warning(push) -#pragma warning(disable : 4996) -// C4996: ignore deprecation warning - -XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadDecN4(_In_ const XMDECN4* pSource); -XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadDec4(_In_ const XMDEC4* pSource); -XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadXDec4(_In_ 
const XMXDEC4* pSource); -#pragma warning(pop) - -/**************************************************************************** - * - * Store operations - * - ****************************************************************************/ - -void XM_CALLCONV XMStoreColor(_Out_ XMCOLOR* pDestination, _In_ FXMVECTOR V); - -void XM_CALLCONV XMStoreHalf2(_Out_ XMHALF2* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreShortN2(_Out_ XMSHORTN2* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreShort2(_Out_ XMSHORT2* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUShortN2(_Out_ XMUSHORTN2* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUShort2(_Out_ XMUSHORT2* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreByteN2(_Out_ XMBYTEN2* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreByte2(_Out_ XMBYTE2* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUByteN2(_Out_ XMUBYTEN2* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUByte2(_Out_ XMUBYTE2* pDestination, _In_ FXMVECTOR V); - -void XM_CALLCONV XMStoreU565(_Out_ XMU565* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreFloat3PK(_Out_ XMFLOAT3PK* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreFloat3SE(_Out_ XMFLOAT3SE* pDestination, _In_ FXMVECTOR V); - -void XM_CALLCONV XMStoreHalf4(_Out_ XMHALF4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreShortN4(_Out_ XMSHORTN4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreShort4(_Out_ XMSHORT4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUShortN4(_Out_ XMUSHORTN4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUShort4(_Out_ XMUSHORT4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreXDecN4(_Out_ XMXDECN4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUDecN4(_Out_ XMUDECN4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUDecN4_XR(_Out_ XMUDECN4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUDec4(_Out_ XMUDEC4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreByteN4(_Out_ XMBYTEN4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreByte4(_Out_ XMBYTE4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUByteN4(_Out_ XMUBYTEN4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUByte4(_Out_ XMUBYTE4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreUNibble4(_Out_ XMUNIBBLE4* pDestination, _In_ FXMVECTOR V); -void XM_CALLCONV XMStoreU555(_Out_ XMU555* pDestination, _In_ FXMVECTOR V); - -#pragma warning(push) -#pragma warning(disable : 4996) -// C4996: ignore deprecation warning - -void XM_DEPRECATED XM_CALLCONV XMStoreDecN4(_Out_ XMDECN4* pDestination, _In_ FXMVECTOR V); -void XM_DEPRECATED XM_CALLCONV XMStoreDec4(_Out_ XMDEC4* pDestination, _In_ FXMVECTOR V); -void XM_DEPRECATED XM_CALLCONV XMStoreXDec4(_Out_ XMXDEC4* pDestination, _In_ FXMVECTOR V); -#pragma warning(pop) - -/**************************************************************************** - * - * Implementation - * - ****************************************************************************/ - -#pragma warning(push) -#pragma warning(disable:4068 4214 4204 4365 4616 6001 6101) -// C4068/4616: ignore unknown pragmas -// C4214/4204: nonstandard extension used -// C4365: Off by default noise -// C6001/6101: False positives - -#pragma prefast(push) -#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") - -#include "DirectXPackedVector.inl" - -#pragma prefast(pop) -#pragma warning(pop) - -}; // namespace PackedVector - -}; // 
namespace DirectX - +//------------------------------------------------------------------------------------- +// DirectXPackedVector.h -- SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/?LinkID=615560 +//------------------------------------------------------------------------------------- + +#pragma once + +#include "DirectXMath.h" + +namespace DirectX +{ + +namespace PackedVector +{ + +#pragma warning(push) +#pragma warning(disable:4201 4365 4324) +// C4201: nonstandard extension used +// C4365: Off by default noise +// C4324: alignment padding warnings + +//------------------------------------------------------------------------------ +// ARGB Color; 8-8-8-8 bit unsigned normalized integer components packed into +// a 32 bit integer. The normalized color is packed into 32 bits using 8 bit +// unsigned, normalized integers for the alpha, red, green, and blue components. +// The alpha component is stored in the most significant bits and the blue +// component in the least significant bits (A8R8G8B8): +// [32] aaaaaaaa rrrrrrrr gggggggg bbbbbbbb [0] +struct XMCOLOR +{ + union + { + struct + { + uint8_t b; // Blue: 0/255 to 255/255 + uint8_t g; // Green: 0/255 to 255/255 + uint8_t r; // Red: 0/255 to 255/255 + uint8_t a; // Alpha: 0/255 to 255/255 + }; + uint32_t c; + }; + + XMCOLOR() XM_CTOR_DEFAULT + XM_CONSTEXPR XMCOLOR(uint32_t Color) : c(Color) {} + XMCOLOR(float _r, float _g, float _b, float _a); + explicit XMCOLOR(_In_reads_(4) const float *pArray); + + operator uint32_t () const { return c; } + + XMCOLOR& operator= (const XMCOLOR& Color) { c = Color.c; return *this; } + XMCOLOR& operator= (const uint32_t Color) { c = Color; return *this; } +}; + +//------------------------------------------------------------------------------ +// 16 bit floating point number consisting of a sign bit, a 5 bit biased +// exponent, and a 10 bit mantissa +typedef uint16_t HALF; + +//------------------------------------------------------------------------------ +// 2D Vector; 16 bit floating point components +struct XMHALF2 +{ + union + { + struct + { + HALF x; + HALF y; + }; + uint32_t v; + }; + + XMHALF2() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMHALF2(uint32_t Packed) : v(Packed) {} + XM_CONSTEXPR XMHALF2(HALF _x, HALF _y) : x(_x), y(_y) {} + explicit XMHALF2(_In_reads_(2) const HALF *pArray) : x(pArray[0]), y(pArray[1]) {} + XMHALF2(float _x, float _y); + explicit XMHALF2(_In_reads_(2) const float *pArray); + + XMHALF2& operator= (const XMHALF2& Half2) { x = Half2.x; y = Half2.y; return *this; } + XMHALF2& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 2D Vector; 16 bit signed normalized integer components +struct XMSHORTN2 +{ + union + { + struct + { + int16_t x; + int16_t y; + }; + uint32_t v; + }; + + XMSHORTN2() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMSHORTN2(uint32_t Packed) : v(Packed) {} + XM_CONSTEXPR XMSHORTN2(int16_t _x, int16_t _y) : x(_x), y(_y) {} + explicit XMSHORTN2(_In_reads_(2) const int16_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMSHORTN2(float _x, float _y); + explicit XMSHORTN2(_In_reads_(2) const float *pArray); + + XMSHORTN2& operator= (const XMSHORTN2& ShortN2) 
{ x = ShortN2.x; y = ShortN2.y; return *this; } + XMSHORTN2& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 2D Vector; 16 bit signed integer components +struct XMSHORT2 +{ + union + { + struct + { + int16_t x; + int16_t y; + }; + uint32_t v; + }; + + XMSHORT2() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMSHORT2(uint32_t Packed) : v(Packed) {} + XM_CONSTEXPR XMSHORT2(int16_t _x, int16_t _y) : x(_x), y(_y) {} + explicit XMSHORT2(_In_reads_(2) const int16_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMSHORT2(float _x, float _y); + explicit XMSHORT2(_In_reads_(2) const float *pArray); + + XMSHORT2& operator= (const XMSHORT2& Short2) { x = Short2.x; y = Short2.y; return *this; } + XMSHORT2& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 2D Vector; 16 bit unsigned normalized integer components +struct XMUSHORTN2 +{ + union + { + struct + { + uint16_t x; + uint16_t y; + }; + uint32_t v; + }; + + XMUSHORTN2() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMUSHORTN2(uint32_t Packed) : v(Packed) {} + XM_CONSTEXPR XMUSHORTN2(uint16_t _x, uint16_t _y) : x(_x), y(_y) {} + explicit XMUSHORTN2(_In_reads_(2) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMUSHORTN2(float _x, float _y); + explicit XMUSHORTN2(_In_reads_(2) const float *pArray); + + XMUSHORTN2& operator= (const XMUSHORTN2& UShortN2) { x = UShortN2.x; y = UShortN2.y; return *this; } + XMUSHORTN2& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 2D Vector; 16 bit unsigned integer components +struct XMUSHORT2 +{ + union + { + struct + { + uint16_t x; + uint16_t y; + }; + uint32_t v; + }; + + XMUSHORT2() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMUSHORT2(uint32_t Packed) : v(Packed) {} + XM_CONSTEXPR XMUSHORT2(uint16_t _x, uint16_t _y) : x(_x), y(_y) {} + explicit XMUSHORT2(_In_reads_(2) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMUSHORT2(float _x, float _y); + explicit XMUSHORT2(_In_reads_(2) const float *pArray); + + XMUSHORT2& operator= (const XMUSHORT2& UShort2) { x = UShort2.x; y = UShort2.y; return *this; } + XMUSHORT2& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 2D Vector; 8 bit signed normalized integer components +struct XMBYTEN2 +{ + union + { + struct + { + int8_t x; + int8_t y; + }; + uint16_t v; + }; + + XMBYTEN2() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMBYTEN2(uint16_t Packed) : v(Packed) {} + XM_CONSTEXPR XMBYTEN2(int8_t _x, int8_t _y) : x(_x), y(_y) {} + explicit XMBYTEN2(_In_reads_(2) const int8_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMBYTEN2(float _x, float _y); + explicit XMBYTEN2(_In_reads_(2) const float *pArray); + + XMBYTEN2& operator= (const XMBYTEN2& ByteN2) { x = ByteN2.x; y = ByteN2.y; return *this; } + XMBYTEN2& operator= (uint16_t Packed) { v = Packed; return *this; } +}; + +// 2D Vector; 8 bit signed integer components +struct XMBYTE2 +{ + union + { + struct + { + int8_t x; + int8_t y; + }; + uint16_t v; + }; + + XMBYTE2() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMBYTE2(uint16_t Packed) : v(Packed) {} + XM_CONSTEXPR XMBYTE2(int8_t _x, int8_t _y) : x(_x), y(_y) {} + explicit XMBYTE2(_In_reads_(2) const int8_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMBYTE2(float _x, float _y); + explicit XMBYTE2(_In_reads_(2) const float *pArray); + + XMBYTE2& operator= (const XMBYTE2& Byte2) { x = Byte2.x; y = Byte2.y; return *this; } + XMBYTE2& operator= (uint16_t Packed) { v = Packed; return *this; } +}; + +// 2D Vector; 8 bit unsigned 
normalized integer components +struct XMUBYTEN2 +{ + union + { + struct + { + uint8_t x; + uint8_t y; + }; + uint16_t v; + }; + + XMUBYTEN2() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMUBYTEN2(uint16_t Packed) : v(Packed) {} + XM_CONSTEXPR XMUBYTEN2(uint8_t _x, uint8_t _y) : x(_x), y(_y) {} + explicit XMUBYTEN2(_In_reads_(2) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMUBYTEN2(float _x, float _y); + explicit XMUBYTEN2(_In_reads_(2) const float *pArray); + + XMUBYTEN2& operator= (const XMUBYTEN2& UByteN2) { x = UByteN2.x; y = UByteN2.y; return *this; } + XMUBYTEN2& operator= (uint16_t Packed) { v = Packed; return *this; } +}; + +// 2D Vector; 8 bit unsigned integer components +struct XMUBYTE2 +{ + union + { + struct + { + uint8_t x; + uint8_t y; + }; + uint16_t v; + }; + + XMUBYTE2() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMUBYTE2(uint16_t Packed) : v(Packed) {} + XM_CONSTEXPR XMUBYTE2(uint8_t _x, uint8_t _y) : x(_x), y(_y) {} + explicit XMUBYTE2(_In_reads_(2) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMUBYTE2(float _x, float _y); + explicit XMUBYTE2(_In_reads_(2) const float *pArray); + + XMUBYTE2& operator= (const XMUBYTE2& UByte2) { x = UByte2.x; y = UByte2.y; return *this; } + XMUBYTE2& operator= (uint16_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 3D vector: 5/6/5 unsigned integer components +struct XMU565 +{ + union + { + struct + { + uint16_t x : 5; // 0 to 31 + uint16_t y : 6; // 0 to 63 + uint16_t z : 5; // 0 to 31 + }; + uint16_t v; + }; + + XMU565() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMU565(uint16_t Packed) : v(Packed) {} + XM_CONSTEXPR XMU565(uint8_t _x, uint8_t _y, uint8_t _z) : x(_x), y(_y), z(_z) {} + explicit XMU565(_In_reads_(3) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + XMU565(float _x, float _y, float _z); + explicit XMU565(_In_reads_(3) const float *pArray); + + operator uint16_t () const { return v; } + + XMU565& operator= (const XMU565& U565) { v = U565.v; return *this; } + XMU565& operator= (uint16_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 3D vector: 11/11/10 floating-point components +// The 3D vector is packed into 32 bits as follows: a 5-bit biased exponent +// and 6-bit mantissa for x component, a 5-bit biased exponent and +// 6-bit mantissa for y component, a 5-bit biased exponent and a 5-bit +// mantissa for z. The z component is stored in the most significant bits +// and the x component in the least significant bits. No sign bits so +// all partial-precision numbers are positive. 
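With MSVC's low-to-high bit-field allocation (which this header relies on), XMU565 places x in the least significant five bits and z in the top five, matching a 16-bit 5:6:5 color word. A quick check with illustrative values:

#include "DirectXPackedVector.h"
#include <cassert>
using namespace DirectX::PackedVector;

void U565Layout()
{
    XMU565 c(31, 63, 31);  // x:5 = 0x1F, y:6 = 0x3F, z:5 = 0x1F
    assert(c.v == 0xFFFF); // 0x1F | (0x3F << 5) | (0x1F << 11)
}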
+// (Z10Y11X11): [32] ZZZZZzzz zzzYYYYY yyyyyyXX XXXxxxxx [0] +struct XMFLOAT3PK +{ + union + { + struct + { + uint32_t xm : 6; // x-mantissa + uint32_t xe : 5; // x-exponent + uint32_t ym : 6; // y-mantissa + uint32_t ye : 5; // y-exponent + uint32_t zm : 5; // z-mantissa + uint32_t ze : 5; // z-exponent + }; + uint32_t v; + }; + + XMFLOAT3PK() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMFLOAT3PK(uint32_t Packed) : v(Packed) {} + XMFLOAT3PK(float _x, float _y, float _z); + explicit XMFLOAT3PK(_In_reads_(3) const float *pArray); + + operator uint32_t () const { return v; } + + XMFLOAT3PK& operator= (const XMFLOAT3PK& float3pk) { v = float3pk.v; return *this; } + XMFLOAT3PK& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 3D vector: 9/9/9 floating-point components with shared 5-bit exponent +// The 3D vector is packed into 32 bits as follows: a 5-bit biased exponent +// with 9-bit mantissa for the x, y, and z component. The shared exponent +// is stored in the most significant bits and the x component mantissa is in +// the least significant bits. No sign bits so all partial-precision numbers +// are positive. +// (E5Z9Y9X9): [32] EEEEEzzz zzzzzzyy yyyyyyyx xxxxxxxx [0] +struct XMFLOAT3SE +{ + union + { + struct + { + uint32_t xm : 9; // x-mantissa + uint32_t ym : 9; // y-mantissa + uint32_t zm : 9; // z-mantissa + uint32_t e : 5; // shared exponent + }; + uint32_t v; + }; + + XMFLOAT3SE() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMFLOAT3SE(uint32_t Packed) : v(Packed) {} + XMFLOAT3SE(float _x, float _y, float _z); + explicit XMFLOAT3SE(_In_reads_(3) const float *pArray); + + operator uint32_t () const { return v; } + + XMFLOAT3SE& operator= (const XMFLOAT3SE& float3se) { v = float3se.v; return *this; } + XMFLOAT3SE& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D Vector; 16 bit floating point components +struct XMHALF4 +{ + union + { + struct + { + HALF x; + HALF y; + HALF z; + HALF w; + }; + uint64_t v; + }; + + XMHALF4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMHALF4(uint64_t Packed) : v(Packed) {} + XM_CONSTEXPR XMHALF4(HALF _x, HALF _y, HALF _z, HALF _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMHALF4(_In_reads_(4) const HALF *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMHALF4(float _x, float _y, float _z, float _w); + explicit XMHALF4(_In_reads_(4) const float *pArray); + + XMHALF4& operator= (const XMHALF4& Half4) { x = Half4.x; y = Half4.y; z = Half4.z; w = Half4.w; return *this; } + XMHALF4& operator= (uint64_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D Vector; 16 bit signed normalized integer components +struct XMSHORTN4 +{ + union + { + struct + { + int16_t x; + int16_t y; + int16_t z; + int16_t w; + }; + uint64_t v; + }; + + XMSHORTN4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMSHORTN4(uint64_t Packed) : v(Packed) {} + XM_CONSTEXPR XMSHORTN4(int16_t _x, int16_t _y, int16_t _z, int16_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMSHORTN4(_In_reads_(4) const int16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMSHORTN4(float _x, float _y, float _z, float _w); + explicit XMSHORTN4(_In_reads_(4) const float *pArray); + + XMSHORTN4& operator= (const XMSHORTN4& ShortN4) { x = ShortN4.x; y = ShortN4.y; z = ShortN4.z; w = ShortN4.w; 
return *this; } + XMSHORTN4& operator= (uint64_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 16 bit signed integer components +struct XMSHORT4 +{ + union + { + struct + { + int16_t x; + int16_t y; + int16_t z; + int16_t w; + }; + uint64_t v; + }; + + XMSHORT4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMSHORT4(uint64_t Packed) : v(Packed) {} + XM_CONSTEXPR XMSHORT4(int16_t _x, int16_t _y, int16_t _z, int16_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMSHORT4(_In_reads_(4) const int16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMSHORT4(float _x, float _y, float _z, float _w); + explicit XMSHORT4(_In_reads_(4) const float *pArray); + + XMSHORT4& operator= (const XMSHORT4& Short4) { x = Short4.x; y = Short4.y; z = Short4.z; w = Short4.w; return *this; } + XMSHORT4& operator= (uint64_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 16 bit unsigned normalized integer components +struct XMUSHORTN4 +{ + union + { + struct + { + uint16_t x; + uint16_t y; + uint16_t z; + uint16_t w; + }; + uint64_t v; + }; + + XMUSHORTN4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMUSHORTN4(uint64_t Packed) : v(Packed) {} + XM_CONSTEXPR XMUSHORTN4(uint16_t _x, uint16_t _y, uint16_t _z, uint16_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUSHORTN4(_In_reads_(4) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUSHORTN4(float _x, float _y, float _z, float _w); + explicit XMUSHORTN4(_In_reads_(4) const float *pArray); + + XMUSHORTN4& operator= (const XMUSHORTN4& UShortN4) { x = UShortN4.x; y = UShortN4.y; z = UShortN4.z; w = UShortN4.w; return *this; } + XMUSHORTN4& operator= (uint64_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 16 bit unsigned integer components +struct XMUSHORT4 +{ + union + { + struct + { + uint16_t x; + uint16_t y; + uint16_t z; + uint16_t w; + }; + uint64_t v; + }; + + XMUSHORT4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMUSHORT4(uint64_t Packed) : v(Packed) {} + XM_CONSTEXPR XMUSHORT4(uint16_t _x, uint16_t _y, uint16_t _z, uint16_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUSHORT4(_In_reads_(4) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUSHORT4(float _x, float _y, float _z, float _w); + explicit XMUSHORT4(_In_reads_(4) const float *pArray); + + XMUSHORT4& operator= (const XMUSHORT4& UShort4) { x = UShort4.x; y = UShort4.y; z = UShort4.z; w = UShort4.w; return *this; } + XMUSHORT4& operator= (uint64_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer +// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, +// normalized integer for the w component and 10 bit signed, normalized +// integers for the z, y, and x components. 
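Like their 2D counterparts, these 16-bit 4D types round-trip through XMVECTOR with the loads and stores declared later in this header; for the UNORM flavor, [0,1] maps onto the full 16-bit range per lane (sketch; function name and values are illustrative only):

#include "DirectXPackedVector.h"
using namespace DirectX;
using namespace DirectX::PackedVector;

XMVECTOR UShortN4RoundTrip()
{
    XMUSHORTN4 q;
    XMStoreUShortN4(&q, XMVectorSet(0.f, 0.25f, 0.5f, 1.f));
    return XMLoadUShortN4(&q); // lanes come back quantized to 1/65535 steps
}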
The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +struct XMXDECN4 +{ + union + { + struct + { + int32_t x : 10; // -511/511 to 511/511 + int32_t y : 10; // -511/511 to 511/511 + int32_t z : 10; // -511/511 to 511/511 + uint32_t w : 2; // 0/3 to 3/3 + }; + uint32_t v; + }; + + XMXDECN4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMXDECN4(uint32_t Packed) : v(Packed) {} + XMXDECN4(float _x, float _y, float _z, float _w); + explicit XMXDECN4(_In_reads_(4) const float *pArray); + + operator uint32_t () const { return v; } + + XMXDECN4& operator= (const XMXDECN4& XDecN4) { v = XDecN4.v; return *this; } + XMXDECN4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer +// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned +// integer for the w component and 10 bit signed integers for the +// z, y, and x components. The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +struct XM_DEPRECATED XMXDEC4 +{ + union + { + struct + { + int32_t x : 10; // -511 to 511 + int32_t y : 10; // -511 to 511 + int32_t z : 10; // -511 to 511 + uint32_t w : 2; // 0 to 3 + }; + uint32_t v; + }; + + XMXDEC4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMXDEC4(uint32_t Packed) : v(Packed) {} + XMXDEC4(float _x, float _y, float _z, float _w); + explicit XMXDEC4(_In_reads_(4) const float *pArray); + + operator uint32_t () const { return v; } + + XMXDEC4& operator= (const XMXDEC4& XDec4) { v = XDec4.v; return *this; } + XMXDEC4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer +// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit signed, +// normalized integer for the w component and 10 bit signed, normalized +// integers for the z, y, and x components. The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +struct XM_DEPRECATED XMDECN4 +{ + union + { + struct + { + int32_t x : 10; // -511/511 to 511/511 + int32_t y : 10; // -511/511 to 511/511 + int32_t z : 10; // -511/511 to 511/511 + int32_t w : 2; // -1/1 to 1/1 + }; + uint32_t v; + }; + + XMDECN4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMDECN4(uint32_t Packed) : v(Packed) {} + XMDECN4(float _x, float _y, float _z, float _w); + explicit XMDECN4(_In_reads_(4) const float *pArray); + + operator uint32_t () const { return v; } + + XMDECN4& operator= (const XMDECN4& DecN4) { v = DecN4.v; return *this; } + XMDECN4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer +// The 4D Vector is packed into 32 bits as follows: a 2 bit signed, +// integer for the w component and 10 bit signed integers for the +// z, y, and x components. 
The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +struct XM_DEPRECATED XMDEC4 +{ + union + { + struct + { + int32_t x : 10; // -511 to 511 + int32_t y : 10; // -511 to 511 + int32_t z : 10; // -511 to 511 + int32_t w : 2; // -1 to 1 + }; + uint32_t v; + }; + + XMDEC4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMDEC4(uint32_t Packed) : v(Packed) {} + XMDEC4(float _x, float _y, float _z, float _w); + explicit XMDEC4(_In_reads_(4) const float *pArray); + + operator uint32_t () const { return v; } + + XMDEC4& operator= (const XMDEC4& Dec4) { v = Dec4.v; return *this; } + XMDEC4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer +// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, +// normalized integer for the w component and 10 bit unsigned, normalized +// integers for the z, y, and x components. The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +struct XMUDECN4 +{ + union + { + struct + { + uint32_t x : 10; // 0/1023 to 1023/1023 + uint32_t y : 10; // 0/1023 to 1023/1023 + uint32_t z : 10; // 0/1023 to 1023/1023 + uint32_t w : 2; // 0/3 to 3/3 + }; + uint32_t v; + }; + + XMUDECN4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMUDECN4(uint32_t Packed) : v(Packed) {} + XMUDECN4(float _x, float _y, float _z, float _w); + explicit XMUDECN4(_In_reads_(4) const float *pArray); + + operator uint32_t () const { return v; } + + XMUDECN4& operator= (const XMUDECN4& UDecN4) { v = UDecN4.v; return *this; } + XMUDECN4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer +// The 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, +// integer for the w component and 10 bit unsigned integers +// for the z, y, and x components. 
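For XMUDECN4's 10-bit UNORM fields the quantization is q = round(f * 1023), assuming round-to-nearest, so f = 0.5 stores 512 and reloads as 512/1023 ≈ 0.50049. A sketch with illustrative values (function name is hypothetical):

#include "DirectXPackedVector.h"
using namespace DirectX;
using namespace DirectX::PackedVector;

XMVECTOR UDecN4RoundTrip()
{
    XMUDECN4 q;
    XMStoreUDecN4(&q, XMVectorSet(0.5f, 0.f, 1.f, 1.f)); // w uses the 2-bit field
    return XMLoadUDecN4(&q); // x reloads as 512/1023, not exactly 0.5
}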
The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +struct XMUDEC4 +{ + union + { + struct + { + uint32_t x : 10; // 0 to 1023 + uint32_t y : 10; // 0 to 1023 + uint32_t z : 10; // 0 to 1023 + uint32_t w : 2; // 0 to 3 + }; + uint32_t v; + }; + + XMUDEC4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMUDEC4(uint32_t Packed) : v(Packed) {} + XMUDEC4(float _x, float _y, float _z, float _w); + explicit XMUDEC4(_In_reads_(4) const float *pArray); + + operator uint32_t () const { return v; } + + XMUDEC4& operator= (const XMUDEC4& UDec4) { v = UDec4.v; return *this; } + XMUDEC4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D Vector; 8 bit signed normalized integer components +struct XMBYTEN4 +{ + union + { + struct + { + int8_t x; + int8_t y; + int8_t z; + int8_t w; + }; + uint32_t v; + }; + + XMBYTEN4() XM_CTOR_DEFAULT + XM_CONSTEXPR XMBYTEN4(int8_t _x, int8_t _y, int8_t _z, int8_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XM_CONSTEXPR XMBYTEN4(uint32_t Packed) : v(Packed) {} + explicit XMBYTEN4(_In_reads_(4) const int8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMBYTEN4(float _x, float _y, float _z, float _w); + explicit XMBYTEN4(_In_reads_(4) const float *pArray); + + XMBYTEN4& operator= (const XMBYTEN4& ByteN4) { x = ByteN4.x; y = ByteN4.y; z = ByteN4.z; w = ByteN4.w; return *this; } + XMBYTEN4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 8 bit signed integer components +struct XMBYTE4 +{ + union + { + struct + { + int8_t x; + int8_t y; + int8_t z; + int8_t w; + }; + uint32_t v; + }; + + XMBYTE4() XM_CTOR_DEFAULT + XM_CONSTEXPR XMBYTE4(int8_t _x, int8_t _y, int8_t _z, int8_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XM_CONSTEXPR XMBYTE4(uint32_t Packed) : v(Packed) {} + explicit XMBYTE4(_In_reads_(4) const int8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMBYTE4(float _x, float _y, float _z, float _w); + explicit XMBYTE4(_In_reads_(4) const float *pArray); + + XMBYTE4& operator= (const XMBYTE4& Byte4) { x = Byte4.x; y = Byte4.y; z = Byte4.z; w = Byte4.w; return *this; } + XMBYTE4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 8 bit unsigned normalized integer components +struct XMUBYTEN4 +{ + union + { + struct + { + uint8_t x; + uint8_t y; + uint8_t z; + uint8_t w; + }; + uint32_t v; + }; + + XMUBYTEN4() XM_CTOR_DEFAULT + XM_CONSTEXPR XMUBYTEN4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XM_CONSTEXPR XMUBYTEN4(uint32_t Packed) : v(Packed) {} + explicit XMUBYTEN4(_In_reads_(4) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUBYTEN4(float _x, float _y, float _z, float _w); + explicit XMUBYTEN4(_In_reads_(4) const float *pArray); + + XMUBYTEN4& operator= (const XMUBYTEN4& UByteN4) { x = UByteN4.x; y = UByteN4.y; z = UByteN4.z; w = UByteN4.w; return *this; } + XMUBYTEN4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 8 bit unsigned integer components +struct XMUBYTE4 +{ + union + { + struct + { + uint8_t x; + uint8_t y; + uint8_t z; + uint8_t w; + }; + uint32_t v; + }; + + XMUBYTE4() XM_CTOR_DEFAULT + XM_CONSTEXPR XMUBYTE4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XM_CONSTEXPR 
XMUBYTE4(uint32_t Packed) : v(Packed) {} + explicit XMUBYTE4(_In_reads_(4) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUBYTE4(float _x, float _y, float _z, float _w); + explicit XMUBYTE4(_In_reads_(4) const float *pArray); + + XMUBYTE4& operator= (const XMUBYTE4& UByte4) { x = UByte4.x; y = UByte4.y; z = UByte4.z; w = UByte4.w; return *this; } + XMUBYTE4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D vector; 4 bit unsigned integer components +struct XMUNIBBLE4 +{ + union + { + struct + { + uint16_t x : 4; // 0 to 15 + uint16_t y : 4; // 0 to 15 + uint16_t z : 4; // 0 to 15 + uint16_t w : 4; // 0 to 15 + }; + uint16_t v; + }; + + XMUNIBBLE4() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMUNIBBLE4(uint16_t Packed) : v(Packed) {} + XM_CONSTEXPR XMUNIBBLE4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUNIBBLE4(_In_reads_(4) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUNIBBLE4(float _x, float _y, float _z, float _w); + explicit XMUNIBBLE4(_In_reads_(4) const float *pArray); + + operator uint16_t () const { return v; } + + XMUNIBBLE4& operator= (const XMUNIBBLE4& UNibble4) { v = UNibble4.v; return *this; } + XMUNIBBLE4& operator= (uint16_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D vector: 5/5/5/1 unsigned integer components +struct XMU555 +{ + union + { + struct + { + uint16_t x : 5; // 0 to 31 + uint16_t y : 5; // 0 to 31 + uint16_t z : 5; // 0 to 31 + uint16_t w : 1; // 0 or 1 + }; + uint16_t v; + }; + + XMU555() XM_CTOR_DEFAULT + explicit XM_CONSTEXPR XMU555(uint16_t Packed) : v(Packed) {} + XM_CONSTEXPR XMU555(uint8_t _x, uint8_t _y, uint8_t _z, bool _w) : x(_x), y(_y), z(_z), w(_w ? 0x1 : 0) {} + XMU555(_In_reads_(3) const uint8_t *pArray, _In_ bool _w) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(_w ? 
0x1 : 0) {} + XMU555(float _x, float _y, float _z, bool _w); + XMU555(_In_reads_(3) const float *pArray, _In_ bool _w); + + operator uint16_t () const { return v; } + + XMU555& operator= (const XMU555& U555) { v = U555.v; return *this; } + XMU555& operator= (uint16_t Packed) { v = Packed; return *this; } +}; + +#pragma warning(pop) + + +/**************************************************************************** + * + * Data conversion operations + * + ****************************************************************************/ + +float XMConvertHalfToFloat(HALF Value); +float* XMConvertHalfToFloatStream(_Out_writes_bytes_(sizeof(float)+OutputStride*(HalfCount-1)) float* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(HALF)+InputStride*(HalfCount-1)) const HALF* pInputStream, + _In_ size_t InputStride, _In_ size_t HalfCount); +HALF XMConvertFloatToHalf(float Value); +HALF* XMConvertFloatToHalfStream(_Out_writes_bytes_(sizeof(HALF)+OutputStride*(FloatCount-1)) HALF* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(float)+InputStride*(FloatCount-1)) const float* pInputStream, + _In_ size_t InputStride, _In_ size_t FloatCount); + +/**************************************************************************** + * + * Load operations + * + ****************************************************************************/ + +XMVECTOR XM_CALLCONV XMLoadColor(_In_ const XMCOLOR* pSource); + +XMVECTOR XM_CALLCONV XMLoadHalf2(_In_ const XMHALF2* pSource); +XMVECTOR XM_CALLCONV XMLoadShortN2(_In_ const XMSHORTN2* pSource); +XMVECTOR XM_CALLCONV XMLoadShort2(_In_ const XMSHORT2* pSource); +XMVECTOR XM_CALLCONV XMLoadUShortN2(_In_ const XMUSHORTN2* pSource); +XMVECTOR XM_CALLCONV XMLoadUShort2(_In_ const XMUSHORT2* pSource); +XMVECTOR XM_CALLCONV XMLoadByteN2(_In_ const XMBYTEN2* pSource); +XMVECTOR XM_CALLCONV XMLoadByte2(_In_ const XMBYTE2* pSource); +XMVECTOR XM_CALLCONV XMLoadUByteN2(_In_ const XMUBYTEN2* pSource); +XMVECTOR XM_CALLCONV XMLoadUByte2(_In_ const XMUBYTE2* pSource); + +XMVECTOR XM_CALLCONV XMLoadU565(_In_ const XMU565* pSource); +XMVECTOR XM_CALLCONV XMLoadFloat3PK(_In_ const XMFLOAT3PK* pSource); +XMVECTOR XM_CALLCONV XMLoadFloat3SE(_In_ const XMFLOAT3SE* pSource); + +XMVECTOR XM_CALLCONV XMLoadHalf4(_In_ const XMHALF4* pSource); +XMVECTOR XM_CALLCONV XMLoadShortN4(_In_ const XMSHORTN4* pSource); +XMVECTOR XM_CALLCONV XMLoadShort4(_In_ const XMSHORT4* pSource); +XMVECTOR XM_CALLCONV XMLoadUShortN4(_In_ const XMUSHORTN4* pSource); +XMVECTOR XM_CALLCONV XMLoadUShort4(_In_ const XMUSHORT4* pSource); +XMVECTOR XM_CALLCONV XMLoadXDecN4(_In_ const XMXDECN4* pSource); +XMVECTOR XM_CALLCONV XMLoadUDecN4(_In_ const XMUDECN4* pSource); +XMVECTOR XM_CALLCONV XMLoadUDecN4_XR(_In_ const XMUDECN4* pSource); +XMVECTOR XM_CALLCONV XMLoadUDec4(_In_ const XMUDEC4* pSource); +XMVECTOR XM_CALLCONV XMLoadByteN4(_In_ const XMBYTEN4* pSource); +XMVECTOR XM_CALLCONV XMLoadByte4(_In_ const XMBYTE4* pSource); +XMVECTOR XM_CALLCONV XMLoadUByteN4(_In_ const XMUBYTEN4* pSource); +XMVECTOR XM_CALLCONV XMLoadUByte4(_In_ const XMUBYTE4* pSource); +XMVECTOR XM_CALLCONV XMLoadUNibble4(_In_ const XMUNIBBLE4* pSource); +XMVECTOR XM_CALLCONV XMLoadU555(_In_ const XMU555* pSource); + +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning + +XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadDecN4(_In_ const XMDECN4* pSource); +XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadDec4(_In_ const XMDEC4* pSource); +XMVECTOR XM_DEPRECATED XM_CALLCONV XMLoadXDec4(_In_ 
const XMXDEC4* pSource); +#pragma warning(pop) + +/**************************************************************************** + * + * Store operations + * + ****************************************************************************/ + +void XM_CALLCONV XMStoreColor(_Out_ XMCOLOR* pDestination, _In_ FXMVECTOR V); + +void XM_CALLCONV XMStoreHalf2(_Out_ XMHALF2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreShortN2(_Out_ XMSHORTN2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreShort2(_Out_ XMSHORT2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUShortN2(_Out_ XMUSHORTN2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUShort2(_Out_ XMUSHORT2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreByteN2(_Out_ XMBYTEN2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreByte2(_Out_ XMBYTE2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUByteN2(_Out_ XMUBYTEN2* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUByte2(_Out_ XMUBYTE2* pDestination, _In_ FXMVECTOR V); + +void XM_CALLCONV XMStoreU565(_Out_ XMU565* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat3PK(_Out_ XMFLOAT3PK* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreFloat3SE(_Out_ XMFLOAT3SE* pDestination, _In_ FXMVECTOR V); + +void XM_CALLCONV XMStoreHalf4(_Out_ XMHALF4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreShortN4(_Out_ XMSHORTN4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreShort4(_Out_ XMSHORT4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUShortN4(_Out_ XMUSHORTN4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUShort4(_Out_ XMUSHORT4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreXDecN4(_Out_ XMXDECN4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUDecN4(_Out_ XMUDECN4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUDecN4_XR(_Out_ XMUDECN4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUDec4(_Out_ XMUDEC4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreByteN4(_Out_ XMBYTEN4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreByte4(_Out_ XMBYTE4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUByteN4(_Out_ XMUBYTEN4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUByte4(_Out_ XMUBYTE4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreUNibble4(_Out_ XMUNIBBLE4* pDestination, _In_ FXMVECTOR V); +void XM_CALLCONV XMStoreU555(_Out_ XMU555* pDestination, _In_ FXMVECTOR V); + +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning + +void XM_DEPRECATED XM_CALLCONV XMStoreDecN4(_Out_ XMDECN4* pDestination, _In_ FXMVECTOR V); +void XM_DEPRECATED XM_CALLCONV XMStoreDec4(_Out_ XMDEC4* pDestination, _In_ FXMVECTOR V); +void XM_DEPRECATED XM_CALLCONV XMStoreXDec4(_Out_ XMXDEC4* pDestination, _In_ FXMVECTOR V); +#pragma warning(pop) + +/**************************************************************************** + * + * Implementation + * + ****************************************************************************/ + +#pragma warning(push) +#pragma warning(disable:4068 4214 4204 4365 4616 6001 6101) +// C4068/4616: ignore unknown pragmas +// C4214/4204: nonstandard extension used +// C4365: Off by default noise +// C6001/6101: False positives + +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") + +#include "DirectXPackedVector.inl" + +#pragma prefast(pop) +#pragma warning(pop) + +}; // namespace PackedVector + +}; // 
namespace DirectX + diff --git a/Inc/DirectXPackedVector.inl b/Inc/DirectXPackedVector.inl index 4713db8..b60eafd 100644 --- a/Inc/DirectXPackedVector.inl +++ b/Inc/DirectXPackedVector.inl @@ -1,4368 +1,4368 @@ -//------------------------------------------------------------------------------------- -// DirectXPackedVector.inl -- SIMD C++ Math library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// http://go.microsoft.com/fwlink/?LinkID=615560 -//------------------------------------------------------------------------------------- - -#pragma once - -/**************************************************************************** - * - * Data conversion - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline float PackedVector::XMConvertHalfToFloat -( - HALF Value -) -{ -#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - __m128i V1 = _mm_cvtsi32_si128( static_cast(Value) ); - __m128 V2 = _mm_cvtph_ps( V1 ); - return _mm_cvtss_f32( V2 ); -#else - uint32_t Mantissa = (uint32_t)(Value & 0x03FF); - - uint32_t Exponent = (Value & 0x7C00); - if ( Exponent == 0x7C00 ) // INF/NAN - { - Exponent = (uint32_t)0x8f; - } - else if (Exponent != 0) // The value is normalized - { - Exponent = (uint32_t)((Value >> 10) & 0x1F); - } - else if (Mantissa != 0) // The value is denormalized - { - // Normalize the value in the resulting float - Exponent = 1; - - do - { - Exponent--; - Mantissa <<= 1; - } while ((Mantissa & 0x0400) == 0); - - Mantissa &= 0x03FF; - } - else // The value is zero - { - Exponent = (uint32_t)-112; - } - - uint32_t Result = ((Value & 0x8000) << 16) | // Sign - ((Exponent + 112) << 23) | // Exponent - (Mantissa << 13); // Mantissa - - return reinterpret_cast(&Result)[0]; -#endif // !_XM_F16C_INTRINSICS_ -} - -//------------------------------------------------------------------------------ -#pragma prefast(push) -#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" ) - -_Use_decl_annotations_ -inline float* PackedVector::XMConvertHalfToFloatStream -( - float* pOutputStream, - size_t OutputStride, - const HALF* pInputStream, - size_t InputStride, - size_t HalfCount -) -{ - assert(pOutputStream); - assert(pInputStream); - - assert(InputStride >= sizeof(HALF)); - _Analysis_assume_(InputStride >= sizeof(HALF)); - - assert(OutputStride >= sizeof(float)); - _Analysis_assume_(OutputStride >= sizeof(float)); - -#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - const uint8_t* pHalf = reinterpret_cast(pInputStream); - uint8_t* pFloat = reinterpret_cast(pOutputStream); - - size_t i = 0; - size_t four = HalfCount >> 2; - if ( four > 0 ) - { - if (InputStride == sizeof(HALF)) - { - if (OutputStride == sizeof(float)) - { - if ( ((uintptr_t)pFloat & 0xF) == 0) - { - // Packed input, aligned & packed output - for (size_t j = 0; j < four; ++j) - { - __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); - pHalf += InputStride*4; - - __m128 FV = _mm_cvtph_ps( HV ); - - XM_STREAM_PS( reinterpret_cast(pFloat), FV ); - pFloat += OutputStride*4; - i += 4; - } - } - else - { - // Packed input, packed output - for (size_t j = 0; j < four; ++j) - { - __m128i HV = _mm_loadl_epi64( 
reinterpret_cast(pHalf) ); - pHalf += InputStride*4; - - __m128 FV = _mm_cvtph_ps( HV ); - - _mm_storeu_ps( reinterpret_cast(pFloat), FV ); - pFloat += OutputStride*4; - i += 4; - } - } - } - else - { - // Packed input, scattered output - for (size_t j = 0; j < four; ++j) - { - __m128i HV = _mm_loadl_epi64( reinterpret_cast(pHalf) ); - pHalf += InputStride*4; - - __m128 FV = _mm_cvtph_ps( HV ); - - _mm_store_ss( reinterpret_cast(pFloat), FV ); - pFloat += OutputStride; - *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 1 ); - pFloat += OutputStride; - *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 2 ); - pFloat += OutputStride; - *reinterpret_cast(pFloat) = _mm_extract_ps( FV, 3 ); - pFloat += OutputStride; - i += 4; - } - } - } - else if (OutputStride == sizeof(float)) - { - if ( ((uintptr_t)pFloat & 0xF) == 0) - { - // Scattered input, aligned & packed output - for (size_t j = 0; j < four; ++j) - { - uint16_t H1 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H2 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H3 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H4 = *reinterpret_cast(pHalf); - pHalf += InputStride; - - __m128i HV = _mm_setzero_si128(); - HV = _mm_insert_epi16( HV, H1, 0 ); - HV = _mm_insert_epi16( HV, H2, 1 ); - HV = _mm_insert_epi16( HV, H3, 2 ); - HV = _mm_insert_epi16( HV, H4, 3 ); - __m128 FV = _mm_cvtph_ps( HV ); - - XM_STREAM_PS( reinterpret_cast(pFloat ), FV ); - pFloat += OutputStride*4; - i += 4; - } - } - else - { - // Scattered input, packed output - for (size_t j = 0; j < four; ++j) - { - uint16_t H1 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H2 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H3 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H4 = *reinterpret_cast(pHalf); - pHalf += InputStride; - - __m128i HV = _mm_setzero_si128(); - HV = _mm_insert_epi16( HV, H1, 0 ); - HV = _mm_insert_epi16( HV, H2, 1 ); - HV = _mm_insert_epi16( HV, H3, 2 ); - HV = _mm_insert_epi16( HV, H4, 3 ); - __m128 FV = _mm_cvtph_ps( HV ); - - _mm_storeu_ps( reinterpret_cast(pFloat ), FV ); - pFloat += OutputStride*4; - i += 4; - } - } - } - else - { - // Scattered input, scattered output - for (size_t j = 0; j < four; ++j) - { - uint16_t H1 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H2 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H3 = *reinterpret_cast(pHalf); - pHalf += InputStride; - uint16_t H4 = *reinterpret_cast(pHalf); - pHalf += InputStride; - - __m128i HV = _mm_setzero_si128(); - HV = _mm_insert_epi16(HV, H1, 0); - HV = _mm_insert_epi16(HV, H2, 1); - HV = _mm_insert_epi16(HV, H3, 2); - HV = _mm_insert_epi16(HV, H4, 3); - __m128 FV = _mm_cvtph_ps(HV); - - _mm_store_ss(reinterpret_cast(pFloat), FV); - pFloat += OutputStride; - *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 1); - pFloat += OutputStride; - *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 2); - pFloat += OutputStride; - *reinterpret_cast(pFloat) = _mm_extract_ps(FV, 3); - pFloat += OutputStride; - i += 4; - } - } - } - - for (; i < HalfCount; ++i) - { - *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); - pHalf += InputStride; - pFloat += OutputStride; - } - - XM_SFENCE(); - - return pOutputStream; -#else - const uint8_t* pHalf = reinterpret_cast(pInputStream); - uint8_t* pFloat = reinterpret_cast(pOutputStream); - - for (size_t i = 0; i < HalfCount; i++) - { - *reinterpret_cast(pFloat) = XMConvertHalfToFloat(reinterpret_cast(pHalf)[0]); - pHalf += 
InputStride; - pFloat += OutputStride; - } - - return pOutputStream; -#endif // !_XM_F16C_INTRINSICS_ -} - -//------------------------------------------------------------------------------ - -inline PackedVector::HALF PackedVector::XMConvertFloatToHalf -( - float Value -) -{ -#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - __m128 V1 = _mm_set_ss( Value ); - __m128i V2 = _mm_cvtps_ph( V1, 0 ); - return static_cast( _mm_cvtsi128_si32(V2) ); -#else - uint32_t Result; - - uint32_t IValue = reinterpret_cast(&Value)[0]; - uint32_t Sign = (IValue & 0x80000000U) >> 16U; - IValue = IValue & 0x7FFFFFFFU; // Hack off the sign - - if (IValue > 0x477FE000U) - { - // The number is too large to be represented as a half. Saturate to infinity. - if (((IValue & 0x7F800000) == 0x7F800000) && ((IValue & 0x7FFFFF ) != 0)) - { - Result = 0x7FFF; // NAN - } - else - { - Result = 0x7C00U; // INF - } - } - else - { - if (IValue < 0x38800000U) - { - // The number is too small to be represented as a normalized half. - // Convert it to a denormalized value. - uint32_t Shift = 113U - (IValue >> 23U); - IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift; - } - else - { - // Rebias the exponent to represent the value as a normalized half. - IValue += 0xC8000000U; - } - - Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U)&0x7FFFU; - } - return (HALF)(Result|Sign); -#endif // !_XM_F16C_INTRINSICS_ -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::HALF* PackedVector::XMConvertFloatToHalfStream -( - HALF* pOutputStream, - size_t OutputStride, - const float* pInputStream, - size_t InputStride, - size_t FloatCount -) -{ - assert(pOutputStream); - assert(pInputStream); - - assert(InputStride >= sizeof(float)); - _Analysis_assume_(InputStride >= sizeof(float)); - - assert(OutputStride >= sizeof(HALF)); - _Analysis_assume_(OutputStride >= sizeof(HALF)); - -#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - const uint8_t* pFloat = reinterpret_cast(pInputStream); - uint8_t* pHalf = reinterpret_cast(pOutputStream); - - size_t i = 0; - size_t four = FloatCount >> 2; - if (four > 0) - { - if (InputStride == sizeof(float)) - { - if (OutputStride == sizeof(HALF)) - { - if ( ((uintptr_t)pFloat & 0xF) == 0) - { - // Aligned and packed input, packed output - for (size_t j = 0; j < four; ++j) - { - __m128 FV = _mm_load_ps( reinterpret_cast(pFloat) ); - pFloat += InputStride*4; - - __m128i HV = _mm_cvtps_ph( FV, 0 ); - - _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); - pHalf += OutputStride*4; - i += 4; - } - } - else - { - // Packed input, packed output - for (size_t j = 0; j < four; ++j) - { - __m128 FV = _mm_loadu_ps( reinterpret_cast(pFloat) ); - pFloat += InputStride*4; - - __m128i HV = _mm_cvtps_ph( FV, 0 ); - - _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); - pHalf += OutputStride*4; - i += 4; - } - } - } - else - { - if ( ((uintptr_t)pFloat & 0xF) == 0) - { - // Aligned & packed input, scattered output - for (size_t j = 0; j < four; ++j) - { - __m128 FV = _mm_load_ps( reinterpret_cast(pFloat) ); - pFloat += InputStride*4; - - __m128i HV = _mm_cvtps_ph( FV, 0 ); - - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 0 ) ); - pHalf += OutputStride; - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 1 ) ); - pHalf += OutputStride; - *reinterpret_cast(pHalf) = static_cast( _mm_extract_epi16( HV, 2 ) ); - pHalf += OutputStride; - *reinterpret_cast(pHalf) = 
static_cast<HALF>( _mm_extract_epi16( HV, 3 ) ); - pHalf += OutputStride; - i += 4; - } - } - else - { - // Packed input, scattered output - for (size_t j = 0; j < four; ++j) - { - __m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) ); - pFloat += InputStride*4; - - __m128i HV = _mm_cvtps_ph( FV, 0 ); - - *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) ); - pHalf += OutputStride; - *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) ); - pHalf += OutputStride; - *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) ); - pHalf += OutputStride; - *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) ); - pHalf += OutputStride; - i += 4; - } - } - } - } - else if (OutputStride == sizeof(HALF)) - { - // Scattered input, packed output - for (size_t j = 0; j < four; ++j) - { - __m128 FV1 = _mm_load_ss( reinterpret_cast<const float*>(pFloat) ); - pFloat += InputStride; - - __m128 FV2 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) ); - pFloat += InputStride; - - __m128 FV3 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) ); - pFloat += InputStride; - - __m128 FV4 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) ); - pFloat += InputStride; - - __m128 FV = _mm_blend_ps( FV1, FV2, 0x2 ); - __m128 FT = _mm_blend_ps( FV3, FV4, 0x8 ); - FV = _mm_blend_ps( FV, FT, 0xC ); - - __m128i HV = _mm_cvtps_ph( FV, 0 ); - - _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); - pHalf += OutputStride*4; - i += 4; - } - } - else - { - // Scattered input, scattered output - for (size_t j = 0; j < four; ++j) - { - __m128 FV1 = _mm_load_ss(reinterpret_cast<const float*>(pFloat)); - pFloat += InputStride; - - __m128 FV2 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat)); - pFloat += InputStride; - - __m128 FV3 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat)); - pFloat += InputStride; - - __m128 FV4 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat)); - pFloat += InputStride; - - __m128 FV = _mm_blend_ps(FV1, FV2, 0x2); - __m128 FT = _mm_blend_ps(FV3, FV4, 0x8); - FV = _mm_blend_ps(FV, FT, 0xC); - - __m128i HV = _mm_cvtps_ph(FV, 0); - - *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0)); - pHalf += OutputStride; - *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 1)); - pHalf += OutputStride; - *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 2)); - pHalf += OutputStride; - *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 3)); - pHalf += OutputStride; - i += 4; - } - } - } - - for (; i < FloatCount; ++i) - { - *reinterpret_cast<HALF*>(pHalf) = XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]); - pFloat += InputStride; - pHalf += OutputStride; - } - - return pOutputStream; -#else - const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream); - uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream); - - for (size_t i = 0; i < FloatCount; i++) - { - *reinterpret_cast<HALF*>(pHalf) = XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]); - pFloat += InputStride; - pHalf += OutputStride; - } - return pOutputStream; -#endif // !_XM_F16C_INTRINSICS_ -} - -#pragma prefast(pop)
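A usage sketch of the two stream converters above (illustrative, not part of the library): the byte strides let them walk interleaved data in place, so compressing an array of vertex positions takes one call per component. The Vertex layout and the CompressPositions name are hypothetical.

#include <DirectXPackedVector.h>

struct Vertex { float x, y, z; uint32_t color; };   // hypothetical interleaved layout

inline void CompressPositions(_In_reads_(count) const Vertex* verts,
                              _Out_writes_(count*3) DirectX::PackedVector::HALF* out,
                              size_t count)
{
    using namespace DirectX::PackedVector;
    // Output is packed x,y,z triples (6-byte stride); the input pointer
    // advances by the full Vertex stride within each component stream.
    XMConvertFloatToHalfStream(out,     sizeof(HALF)*3, &verts[0].x, sizeof(Vertex), count);
    XMConvertFloatToHalfStream(out + 1, sizeof(HALF)*3, &verts[0].y, sizeof(Vertex), count);
    XMConvertFloatToHalfStream(out + 2, sizeof(HALF)*3, &verts[0].z, sizeof(Vertex), count);
}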
- -/**************************************************************************** - * - * Vector and matrix load operations - * - ****************************************************************************/ -#pragma prefast(push) -#pragma prefast(disable:28931, "PREfast noise: Esp:1266") - -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadColor -( - const XMCOLOR* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - // int32_t -> Float conversions are done in one instruction. - // uint32_t -> Float calls a runtime function. Keep in int32_t - int32_t iColor = (int32_t)(pSource->c); - XMVECTORF32 vColor = { - (float)((iColor >> 16) & 0xFF) * (1.0f/255.0f), - (float)((iColor >> 8) & 0xFF) * (1.0f/255.0f), - (float)(iColor & 0xFF) * (1.0f/255.0f), - (float)((iColor >> 24) & 0xFF) * (1.0f/255.0f) - }; - return vColor.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32_t bgra = pSource->c; - uint32_t rgba = (bgra & 0xFF00FF00) | ((bgra >> 16) & 0xFF) | ((bgra << 16) & 0xFF0000); - uint32x2_t vInt8 = vdup_n_u32(rgba); - uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u32(vInt8) ); - uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) ); - float32x4_t R = vcvtq_f32_u32(vInt); - return vmulq_n_f32( R, 1.0f/255.0f ); -#elif defined(_XM_SSE_INTRINSICS_) - // Splat the color in all four entries - __m128i vInt = _mm_set1_epi32(pSource->c); - // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 - vInt = _mm_and_si128(vInt,g_XMMaskA8R8G8B8); - // a is unsigned! Flip the bit to convert the order to signed - vInt = _mm_xor_si128(vInt,g_XMFlipA8R8G8B8); - // Convert to floating point numbers - XMVECTOR vTemp = _mm_cvtepi32_ps(vInt); - // RGB + 0, A + 0x80000000.f to undo the signed order. - vTemp = _mm_add_ps(vTemp,g_XMFixAA8R8G8B8); - // Convert 0-255 to 0.0f-1.0f - return _mm_mul_ps(vTemp,g_XMNormalizeA8R8G8B8); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadHalf2 -( - const XMHALF2* pSource -) -{ - assert(pSource); -#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - __m128 V = _mm_load_ss( reinterpret_cast<const float*>(pSource) ); - return _mm_cvtph_ps( _mm_castps_si128( V ) ); -#else - XMVECTORF32 vResult = { - XMConvertHalfToFloat(pSource->x), - XMConvertHalfToFloat(pSource->y), - 0.0f, - 0.0f - }; - return vResult.v; -#endif // !_XM_F16C_INTRINSICS_ -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadShortN2 -( - const XMSHORTN2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (pSource->x == -32768) ? -1.f : ((float)pSource->x * (1.0f/32767.0f)), - (pSource->y == -32768) ? -1.f : ((float)pSource->y * (1.0f/32767.0f)), - 0.0f, - 0.0f - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) ); - int32x4_t vInt = vmovl_s16( vreinterpret_s16_u32(vInt16) ); - vInt = vandq_s32( vInt, g_XMMaskXY ); - float32x4_t R = vcvtq_f32_s32(vInt); - R = vmulq_n_f32( R, 1.0f/32767.0f ); - return vmaxq_f32( R, vdupq_n_f32(-1.f) ); -#elif defined(_XM_SSE_INTRINSICS_) - // Splat the two shorts in all four entries (WORD alignment okay, - // DWORD alignment preferred) - __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->x)); - // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 - vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); - // x needs to be sign extended - vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // x - 0x8000 to undo the signed order.
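// A one-lane scalar sketch of the XOR/add trick used here (illustration
// only, not library code): flipping the sign bit reorders a signed 16-bit
// field as an unsigned 0..65535 value, so a single signed int->float
// conversion plus a bias subtraction recovers it:
//
//     uint32_t flipped = lane16 ^ 0x8000u;    // reorder as 0..65535
//     float f = (float)(int32_t)flipped;      // one signed conversion
//     f -= 32768.0f;                          // undo the bias
//
// e.g. lane16 = 0xFFFF (-1 as int16): flipped = 0x7FFF = 32767, and
// 32767.0f - 32768.0f = -1.0f. The add below applies the same bias fix.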
- vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16); - // Convert -1.0f - 1.0f - vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16); - // Clamp result (for case of -32768) - return _mm_max_ps( vTemp, g_XMNegativeOne ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadShort2 -( - const XMSHORT2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (float)pSource->x, - (float)pSource->y, - 0.f, - 0.f - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast( pSource ) ); - int32x4_t vInt = vmovl_s16( vreinterpret_s16_u32(vInt16) ); - vInt = vandq_s32( vInt, g_XMMaskXY ); - return vcvtq_f32_s32(vInt); -#elif defined(_XM_SSE_INTRINSICS_) - // Splat the two shorts in all four entries (WORD alignment okay, - // DWORD alignment preferred) - __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); - // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 - vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); - // x needs to be sign extended - vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // x - 0x8000 to undo the signed order. - vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16); - // Y is 65536 too large - return _mm_mul_ps(vTemp,g_XMFixupY16); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUShortN2 -( - const XMUSHORTN2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (float)pSource->x / 65535.0f, - (float)pSource->y / 65535.0f, - 0.f, - 0.f - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast( pSource ) ); - uint32x4_t vInt = vmovl_u16( vreinterpret_u16_u32(vInt16) ); - vInt = vandq_u32( vInt, g_XMMaskXY ); - float32x4_t R = vcvtq_f32_u32(vInt); - R = vmulq_n_f32( R, 1.0f/65535.0f ); - return vmaxq_f32( R, vdupq_n_f32(-1.f) ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 FixupY16 = {1.0f/65535.0f,1.0f/(65535.0f*65536.0f),0.0f,0.0f}; - static const XMVECTORF32 FixaddY16 = {0,32768.0f*65536.0f,0,0}; - // Splat the two shorts in all four entries (WORD alignment okay, - // DWORD alignment preferred) - __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); - // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 - vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); - // y needs to be sign flipped - vTemp = _mm_xor_ps(vTemp,g_XMFlipY); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // y + 0x8000 to undo the signed order. 
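// Note on the fix-up constants that follow (illustrative): the y field was
// masked in place in bits 16..31, so after conversion the float holds
// y*65536. The FixupY16 multiply folds that implicit shift into the
// normalization in a single operation:
//
//     (float)(y << 16) * (1.0f/(65535.0f*65536.0f))  ->  y / 65535.0f
//
// up to float rounding, e.g. y = 65535 maps back to 1.0f with no separate
// shift instruction needed.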
- vTemp = _mm_add_ps(vTemp,FixaddY16); - // Y is 65536 times too large - vTemp = _mm_mul_ps(vTemp,FixupY16); - return vTemp; -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUShort2 -( - const XMUSHORT2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (float)pSource->x, - (float)pSource->y, - 0.f, - 0.f - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast( pSource ) ); - uint32x4_t vInt = vmovl_u16( vreinterpret_u16_u32(vInt16) ); - vInt = vandq_u32( vInt, g_XMMaskXY ); - return vcvtq_f32_u32(vInt); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 FixaddY16 = {0,32768.0f,0,0}; - // Splat the two shorts in all four entries (WORD alignment okay, - // DWORD alignment preferred) - __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); - // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 - vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); - // y needs to be sign flipped - vTemp = _mm_xor_ps(vTemp,g_XMFlipY); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // Y is 65536 times too large - vTemp = _mm_mul_ps(vTemp,g_XMFixupY16); - // y + 0x8000 to undo the signed order. - vTemp = _mm_add_ps(vTemp,FixaddY16); - return vTemp; -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadByteN2 -( - const XMBYTEN2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (pSource->x == -128) ? -1.f : ((float)pSource->x * (1.0f/127.0f)), - (pSource->y == -128) ? -1.f : ((float)pSource->y * (1.0f/127.0f)), - 0.0f, - 0.0f - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast( pSource ) ); - int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u16(vInt8) ); - int32x4_t vInt = vmovl_s16( vget_low_s16( vInt16 ) ); - vInt = vandq_s32( vInt, g_XMMaskXY ); - float32x4_t R = vcvtq_f32_s32(vInt); - R = vmulq_n_f32( R, 1.0f/127.0f ); - return vmaxq_f32( R, vdupq_n_f32(-1.f) ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Scale = {1.0f/127.0f,1.0f/(127.0f*256.0f),0,0}; - static const XMVECTORU32 Mask = {0xFF,0xFF00,0,0}; - // Splat the color in all four entries (x,z,y,w) - XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); - // Mask - vTemp = _mm_and_ps(vTemp,Mask); - // x,y and z are unsigned! 
Flip the bits to convert the order to signed - vTemp = _mm_xor_ps(vTemp,g_XMXorByte4); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // x, y and z - 0x80 to complete the conversion - vTemp = _mm_add_ps(vTemp,g_XMAddByte4); - // Fix y, z and w because they are too large - vTemp = _mm_mul_ps(vTemp,Scale); - // Clamp result (for case of -128) - return _mm_max_ps( vTemp, g_XMNegativeOne ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadByte2 -( - const XMBYTE2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (float)pSource->x, - (float)pSource->y, - 0.0f, - 0.0f - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast( pSource ) ); - int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u16(vInt8) ); - int32x4_t vInt = vmovl_s16( vget_low_s16(vInt16) ); - vInt = vandq_s32( vInt, g_XMMaskXY ); - return vcvtq_f32_s32(vInt); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Scale = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)}; - static const XMVECTORU32 Mask = {0xFF,0xFF00,0,0}; - // Splat the color in all four entries (x,z,y,w) - XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); - // Mask - vTemp = _mm_and_ps(vTemp,Mask); - // x,y and z are unsigned! Flip the bits to convert the order to signed - vTemp = _mm_xor_ps(vTemp,g_XMXorByte4); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // x, y and z - 0x80 to complete the conversion - vTemp = _mm_add_ps(vTemp,g_XMAddByte4); - // Fix y, z and w because they are too large - return _mm_mul_ps(vTemp,Scale); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUByteN2 -( - const XMUBYTEN2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (float)pSource->x * (1.0f/255.0f), - (float)pSource->y * (1.0f/255.0f), - 0.0f, - 0.0f - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast( pSource ) ); - uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u16(vInt8) ); - uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) ); - vInt = vandq_u32( vInt, g_XMMaskXY ); - float32x4_t R = vcvtq_f32_u32(vInt); - return vmulq_n_f32( R, 1.0f/255.0f ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Scale = {1.0f/255.0f,1.0f/(255.0f*256.0f),0,0}; - static const XMVECTORU32 Mask = {0xFF,0xFF00,0,0}; - // Splat the color in all four entries (x,z,y,w) - XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); - // Mask - vTemp = _mm_and_ps(vTemp,Mask); - // w is signed! 
Flip the bits to convert the order to unsigned - vTemp = _mm_xor_ps(vTemp,g_XMFlipW); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // w + 0x80 to complete the conversion - vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); - // Fix y because it is too large - return _mm_mul_ps(vTemp,Scale); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUByte2 -( - const XMUBYTE2* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (float)pSource->x, - (float)pSource->y, - 0.0f, - 0.0f - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast<const uint16_t*>( pSource ) ); - uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u16(vInt8) ); - uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) ); - vInt = vandq_u32( vInt, g_XMMaskXY ); - return vcvtq_f32_u32(vInt); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Scale = {1.0f,1.0f/256.0f,0,0}; - static const XMVECTORU32 Mask = {0xFF,0xFF00,0,0}; - // Splat the color in all four entries (x,z,y,w) - XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x)); - // Mask - vTemp = _mm_and_ps(vTemp,Mask); - // w is signed! Flip the bits to convert the order to unsigned - vTemp = _mm_xor_ps(vTemp,g_XMFlipW); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // w + 0x80 to complete the conversion - vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); - // Fix y because it is too large - return _mm_mul_ps(vTemp,Scale); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadU565 -( - const XMU565* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - float(pSource->v & 0x1F), - float((pSource->v >> 5) & 0x3F), - float((pSource->v >> 11) & 0x1F), - 0.f, - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORI32 U565And = {0x1F,0x3F<<5,0x1F<<11,0}; - static const XMVECTORF32 U565Mul = {1.0f,1.0f/32.0f,1.0f/2048.f,0}; - uint16x4_t vInt16 = vld1_dup_u16( reinterpret_cast<const uint16_t*>( pSource ) ); - uint32x4_t vInt = vmovl_u16( vInt16 ); - vInt = vandq_u32(vInt,U565And); - float32x4_t R = vcvtq_f32_u32(vInt); - return vmulq_f32(R,U565Mul); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORI32 U565And = {0x1F,0x3F<<5,0x1F<<11,0}; - static const XMVECTORF32 U565Mul = {1.0f,1.0f/32.0f,1.0f/2048.f,0}; - // Get the 32 bit value and splat it - XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v)); - // Mask off x, y and z - vResult = _mm_and_ps(vResult,U565And); - // Convert to float - vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); - // Normalize x, y, and z - vResult = _mm_mul_ps(vResult,U565Mul); - return vResult; -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadFloat3PK -( - const XMFLOAT3PK* pSource -) -{ - assert(pSource); - - __declspec(align(16)) uint32_t Result[4]; - uint32_t Mantissa; - uint32_t Exponent; - - // X Channel (6-bit mantissa) - Mantissa = pSource->xm; - - if ( pSource->xe == 0x1f ) // INF or NAN - { - Result[0] = 0x7f800000 | (pSource->xm << 17); - } - else - { - if ( pSource->xe != 0 ) // The value is normalized - { - Exponent = pSource->xe;
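// Illustrative note (not library code): XMFLOAT3PK is the 11:11:10
// packed-float layout; x and y carry a 6-bit mantissa with a 5-bit biased
// exponent, z a 5-bit mantissa with a 5-bit exponent. Each field becomes a
// full float32 by rebasing the exponent from bias 15 to bias 127 (the +112
// below) and left-justifying the mantissa in float32's 23-bit field
// (<<17 for a 6-bit mantissa, <<18 for 5 bits). For example, xe = 15 and
// xm = 0x20 give:
//
//     ((15 + 112) << 23) | (0x20 << 17) = 0x3FC00000 = 1.5f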
- } - else if (Mantissa != 0) // The value is denormalized - { - // Normalize the value in the resulting float - Exponent = 1; - - do - { - Exponent--; - Mantissa <<= 1; - } while ((Mantissa & 0x40) == 0); - - Mantissa &= 0x3F; - } - else // The value is zero - { - Exponent = (uint32_t)-112; - } - - Result[0] = ((Exponent + 112) << 23) | (Mantissa << 17); - } - - // Y Channel (6-bit mantissa) - Mantissa = pSource->ym; - - if ( pSource->ye == 0x1f ) // INF or NAN - { - Result[1] = 0x7f800000 | (pSource->ym << 17); - } - else - { - if ( pSource->ye != 0 ) // The value is normalized - { - Exponent = pSource->ye; - } - else if (Mantissa != 0) // The value is denormalized - { - // Normalize the value in the resulting float - Exponent = 1; - - do - { - Exponent--; - Mantissa <<= 1; - } while ((Mantissa & 0x40) == 0); - - Mantissa &= 0x3F; - } - else // The value is zero - { - Exponent = (uint32_t)-112; - } - - Result[1] = ((Exponent + 112) << 23) | (Mantissa << 17); - } - - // Z Channel (5-bit mantissa) - Mantissa = pSource->zm; - - if ( pSource->ze == 0x1f ) // INF or NAN - { - Result[2] = 0x7f800000 | (pSource->zm << 17); - } - else - { - if ( pSource->ze != 0 ) // The value is normalized - { - Exponent = pSource->ze; - } - else if (Mantissa != 0) // The value is denormalized - { - // Normalize the value in the resulting float - Exponent = 1; - - do - { - Exponent--; - Mantissa <<= 1; - } while ((Mantissa & 0x20) == 0); - - Mantissa &= 0x1F; - } - else // The value is zero - { - Exponent = (uint32_t)-112; - } - - Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18); - } - - return XMLoadFloat3A( reinterpret_cast(&Result) ); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadFloat3SE -( - const XMFLOAT3SE* pSource -) -{ - assert(pSource); - - union { float f; int32_t i; } fi; - fi.i = 0x33800000 + (pSource->e << 23); - float Scale = fi.f; - - XMVECTORF32 v = { - Scale * float( pSource->xm ), - Scale * float( pSource->ym ), - Scale * float( pSource->zm ), - 1.0f }; - return v; -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadHalf4 -( - const XMHALF4* pSource -) -{ - assert(pSource); -#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) - __m128i V = _mm_loadl_epi64( reinterpret_cast(pSource) ); - return _mm_cvtph_ps( V ); -#else - XMVECTORF32 vResult = { - XMConvertHalfToFloat(pSource->x), - XMConvertHalfToFloat(pSource->y), - XMConvertHalfToFloat(pSource->z), - XMConvertHalfToFloat(pSource->w) - }; - return vResult.v; -#endif // !_XM_F16C_INTRINSICS_ -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadShortN4 -( - const XMSHORTN4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (pSource->x == -32768) ? -1.f : ((float)pSource->x * (1.0f/32767.0f)), - (pSource->y == -32768) ? -1.f : ((float)pSource->y * (1.0f/32767.0f)), - (pSource->z == -32768) ? -1.f : ((float)pSource->z * (1.0f/32767.0f)), - (pSource->w == -32768) ? 
-1.f : ((float)pSource->w * (1.0f/32767.0f)) - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - int16x4_t vInt = vld1_s16( (const int16_t*)pSource ); - int32x4_t V = vmovl_s16( vInt ); - V = vcvtq_f32_s32( V ); - V = vmulq_n_f32( V, 1.0f/32767.0f ); - return vmaxq_f32( V, vdupq_n_f32(-1.f) ); -#elif defined(_XM_SSE_INTRINSICS_) - // Splat the color in all four entries (x,z,y,w) - __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); - // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 - __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16); - // x and z are unsigned! Flip the bits to convert the order to signed - vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // x and z - 0x8000 to complete the conversion - vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16); - // Convert to -1.0f - 1.0f - vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16Z16W16); - // Very important! The entries are x,z,y,w, flip it to x,y,z,w - vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0)); - // Clamp result (for case of -32768) - return _mm_max_ps( vTemp, g_XMNegativeOne ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadShort4 -( - const XMSHORT4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (float)pSource->x, - (float)pSource->y, - (float)pSource->z, - (float)pSource->w - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - int16x4_t vInt = vld1_s16( (const int16_t*)pSource ); - int32x4_t V = vmovl_s16( vInt ); - return vcvtq_f32_s32( V ); -#elif defined(_XM_SSE_INTRINSICS_) - // Splat the color in all four entries (x,z,y,w) - __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); - // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 - __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16); - // x and z are unsigned! Flip the bits to convert the order to signed - vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // x and z - 0x8000 to complete the conversion - vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16); - // Fix y and w because they are 65536 too large - vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16); - // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w - return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUShortN4 -( - const XMUSHORTN4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (float)pSource->x / 65535.0f, - (float)pSource->y / 65535.0f, - (float)pSource->z / 65535.0f, - (float)pSource->w / 65535.0f - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint16x4_t vInt = vld1_u16( (const uint16_t*)pSource ); - uint32x4_t V = vmovl_u16( vInt ); - V = vcvtq_f32_u32( V ); - return vmulq_n_f32( V, 1.0f/65535.0f ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 FixupY16W16 = {1.0f/65535.0f,1.0f/65535.0f,1.0f/(65535.0f*65536.0f),1.0f/(65535.0f*65536.0f)}; - static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f*65536.0f,32768.0f*65536.0f}; - // Splat the color in all four entries (x,z,y,w) - __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); - // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 - __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16); - // y and w are signed! Flip the bits to convert the order to unsigned - vTemp = _mm_xor_ps(vTemp,g_XMFlipZW); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // y and w + 0x8000 to complete the conversion - vTemp = _mm_add_ps(vTemp,FixaddY16W16); - // Fix y and w because they are 65536 too large - vTemp = _mm_mul_ps(vTemp,FixupY16W16); - // Very important! The entries are x,z,y,w, flip it to x,y,z,w - return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUShort4 -( - const XMUSHORT4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - XMVECTORF32 vResult = { - (float)pSource->x, - (float)pSource->y, - (float)pSource->z, - (float)pSource->w - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint16x4_t vInt = vld1_u16( (const uint16_t*)pSource ); - uint32x4_t V = vmovl_u16( vInt ); - return vcvtq_f32_u32( V ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f,32768.0f}; - // Splat the color in all four entries (x,z,y,w) - __m128d vIntd = _mm_load1_pd(reinterpret_cast(&pSource->x)); - // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 - __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16); - // y and w are signed! Flip the bits to convert the order to unsigned - vTemp = _mm_xor_ps(vTemp,g_XMFlipZW); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // Fix y and w because they are 65536 too large - vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16); - // y and w + 0x8000 to complete the conversion - vTemp = _mm_add_ps(vTemp,FixaddY16W16); - // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w - return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadXDecN4 -( - const XMXDECN4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00}; - - uint32_t ElementX = pSource->v & 0x3FF; - uint32_t ElementY = (pSource->v >> 10) & 0x3FF; - uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; - - XMVECTORF32 vResult = { - (ElementX == 0x200) ? -1.f : ((float)(int16_t)(ElementX | SignExtend[ElementX >> 9]) / 511.0f), - (ElementY == 0x200) ? -1.f : ((float)(int16_t)(ElementY | SignExtend[ElementY >> 9]) / 511.0f), - (ElementZ == 0x200) ? -1.f : ((float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]) / 511.0f), - (float)(pSource->v >> 30) / 3.0f - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast( pSource ) ); - vInt = vandq_u32(vInt,g_XMMaskA2B10G10R10); - vInt = veorq_u32(vInt,g_XMFlipA2B10G10R10); - float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) ); - R = vaddq_f32(R,g_XMFixAA2B10G10R10); - R = vmulq_f32(R,g_XMNormalizeA2B10G10R10); - return vmaxq_f32( R, vdupq_n_f32(-1.0f) ); -#elif defined(_XM_SSE_INTRINSICS_) - // Splat the color in all four entries - __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); - // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 - vTemp = _mm_and_ps(vTemp,g_XMMaskA2B10G10R10); - // a is unsigned! Flip the bit to convert the order to signed - vTemp = _mm_xor_ps(vTemp,g_XMFlipA2B10G10R10); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // RGB + 0, A + 0x80000000.f to undo the signed order. 
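// The SignExtend table in the scalar path above is branch-free
// two's-complement sign extension of a 10-bit field; a standalone sketch
// (illustration only):
//
//     int32_t SignExtend10(uint32_t field)    // field in 0..0x3FF
//     {
//         static const uint32_t SignExtend[] = { 0x00000000, 0xFFFFFC00 };
//         return (int32_t)(field | SignExtend[field >> 9]);
//     }
//
// e.g. 0x3FF -> 0xFFFFFFFF (-1) and 0x200 -> 0xFFFFFE00 (-512). The add
// below completes the equivalent bias fix-up on the SIMD path.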
- vTemp = _mm_add_ps(vTemp,g_XMFixAA2B10G10R10); - // Normalize x, y and z to -1.0f..1.0f, and w to 0.0f..1.0f - vTemp = _mm_mul_ps(vTemp,g_XMNormalizeA2B10G10R10); - // Clamp result (for case of -512) - return _mm_max_ps( vTemp, g_XMNegativeOne ); -#endif -} - -//------------------------------------------------------------------------------ -#pragma warning(push) -#pragma warning(disable : 4996) -// C4996: ignore deprecation warning - -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadXDec4 -( - const XMXDEC4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00}; - - uint32_t ElementX = pSource->v & 0x3FF; - uint32_t ElementY = (pSource->v >> 10) & 0x3FF; - uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; - - XMVECTORF32 vResult = { - (float)(int16_t)(ElementX | SignExtend[ElementX >> 9]), - (float)(int16_t)(ElementY | SignExtend[ElementY >> 9]), - (float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]), - (float)(pSource->v >> 30) - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORU32 XDec4Xor = {0x200, 0x200<<10, 0x200<<20, 0x80000000}; - static const XMVECTORF32 XDec4Add = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,32768*65536.0f}; - uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) ); - vInt = vandq_u32(vInt,g_XMMaskDec4); - vInt = veorq_u32(vInt,XDec4Xor); - float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) ); - R = vaddq_f32(R, XDec4Add); - return vmulq_f32(R,g_XMMulDec4); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORU32 XDec4Xor = {0x200, 0x200<<10, 0x200<<20, 0x80000000}; - static const XMVECTORF32 XDec4Add = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,32768*65536.0f}; - // Splat the color in all four entries - XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v)); - // Mask off x, y and z (10 bits each) and w (2 bits) - vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); - // The fields are unsigned! Flip each sign bit to convert the order to signed - vTemp = _mm_xor_ps(vTemp,XDec4Xor); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // x, y and z - 512 (scaled per lane), w + 0x80000000.f to undo the signed order.
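// Why XMLoadXDecN4 above ends with _mm_max_ps(vTemp, g_XMNegativeOne): a
// signed 10-bit field spans -512..511 while the normalizing scale divides
// by 511, so the lone value -512 would otherwise decode to -512/511, about
// -1.002. The clamp pins it to -1.0f, matching the scalar path's explicit
// (ElementX == 0x200) ? -1.f test, so both -512 and -511 decode to exactly
// -1.0f. XMLoadXDec4 here returns raw (unnormalized) values, so no clamp
// is needed; the add that follows just completes its sign bias fix-up.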
- vTemp = _mm_add_ps(vTemp,XDec4Add); - // Convert 0-255 to 0.0f-1.0f - vTemp = _mm_mul_ps(vTemp,g_XMMulDec4); - return vTemp; -#endif -} - -#pragma warning(pop) - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUDecN4 -( - const XMUDECN4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - - uint32_t ElementX = pSource->v & 0x3FF; - uint32_t ElementY = (pSource->v >> 10) & 0x3FF; - uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; - - XMVECTORF32 vResult = { - (float)ElementX / 1023.0f, - (float)ElementY / 1023.0f, - (float)ElementZ / 1023.0f, - (float)(pSource->v >> 30) / 3.0f - }; - return vResult.v; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 UDecN4Mul = {1.0f/1023.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)}; - uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast( pSource ) ); - vInt = vandq_u32(vInt,g_XMMaskDec4); - float32x4_t R = vcvtq_f32_u32( vInt ); - return vmulq_f32(R,UDecN4Mul); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 UDecN4Mul = {1.0f/1023.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)}; - // Splat the color in all four entries - XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); - // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 - vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); - // a is unsigned! Flip the bit to convert the order to signed - vTemp = _mm_xor_ps(vTemp,g_XMFlipW); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // RGB + 0, A + 0x80000000.f to undo the signed order. - vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); - // Convert 0-255 to 0.0f-1.0f - vTemp = _mm_mul_ps(vTemp,UDecN4Mul); - return vTemp; -#endif -} - - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUDecN4_XR -( - const XMUDECN4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - - int32_t ElementX = pSource->v & 0x3FF; - int32_t ElementY = (pSource->v >> 10) & 0x3FF; - int32_t ElementZ = (pSource->v >> 20) & 0x3FF; - - XMVECTORF32 vResult = { - (float)(ElementX - 0x180) / 510.0f, - (float)(ElementY - 0x180) / 510.0f, - (float)(ElementZ - 0x180) / 510.0f, - (float)(pSource->v >> 30) / 3.0f - }; - - return vResult.v; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 XRMul = {1.0f/510.0f,1.0f/(510.0f*1024.0f),1.0f/(510.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)}; - static const XMVECTORI32 XRBias = { 0x180, 0x180*1024, 0x180*1024*1024, 0 }; - uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast( pSource ) ); - vInt = vandq_u32(vInt,g_XMMaskDec4); - int32x4_t vTemp = vsubq_s32( vreinterpretq_s32_u32(vInt), XRBias ); - vTemp = veorq_u32( vTemp, g_XMFlipW ); - float32x4_t R = vcvtq_f32_s32( vTemp ); - R = vaddq_f32(R,g_XMAddUDec4); - return vmulq_f32(R,XRMul); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 XRMul = {1.0f/510.0f,1.0f/(510.0f*1024.0f),1.0f/(510.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)}; - static const XMVECTORI32 XRBias = { 0x180, 0x180*1024, 0x180*1024*1024, 0 }; - // Splat the color in all four entries - XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); - // Mask channels - vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); - // Subtract bias - vTemp = _mm_castsi128_ps( _mm_sub_epi32( 
_mm_castps_si128(vTemp), XRBias ) ); - // a is unsigned! Flip the bit to convert the order to signed - vTemp = _mm_xor_ps(vTemp,g_XMFlipW); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // RGB + 0, A + 0x80000000.f to undo the signed order. - vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); - // Convert to 0.0f-1.0f - return _mm_mul_ps(vTemp,XRMul); -#endif -} - - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUDec4 -( - const XMUDEC4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - uint32_t ElementX = pSource->v & 0x3FF; - uint32_t ElementY = (pSource->v >> 10) & 0x3FF; - uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; - - XMVECTORF32 vResult = { - (float)ElementX, - (float)ElementY, - (float)ElementZ, - (float)(pSource->v >> 30) - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast( pSource ) ); - vInt = vandq_u32(vInt,g_XMMaskDec4); - float32x4_t R = vcvtq_f32_u32( vInt ); - return vmulq_f32(R,g_XMMulDec4); -#elif defined(_XM_SSE_INTRINSICS_) - // Splat the color in all four entries - XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); - // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 - vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); - // a is unsigned! Flip the bit to convert the order to signed - vTemp = _mm_xor_ps(vTemp,g_XMFlipW); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // RGB + 0, A + 0x80000000.f to undo the signed order. - vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); - // Convert 0-255 to 0.0f-1.0f - vTemp = _mm_mul_ps(vTemp,g_XMMulDec4); - return vTemp; -#endif -} - -//------------------------------------------------------------------------------ -#pragma warning(push) -#pragma warning(disable : 4996) -// C4996: ignore deprecation warning - -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadDecN4 -( - const XMDECN4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00}; - static const uint32_t SignExtendW[] = {0x00000000, 0xFFFFFFFC}; - - uint32_t ElementX = pSource->v & 0x3FF; - uint32_t ElementY = (pSource->v >> 10) & 0x3FF; - uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; - uint32_t ElementW = pSource->v >> 30; - - XMVECTORF32 vResult = { - (ElementX == 0x200) ? -1.f : ((float)(int16_t)(ElementX | SignExtend[ElementX >> 9]) / 511.0f), - (ElementY == 0x200) ? -1.f : ((float)(int16_t)(ElementY | SignExtend[ElementY >> 9]) / 511.0f), - (ElementZ == 0x200) ? -1.f : ((float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]) / 511.0f), - (ElementW == 0x2) ? 
-1.f : ((float)(int16_t)(ElementW | SignExtendW[(ElementW >> 1) & 1])) - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 DecN4Mul = {1.0f/511.0f,1.0f/(511.0f*1024.0f),1.0f/(511.0f*1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)}; - uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast( pSource ) ); - vInt = vandq_u32(vInt,g_XMMaskDec4); - vInt = veorq_u32(vInt,g_XMXorDec4); - float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) ); - R = vaddq_f32(R,g_XMAddDec4); - R = vmulq_f32(R,DecN4Mul); - return vmaxq_f32( R, vdupq_n_f32(-1.0f) ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 DecN4Mul = {1.0f/511.0f,1.0f/(511.0f*1024.0f),1.0f/(511.0f*1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)}; - // Splat the color in all four entries - XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); - // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 - vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); - // a is unsigned! Flip the bit to convert the order to signed - vTemp = _mm_xor_ps(vTemp,g_XMXorDec4); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // RGB + 0, A + 0x80000000.f to undo the signed order. - vTemp = _mm_add_ps(vTemp,g_XMAddDec4); - // Convert 0-255 to 0.0f-1.0f - vTemp = _mm_mul_ps(vTemp,DecN4Mul); - // Clamp result (for case of -512/-1) - return _mm_max_ps( vTemp, g_XMNegativeOne ); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline XMVECTOR XM_CALLCONV PackedVector::XMLoadDec4 -( - const XMDEC4* pSource -) -{ - assert(pSource); -#if defined(_XM_NO_INTRINSICS_) - static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00}; - static const uint32_t SignExtendW[] = {0x00000000, 0xFFFFFFFC}; - - uint32_t ElementX = pSource->v & 0x3FF; - uint32_t ElementY = (pSource->v >> 10) & 0x3FF; - uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; - uint32_t ElementW = pSource->v >> 30; - - XMVECTORF32 vResult = { - (float)(int16_t)(ElementX | SignExtend[ElementX >> 9]), - (float)(int16_t)(ElementY | SignExtend[ElementY >> 9]), - (float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]), - (float)(int16_t)(ElementW | SignExtendW[ElementW >> 1]) - }; - return vResult.v; -#elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast( pSource ) ); - vInt = vandq_u32(vInt,g_XMMaskDec4); - vInt = veorq_u32(vInt,g_XMXorDec4); - float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) ); - R = vaddq_f32(R,g_XMAddDec4); - return vmulq_f32(R,g_XMMulDec4); -#elif defined(_XM_SSE_INTRINSICS_) - // Splat the color in all four entries - XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast(&pSource->v)); - // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 - vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); - // a is unsigned! Flip the bit to convert the order to signed - vTemp = _mm_xor_ps(vTemp,g_XMXorDec4); - // Convert to floating point numbers - vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); - // RGB + 0, A + 0x80000000.f to undo the signed order. 
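// Worked example for the DecN4 decode rules above (illustration only):
// take v = 0x400805FF, with x = v & 0x3FF, y = (v >> 10) & 0x3FF,
// z = (v >> 20) & 0x3FF, w = v >> 30:
//
//     x = 0x1FF = 511  ->  511/511                       =  1.0f
//     y = 0x201        ->  sign-extends to -511, -511/511 = -1.0f
//     z = 0x000        ->  0/511                          =  0.0f
//     w = 0x1          ->  1 (2-bit signed)               =  1.0f
//
// Note w is scaled by 1, not 3: the signed 2-bit field spans -2..1, as the
// scalar path's (ElementW == 0x2) ? -1.f clamp shows.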
-    vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
-    // Convert 0-255 to 0.0f-1.0f
-    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
-    return vTemp;
-#endif
-}
-
-#pragma warning(pop)
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUByteN4
-(
-    const XMUBYTEN4* pSource
-)
-{
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        (float)pSource->x / 255.0f,
-        (float)pSource->y / 255.0f,
-        (float)pSource->z / 255.0f,
-        (float)pSource->w / 255.0f
-    };
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
-    uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u32(vInt8) );
-    uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) );
-    float32x4_t R = vcvtq_f32_u32(vInt);
-    return vmulq_n_f32( R, 1.0f/255.0f );
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 LoadUByteN4Mul = {1.0f/255.0f,1.0f/(255.0f*256.0f),1.0f/(255.0f*65536.0f),1.0f/(255.0f*65536.0f*256.0f)};
-    // Splat the color in all four entries (x,z,y,w)
-    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
-    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
-    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
-    // w is signed! Flip the bits to convert the order to unsigned
-    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // w + 0x80 to complete the conversion
-    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
-    // Fix y, z and w because they are too large
-    vTemp = _mm_mul_ps(vTemp,LoadUByteN4Mul);
-    return vTemp;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUByte4
-(
-    const XMUBYTE4* pSource
-)
-{
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        (float)pSource->x,
-        (float)pSource->y,
-        (float)pSource->z,
-        (float)pSource->w
-    };
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
-    uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u32(vInt8) );
-    uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) );
-    return vcvtq_f32_u32(vInt);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 LoadUByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
-    // Splat the color in all four entries (x,z,y,w)
-    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
-    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
-    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
-    // w is signed! Flip the bits to convert the order to unsigned
-    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // w + 0x80 to complete the conversion
-    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
-    // Fix y, z and w because they are too large
-    vTemp = _mm_mul_ps(vTemp,LoadUByte4Mul);
-    return vTemp;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMVECTOR XM_CALLCONV PackedVector::XMLoadByteN4
-(
-    const XMBYTEN4* pSource
-)
-{
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        (pSource->x == -128) ? -1.f : ((float)pSource->x / 127.0f),
-        (pSource->y == -128) ? -1.f : ((float)pSource->y / 127.0f),
-        (pSource->z == -128) ? -1.f : ((float)pSource->z / 127.0f),
-        (pSource->w == -128) ? -1.f : ((float)pSource->w / 127.0f)
-    };
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
-    int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u32(vInt8) );
-    int32x4_t vInt = vmovl_s16( vget_low_s16(vInt16) );
-    float32x4_t R = vcvtq_f32_s32(vInt);
-    R = vmulq_n_f32( R, 1.0f/127.0f );
-    return vmaxq_f32( R, vdupq_n_f32(-1.f) );
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 LoadByteN4Mul = {1.0f/127.0f,1.0f/(127.0f*256.0f),1.0f/(127.0f*65536.0f),1.0f/(127.0f*65536.0f*256.0f)};
-    // Splat the color in all four entries (x,z,y,w)
-    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
-    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
-    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
-    // x,y and z are unsigned! Flip the bits to convert the order to signed
-    vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // x, y and z - 0x80 to complete the conversion
-    vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
-    // Fix y, z and w because they are too large
-    vTemp = _mm_mul_ps(vTemp,LoadByteN4Mul);
-    // Clamp result (for case of -128)
-    return _mm_max_ps( vTemp, g_XMNegativeOne );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMVECTOR XM_CALLCONV PackedVector::XMLoadByte4
-(
-    const XMBYTE4* pSource
-)
-{
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        (float)pSource->x,
-        (float)pSource->y,
-        (float)pSource->z,
-        (float)pSource->w
-    };
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
-    int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u32(vInt8) );
-    int32x4_t vInt = vmovl_s16( vget_low_s16(vInt16) );
-    return vcvtq_f32_s32(vInt);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 LoadByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
-    // Splat the color in all four entries (x,z,y,w)
-    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
-    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
-    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
-    // x,y and z are unsigned! Flip the bits to convert the order to signed
-    vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // x, y and z - 0x80 to complete the conversion
-    vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
-    // Fix y, z and w because they are too large
-    vTemp = _mm_mul_ps(vTemp,LoadByte4Mul);
-    return vTemp;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUNibble4
-(
-    const XMUNIBBLE4* pSource
-)
-{
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        float(pSource->v & 0xF),
-        float((pSource->v >> 4) & 0xF),
-        float((pSource->v >> 8) & 0xF),
-        float((pSource->v >> 12) & 0xF)
-    };
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORI32 UNibble4And = {0xF,0xF0,0xF00,0xF000};
-    static const XMVECTORF32 UNibble4Mul = {1.0f,1.0f/16.f,1.0f/256.f,1.0f/4096.f};
-    uint16x4_t vInt16 = vld1_dup_u16( reinterpret_cast<const uint16_t*>( pSource ) );
-    uint32x4_t vInt = vmovl_u16( vInt16 );
-    vInt = vandq_u32(vInt,UNibble4And);
-    float32x4_t R = vcvtq_f32_u32(vInt);
-    return vmulq_f32(R,UNibble4Mul);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORI32 UNibble4And = {0xF,0xF0,0xF00,0xF000};
-    static const XMVECTORF32 UNibble4Mul = {1.0f,1.0f/16.f,1.0f/256.f,1.0f/4096.f};
-    // Get the 32 bit value and splat it
-    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
-    // Mask off x, y and z
-    vResult = _mm_and_ps(vResult,UNibble4And);
-    // Convert to float
-    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
-    // Normalize x, y, and z
-    vResult = _mm_mul_ps(vResult,UNibble4Mul);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline XMVECTOR XM_CALLCONV PackedVector::XMLoadU555
-(
-    const XMU555* pSource
-)
-{
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        float(pSource->v & 0x1F),
-        float((pSource->v >> 5) & 0x1F),
-        float((pSource->v >> 10) & 0x1F),
-        float((pSource->v >> 15) & 0x1)
-    };
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORI32 U555And = {0x1F,0x1F<<5,0x1F<<10,0x8000};
-    static const XMVECTORF32 U555Mul = {1.0f,1.0f/32.f,1.0f/1024.f,1.0f/32768.f};
-    uint16x4_t vInt16 = vld1_dup_u16( reinterpret_cast<const uint16_t*>( pSource ) );
-    uint32x4_t vInt = vmovl_u16( vInt16 );
-    vInt = vandq_u32(vInt,U555And);
-    float32x4_t R = vcvtq_f32_u32(vInt);
-    return vmulq_f32(R,U555Mul);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORI32 U555And = {0x1F,0x1F<<5,0x1F<<10,0x8000};
-    static const XMVECTORF32 U555Mul = {1.0f,1.0f/32.f,1.0f/1024.f,1.0f/32768.f};
-    // Get the 32 bit value and splat it
-    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
-    // Mask off x, y and z
-    vResult = _mm_and_ps(vResult,U555And);
-    // Convert to float
-    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
-    // Normalize x, y, and z
-    vResult = _mm_mul_ps(vResult,U555Mul);
-    return vResult;
-#endif
-}
-
-#pragma prefast(pop)
-
-/****************************************************************************
- *
- * Vector and matrix store operations
- *
- ****************************************************************************/
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreColor
-(
-    XMCOLOR* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
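// XMCOLOR packs to 0xAARRGGBB: the scalar path below saturates each channel
// to [0, 1], scales to [0, 255] with rounding, then shifts w (alpha) into the
// top byte ahead of x, y, z (red, green, blue).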
-    XMVECTOR N = XMVectorSaturate(V);
-    N = XMVectorMultiply(N, g_UByteMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A( &tmp, N );
-
-    pDestination->c = ((uint32_t)tmp.w << 24) |
-                      ((uint32_t)tmp.x << 16) |
-                      ((uint32_t)tmp.y << 8) |
-                      ((uint32_t)tmp.z);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0) );
-    R = vminq_f32(R, vdupq_n_f32(1.0f));
-    R = vmulq_n_f32( R, 255.0f );
-    R = XMVectorRound(R);
-    uint32x4_t vInt32 = vcvtq_u32_f32(R);
-    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
-    uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) );
-    uint32_t rgba = vget_lane_u32( vreinterpret_u32_u8(vInt8), 0 );
-    pDestination->c = (rgba & 0xFF00FF00) | ((rgba >> 16) & 0xFF) | ((rgba << 16) & 0xFF0000);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Set <0 to 0
-    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
-    // Set >1 to 1
-    vResult = _mm_min_ps(vResult,g_XMOne);
-    // Convert to 0-255
-    vResult = _mm_mul_ps(vResult,g_UByteMax);
-    // Shuffle RGBA to ARGB
-    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
-    // Convert to int
-    __m128i vInt = _mm_cvtps_epi32(vResult);
-    // Mash to shorts
-    vInt = _mm_packs_epi32(vInt,vInt);
-    // Mash to bytes
-    vInt = _mm_packus_epi16(vInt,vInt);
-    // Store the color
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->c),_mm_castsi128_ps(vInt));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreHalf2
-(
-    XMHALF2* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    __m128i V1 = _mm_cvtps_ph( V, 0 );
-    _mm_store_ss( reinterpret_cast<float*>(pDestination), _mm_castsi128_ps(V1) );
-#else
-    pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
-    pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
-#endif // !_XM_F16C_INTRINSICS_
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreShortN2
-(
-    XMSHORTN2* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
-    N = XMVectorMultiply(N, g_ShortMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A( &tmp, N );
-
-    pDestination->x = (int16_t)tmp.x;
-    pDestination->y = (int16_t)tmp.y;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f) );
-    R = vminq_f32(R, vdupq_n_f32(1.0f));
-    R = vmulq_n_f32( R, 32767.0f );
-    int32x4_t vInt32 = vcvtq_s32_f32(R);
-    int16x4_t vInt16 = vqmovn_s32( vInt32 );
-    vst1_lane_u32( &pDestination->v, vreinterpret_u32_s16(vInt16), 0 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
-    vResult = _mm_min_ps(vResult,g_XMOne);
-    vResult = _mm_mul_ps(vResult,g_ShortMax);
-    __m128i vResulti = _mm_cvtps_epi32(vResult);
-    vResulti = _mm_packs_epi32(vResulti,vResulti);
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->x),_mm_castsi128_ps(vResulti));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreShort2
-(
-    XMSHORT2* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, g_ShortMin, g_ShortMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A( &tmp, N );
-
-    pDestination->x = (int16_t)tmp.x;
-    pDestination->y = (int16_t)tmp.y;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-32767.f) );
-    R = vminq_f32(R, vdupq_n_f32(32767.0f));
-    int32x4_t vInt32 = vcvtq_s32_f32(R);
-    int16x4_t vInt16 = vqmovn_s32( vInt32 );
-    vst1_lane_u32( &pDestination->v, vreinterpret_u32_s16(vInt16), 0 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Bounds check
-    XMVECTOR vResult = _mm_max_ps(V,g_ShortMin);
-    vResult = _mm_min_ps(vResult,g_ShortMax);
-    // Convert to int with rounding
-    __m128i vInt = _mm_cvtps_epi32(vResult);
-    // Pack the ints into shorts
-    vInt = _mm_packs_epi32(vInt,vInt);
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->x),_mm_castsi128_ps(vInt));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreUShortN2
-(
-    XMUSHORTN2* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorSaturate(V);
-    N = XMVectorMultiplyAdd(N, g_UShortMax, g_XMOneHalf.v);
-    N = XMVectorTruncate(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A( &tmp, N );
-
-    pDestination->x = (uint16_t)tmp.x;
-    pDestination->y = (uint16_t)tmp.y;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) );
-    R = vminq_f32(R, vdupq_n_f32(1.0f));
-    R = vmulq_n_f32( R, 65535.0f );
-    R = vaddq_f32( R, g_XMOneHalf );
-    uint32x4_t vInt32 = vcvtq_u32_f32(R);
-    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
-    vst1_lane_u32( &pDestination->v, vreinterpret_u32_u16(vInt16), 0 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Bounds check
-    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
-    vResult = _mm_min_ps(vResult,g_XMOne);
-    vResult = _mm_mul_ps(vResult,g_UShortMax);
-    vResult = _mm_add_ps(vResult,g_XMOneHalf);
-    // Convert to int
-    __m128i vInt = _mm_cvttps_epi32(vResult);
-    // Since the SSE pack instruction clamps using signed rules,
-    // manually extract the values to store them to memory
-    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
-    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreUShort2
-(
-    XMUSHORT2* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UShortMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A( &tmp, N );
-
-    pDestination->x = (uint16_t)tmp.x;
-    pDestination->y = (uint16_t)tmp.y;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) );
-    R = vminq_f32(R, vdupq_n_f32(65535.0f));
-    uint32x4_t vInt32 = vcvtq_u32_f32(R);
-    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
-    vst1_lane_u32( &pDestination->v, vreinterpret_u32_u16(vInt16), 0 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Bounds check
-    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
-    vResult = _mm_min_ps(vResult,g_UShortMax);
-    // Convert to int with rounding
-    __m128i vInt = _mm_cvtps_epi32(vResult);
-    // Since the SSE pack instruction clamps using signed rules,
-    // manually extract the values to store them to memory
-    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
-    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreByteN2
-(
-    XMBYTEN2* pDestination,
FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); - N = XMVectorMultiply(N, g_ByteMax); - N = XMVectorRound(N); - - XMFLOAT4A tmp; - XMStoreFloat4A( &tmp, N ); - - pDestination->x = (int8_t)tmp.x; - pDestination->y = (int8_t)tmp.y; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f) ); - R = vminq_f32(R, vdupq_n_f32(1.0f)); - R = vmulq_n_f32( R, 127.0f ); - int32x4_t vInt32 = vcvtq_s32_f32(R); - int16x4_t vInt16 = vqmovn_s32( vInt32 ); - int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) ); - vst1_lane_u16( reinterpret_cast( pDestination ), vreinterpret_u16_s8(vInt8), 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); - vResult = _mm_min_ps(vResult,g_XMOne); - // Scale by multiplication - vResult = _mm_mul_ps(vResult,g_ByteMax); - // Convert to int by rounding - __m128i vInt = _mm_cvtps_epi32(vResult); - // No SSE operations will write to 16-bit values, so we have to extract them manually - uint16_t x = static_cast(_mm_extract_epi16(vInt,0)); - uint16_t y = static_cast(_mm_extract_epi16(vInt,2)); - pDestination->v = ((y & 0xFF) << 8) | (x & 0xFF); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreByte2 -( - XMBYTE2* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR N = XMVectorClamp(V, g_ByteMin, g_ByteMax); - N = XMVectorRound(N); - - XMFLOAT4A tmp; - XMStoreFloat4A( &tmp, N ); - - pDestination->x = (int8_t)tmp.x; - pDestination->y = (int8_t)tmp.y; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-127.f) ); - R = vminq_f32(R, vdupq_n_f32(127.0f)); - int32x4_t vInt32 = vcvtq_s32_f32(R); - int16x4_t vInt16 = vqmovn_s32( vInt32 ); - int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) ); - vst1_lane_u16( reinterpret_cast( pDestination ), vreinterpret_u16_s8(vInt8), 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,g_ByteMin); - vResult = _mm_min_ps(vResult,g_ByteMax); - // Convert to int by rounding - __m128i vInt = _mm_cvtps_epi32(vResult); - // No SSE operations will write to 16-bit values, so we have to extract them manually - uint16_t x = static_cast(_mm_extract_epi16(vInt,0)); - uint16_t y = static_cast(_mm_extract_epi16(vInt,2)); - pDestination->v = ((y & 0xFF) << 8) | (x & 0xFF); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreUByteN2 -( - XMUBYTEN2* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR N = XMVectorSaturate(V); - N = XMVectorMultiplyAdd(N, g_UByteMax, g_XMOneHalf.v); - N = XMVectorTruncate(N); - - XMFLOAT4A tmp; - XMStoreFloat4A( &tmp, N ); - - pDestination->x = (uint8_t)tmp.x; - pDestination->y = (uint8_t)tmp.y; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) ); - R = vminq_f32(R, vdupq_n_f32(1.0f)); - R = vmulq_n_f32( R, 255.0f ); - R = vaddq_f32( R, g_XMOneHalf ); - uint32x4_t vInt32 = vcvtq_u32_f32(R); - uint16x4_t vInt16 = vqmovn_u32( vInt32 ); - uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) ); - vst1_lane_u16( reinterpret_cast( pDestination ), vreinterpret_u16_u8(vInt8), 0 ); -#elif 
defined(_XM_SSE_INTRINSICS_) - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,g_XMZero); - vResult = _mm_min_ps(vResult,g_XMOne); - // Scale by multiplication - vResult = _mm_mul_ps(vResult,g_UByteMax); - vResult = _mm_add_ps(vResult,g_XMOneHalf); - // Convert to int - __m128i vInt = _mm_cvttps_epi32(vResult); - // No SSE operations will write to 16-bit values, so we have to extract them manually - uint16_t x = static_cast(_mm_extract_epi16(vInt,0)); - uint16_t y = static_cast(_mm_extract_epi16(vInt,2)); - pDestination->v = ((y & 0xFF) << 8) | (x & 0xFF); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreUByte2 -( - XMUBYTE2* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UByteMax); - N = XMVectorRound(N); - - XMFLOAT4A tmp; - XMStoreFloat4A( &tmp, N ); - - pDestination->x = (uint8_t)tmp.x; - pDestination->y = (uint8_t)tmp.y; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) ); - R = vminq_f32(R, vdupq_n_f32(255.0f)); - uint32x4_t vInt32 = vcvtq_u32_f32(R); - uint16x4_t vInt16 = vqmovn_u32( vInt32 ); - uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) ); - vst1_lane_u16( reinterpret_cast( pDestination ), vreinterpret_u16_u8(vInt8), 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,g_XMZero); - vResult = _mm_min_ps(vResult,g_UByteMax); - // Convert to int by rounding - __m128i vInt = _mm_cvtps_epi32(vResult); - // No SSE operations will write to 16-bit values, so we have to extract them manually - uint16_t x = static_cast(_mm_extract_epi16(vInt,0)); - uint16_t y = static_cast(_mm_extract_epi16(vInt,2)); - pDestination->v = ((y & 0xFF) << 8) | (x & 0xFF); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreU565 -( - XMU565* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - static const XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f}; - - XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v); - N = XMVectorRound(N); - - XMFLOAT4A tmp; - XMStoreFloat4A( &tmp, N ); - - pDestination->v = (((uint16_t)tmp.z & 0x1F) << 11) | - (((uint16_t)tmp.y & 0x3F) << 5) | - (((uint16_t)tmp.x & 0x1F)); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f}; - static const XMVECTORF32 Scale = {1.0f,32.f,32.f*64.f, 0.f }; - static const XMVECTORU32 Mask = {0x1F,0x3F<<5,0x1F<<11,0}; - float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0)); - vResult = vminq_f32(vResult,Max); - vResult = vmulq_f32(vResult,Scale); - uint32x4_t vResulti = vcvtq_u32_f32(vResult); - vResulti = vandq_u32(vResulti,Mask); - // Do a horizontal or of 4 entries - uint32x2_t vTemp = vget_low_u32(vResulti); - uint32x2_t vhi = vget_high_u32(vResulti); - vTemp = vorr_u32( vTemp, vhi ); - vTemp = vpadd_u32( vTemp, vTemp ); - vst1_lane_u16( &pDestination->v, vreinterpret_u16_u32( vTemp ), 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f}; - // Bounds check - XMVECTOR vResult = _mm_max_ps(V,g_XMZero); - vResult = _mm_min_ps(vResult,Max); - // Convert to int with rounding - __m128i vInt = _mm_cvtps_epi32(vResult); - // No SSE operations will write to 16-bit values, so we have to extract them manually 
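// After _mm_cvtps_epi32 each component occupies a full 32-bit lane, so its
// low 16 bits land in the even 16-bit lanes; that is why the extracts below
// read lanes 0, 2, and 4 rather than 0, 1, and 2.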
- uint16_t x = static_cast(_mm_extract_epi16(vInt,0)); - uint16_t y = static_cast(_mm_extract_epi16(vInt,2)); - uint16_t z = static_cast(_mm_extract_epi16(vInt,4)); - pDestination->v = ((z & 0x1F) << 11) | - ((y & 0x3F) << 5) | - ((x & 0x1F)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreFloat3PK -( - XMFLOAT3PK* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); - - __declspec(align(16)) uint32_t IValue[4]; - XMStoreFloat3A( reinterpret_cast(&IValue), V ); - - uint32_t Result[3]; - - // X & Y Channels (5-bit exponent, 6-bit mantissa) - for(uint32_t j=0; j < 2; ++j) - { - uint32_t Sign = IValue[j] & 0x80000000; - uint32_t I = IValue[j] & 0x7FFFFFFF; - - if ((I & 0x7F800000) == 0x7F800000) - { - // INF or NAN - Result[j] = 0x7c0; - if (( I & 0x7FFFFF ) != 0) - { - Result[j] = 0x7c0 | (((I>>17)|(I>>11)|(I>>6)|(I))&0x3f); - } - else if ( Sign ) - { - // -INF is clamped to 0 since 3PK is positive only - Result[j] = 0; - } - } - else if ( Sign ) - { - // 3PK is positive only, so clamp to zero - Result[j] = 0; - } - else if (I > 0x477E0000U) - { - // The number is too large to be represented as a float11, set to max - Result[j] = 0x7BF; - } - else - { - if (I < 0x38800000U) - { - // The number is too small to be represented as a normalized float11 - // Convert it to a denormalized value. - uint32_t Shift = 113U - (I >> 23U); - I = (0x800000U | (I & 0x7FFFFFU)) >> Shift; - } - else - { - // Rebias the exponent to represent the value as a normalized float11 - I += 0xC8000000U; - } - - Result[j] = ((I + 0xFFFFU + ((I >> 17U) & 1U)) >> 17U)&0x7ffU; - } - } - - // Z Channel (5-bit exponent, 5-bit mantissa) - uint32_t Sign = IValue[2] & 0x80000000; - uint32_t I = IValue[2] & 0x7FFFFFFF; - - if ((I & 0x7F800000) == 0x7F800000) - { - // INF or NAN - Result[2] = 0x3e0; - if ( I & 0x7FFFFF ) - { - Result[2] = 0x3e0 | (((I>>18)|(I>>13)|(I>>3)|(I))&0x1f); - } - else if ( Sign ) - { - // -INF is clamped to 0 since 3PK is positive only - Result[2] = 0; - } - } - else if ( Sign ) - { - // 3PK is positive only, so clamp to zero - Result[2] = 0; - } - else if (I > 0x477C0000U) - { - // The number is too large to be represented as a float10, set to max - Result[2] = 0x3df; - } - else - { - if (I < 0x38800000U) - { - // The number is too small to be represented as a normalized float10 - // Convert it to a denormalized value. - uint32_t Shift = 113U - (I >> 23U); - I = (0x800000U | (I & 0x7FFFFFU)) >> Shift; - } - else - { - // Rebias the exponent to represent the value as a normalized float10 - I += 0xC8000000U; - } - - Result[2] = ((I + 0x1FFFFU + ((I >> 18U) & 1U)) >> 18U)&0x3ffU; - } - - // Pack Result into memory - pDestination->v = (Result[0] & 0x7ff) - | ( (Result[1] & 0x7ff) << 11 ) - | ( (Result[2] & 0x3ff) << 22 ); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreFloat3SE -( - XMFLOAT3SE* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); - - XMFLOAT3A tmp; - XMStoreFloat3A( &tmp, V ); - - static const float maxf9 = float(0x1FF << 7); - static const float minf9 = float(1.f / (1 << 16)); - - float x = (tmp.x >= 0.f) ? ( (tmp.x > maxf9) ? maxf9 : tmp.x ) : 0.f; - float y = (tmp.y >= 0.f) ? ( (tmp.y > maxf9) ? maxf9 : tmp.y ) : 0.f; - float z = (tmp.z >= 0.f) ? ( (tmp.z > maxf9) ? maxf9 : tmp.z ) : 0.f; - - const float max_xy = (x > y) ? 
x : y;
-    const float max_xyz = (max_xy > z) ? max_xy : z;
-
-    const float maxColor = (max_xyz > minf9) ? max_xyz : minf9;
-
-    union { float f; int32_t i; } fi;
-    fi.f = maxColor;
-    fi.i += 0x00004000; // round up leaving 9 bits in fraction (including assumed 1)
-
-    uint32_t exp = fi.i >> 23;
-    pDestination->e = exp - 0x6f;
-
-    fi.i = 0x83000000 - (exp << 23);
-    float ScaleR = fi.f;
-
-#ifdef _XM_NO_ROUNDF_
-    pDestination->xm = static_cast<uint32_t>( Internal::round_to_nearest(x * ScaleR) );
-    pDestination->ym = static_cast<uint32_t>( Internal::round_to_nearest(y * ScaleR) );
-    pDestination->zm = static_cast<uint32_t>( Internal::round_to_nearest(z * ScaleR) );
-#else
-    pDestination->xm = static_cast<uint32_t>( lroundf(x * ScaleR) );
-    pDestination->ym = static_cast<uint32_t>( lroundf(y * ScaleR) );
-    pDestination->zm = static_cast<uint32_t>( lroundf(z * ScaleR) );
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreHalf4
-(
-    XMHALF4* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    __m128i V1 = _mm_cvtps_ph( V, 0 );
-    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), V1 );
-#else
-    XMFLOAT4A t;
-    XMStoreFloat4A(&t, V );
-
-    pDestination->x = XMConvertFloatToHalf(t.x);
-    pDestination->y = XMConvertFloatToHalf(t.y);
-    pDestination->z = XMConvertFloatToHalf(t.z);
-    pDestination->w = XMConvertFloatToHalf(t.w);
-#endif // !_XM_F16C_INTRINSICS_
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreShortN4
-(
-    XMSHORTN4* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
-    N = XMVectorMultiply(N, g_ShortMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N );
-
-    pDestination->x = (int16_t)tmp.x;
-    pDestination->y = (int16_t)tmp.y;
-    pDestination->z = (int16_t)tmp.z;
-    pDestination->w = (int16_t)tmp.w;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t vResult = vmaxq_f32( V, vdupq_n_f32(-1.f) );
-    vResult = vminq_f32( vResult, vdupq_n_f32(1.0f) );
-    vResult = vmulq_n_f32( vResult, 32767.0f );
-    int32x4_t vResulti = vcvtq_s32_f32( vResult );
-    int16x4_t vInt = vmovn_s32( vResulti );
-    vst1_s16( reinterpret_cast<int16_t*>(pDestination), vInt );
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
-    vResult = _mm_min_ps(vResult,g_XMOne);
-    vResult = _mm_mul_ps(vResult,g_ShortMax);
-    __m128i vResulti = _mm_cvtps_epi32(vResult);
-    vResulti = _mm_packs_epi32(vResulti,vResulti);
-    _mm_store_sd(reinterpret_cast<double*>(&pDestination->x),_mm_castsi128_pd(vResulti));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreShort4
-(
-    XMSHORT4* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, g_ShortMin, g_ShortMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N );
-
-    pDestination->x = (int16_t)tmp.x;
-    pDestination->y = (int16_t)tmp.y;
-    pDestination->z = (int16_t)tmp.z;
-    pDestination->w = (int16_t)tmp.w;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t vResult = vmaxq_f32( V, g_ShortMin );
-    vResult = vminq_f32( vResult, g_ShortMax );
-    int32x4_t vResulti = vcvtq_s32_f32( vResult );
-    int16x4_t vInt = vmovn_s32( vResulti );
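// vmovn_s32 narrows by truncating each 32-bit lane to its low 16 bits; that
// is safe here only because the input was clamped to [-32767, 32767] above,
// so every lane already fits in an int16_t.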
-    vst1_s16( reinterpret_cast<int16_t*>(pDestination), vInt );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Bounds check
-    XMVECTOR vResult = _mm_max_ps(V,g_ShortMin);
-    vResult = _mm_min_ps(vResult,g_ShortMax);
-    // Convert to int with rounding
-    __m128i vInt = _mm_cvtps_epi32(vResult);
-    // Pack the ints into shorts
-    vInt = _mm_packs_epi32(vInt,vInt);
-    _mm_store_sd(reinterpret_cast<double*>(&pDestination->x),_mm_castsi128_pd(vInt));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreUShortN4
-(
-    XMUSHORTN4* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorSaturate(V);
-    N = XMVectorMultiplyAdd(N, g_UShortMax, g_XMOneHalf.v);
-    N = XMVectorTruncate(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N );
-
-    pDestination->x = (uint16_t)tmp.x;
-    pDestination->y = (uint16_t)tmp.y;
-    pDestination->z = (uint16_t)tmp.z;
-    pDestination->w = (uint16_t)tmp.w;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t vResult = vmaxq_f32( V, vdupq_n_f32(0) );
-    vResult = vminq_f32( vResult, vdupq_n_f32(1.0f) );
-    vResult = vmulq_n_f32( vResult, 65535.0f );
-    vResult = vaddq_f32( vResult, g_XMOneHalf );
-    uint32x4_t vResulti = vcvtq_u32_f32( vResult );
-    uint16x4_t vInt = vmovn_u32( vResulti );
-    vst1_u16( reinterpret_cast<uint16_t*>(pDestination), vInt );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Bounds check
-    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
-    vResult = _mm_min_ps(vResult,g_XMOne);
-    vResult = _mm_mul_ps(vResult,g_UShortMax);
-    vResult = _mm_add_ps(vResult,g_XMOneHalf);
-    // Convert to int
-    __m128i vInt = _mm_cvttps_epi32(vResult);
-    // Since the SSE pack instruction clamps using signed rules,
-    // manually extract the values to store them to memory
-    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
-    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
-    pDestination->z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4));
-    pDestination->w = static_cast<uint16_t>(_mm_extract_epi16(vInt,6));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreUShort4
-(
-    XMUSHORT4* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UShortMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N );
-
-    pDestination->x = (uint16_t)tmp.x;
-    pDestination->y = (uint16_t)tmp.y;
-    pDestination->z = (uint16_t)tmp.z;
-    pDestination->w = (uint16_t)tmp.w;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t vResult = vmaxq_f32( V, vdupq_n_f32(0) );
-    vResult = vminq_f32( vResult, g_UShortMax );
-    uint32x4_t vResulti = vcvtq_u32_f32( vResult );
-    uint16x4_t vInt = vmovn_u32( vResulti );
-    vst1_u16( reinterpret_cast<uint16_t*>(pDestination), vInt );
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Bounds check
-    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
-    vResult = _mm_min_ps(vResult,g_UShortMax);
-    // Convert to int with rounding
-    __m128i vInt = _mm_cvtps_epi32(vResult);
-    // Since the SSE pack instruction clamps using signed rules,
-    // manually extract the values to store them to memory
-    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
-    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
-    pDestination->z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4));
-    pDestination->w = static_cast<uint16_t>(_mm_extract_epi16(vInt,6));
-#endif
-}
-
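// A minimal round-trip sketch for the 16-bit stores above (illustrative
// values; assumes the matching XMLoadUShort4 declared earlier in this
// header):
//
//     using namespace DirectX;
//     using namespace DirectX::PackedVector;
//
//     XMUSHORT4 packed;
//     XMStoreUShort4(&packed, XMVectorSet(12.f, 345.f, 6789.f, 65535.f));
//     XMVECTOR v = XMLoadUShort4(&packed);  // (12, 345, 6789, 65535)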
-//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreXDecN4 -( - XMXDECN4* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f}; - static const XMVECTORF32 Scale = {511.0f, 511.0f, 511.0f, 3.0f}; - - XMVECTOR N = XMVectorClamp(V, Min.v, g_XMOne.v); - N = XMVectorMultiply(N, Scale.v); - N = XMVectorRound(N); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->v = ((uint32_t)tmp.w << 30) | - (((int32_t)tmp.z & 0x3FF) << 20) | - (((int32_t)tmp.y & 0x3FF) << 10) | - (((int32_t)tmp.x & 0x3FF)); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f}; - static const XMVECTORF32 Scale = {511.0f, 511.0f*1024.0f, 511.0f*1048576.0f,3.0f*536870912.0f}; - static const XMVECTORI32 ScaleMask = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<29}; - float32x4_t vResult = vmaxq_f32(V,Min); - vResult = vminq_f32(vResult,vdupq_n_f32(1.0f)); - vResult = vmulq_f32(vResult,Scale); - int32x4_t vResulti = vcvtq_s32_f32(vResult); - vResulti = vandq_s32(vResulti,ScaleMask); - int32x4_t vResultw = vandq_s32(vResulti,g_XMMaskW); - vResulti = vaddq_s32(vResulti,vResultw); - // Do a horizontal or of all 4 entries - uint32x2_t vTemp = vget_low_u32(vreinterpret_u32_s32(vResulti)); - uint32x2_t vhi = vget_high_u32(vreinterpret_u32_s32(vResulti)); - vTemp = vorr_u32( vTemp, vhi ); - vTemp = vpadd_u32( vTemp, vTemp ); - vst1_lane_u32( &pDestination->v, vTemp, 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f}; - static const XMVECTORF32 Scale = {511.0f, 511.0f*1024.0f, 511.0f*1048576.0f,3.0f*536870912.0f}; - static const XMVECTORI32 ScaleMask = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<29}; - XMVECTOR vResult = _mm_max_ps(V,Min); - vResult = _mm_min_ps(vResult,g_XMOne); - // Scale by multiplication - vResult = _mm_mul_ps(vResult,Scale); - // Convert to int (W is unsigned) - __m128i vResulti = _mm_cvtps_epi32(vResult); - // Mask off any fraction - vResulti = _mm_and_si128(vResulti,ScaleMask); - // To fix W, add itself to shift it up to <<30 instead of <<29 - __m128i vResultw = _mm_and_si128(vResulti,g_XMMaskW); - vResulti = _mm_add_epi32(vResulti,vResultw); - // Do a horizontal or of all 4 entries - vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vResulti),_MM_SHUFFLE(0,3,2,1)); - vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult)); - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,3,2,1)); - vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult)); - vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,3,2,1)); - vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult)); - _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); -#endif -} - -//------------------------------------------------------------------------------ -#pragma warning(push) -#pragma warning(disable : 4996) -// C4996: ignore deprecation warning - -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreXDec4 -( - XMXDEC4* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - static const XMVECTORF32 Min = {-511.0f, -511.0f, -511.0f, 0.0f}; - static const XMVECTORF32 Max = {511.0f, 511.0f, 511.0f, 3.0f}; - - XMVECTOR N = XMVectorClamp(V, Min, Max); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->v = ((uint32_t)tmp.w << 30) | - (((int32_t)tmp.z & 0x3FF) << 20) | - (((int32_t)tmp.y & 0x3FF) 
<< 10) | - (((int32_t)tmp.x & 0x3FF)); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 MinXDec4 = {-511.0f,-511.0f,-511.0f, 0.0f}; - static const XMVECTORF32 MaxXDec4 = { 511.0f, 511.0f, 511.0f, 3.0f}; - static const XMVECTORF32 ScaleXDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f}; - static const XMVECTORI32 MaskXDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; - float32x4_t vResult = vmaxq_f32(V,MinXDec4); - vResult = vminq_f32(vResult,MaxXDec4); - vResult = vmulq_f32(vResult,ScaleXDec4); - int32x4_t vResulti = vcvtq_s32_f32(vResult); - vResulti = vandq_s32(vResulti,MaskXDec4); - // Do a horizontal or of 4 entries - uint32x2_t vTemp = vget_low_u32(vreinterpret_u32_s32(vResulti)); - uint32x2_t vTemp2 = vget_high_u32(vreinterpret_u32_s32(vResulti)); - vTemp = vorr_u32( vTemp, vTemp2 ); - // Perform a single bit left shift on y|w - vTemp2 = vdup_lane_u32( vTemp, 1 ); - vTemp2 = vadd_s32( vTemp2, vTemp2 ); - vTemp = vorr_u32( vTemp, vTemp2 ); - vst1_lane_u32( &pDestination->v, vTemp, 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 MinXDec4 = {-511.0f,-511.0f,-511.0f, 0.0f}; - static const XMVECTORF32 MaxXDec4 = { 511.0f, 511.0f, 511.0f, 3.0f}; - static const XMVECTORF32 ScaleXDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f}; - static const XMVECTORI32 MaskXDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,MinXDec4); - vResult = _mm_min_ps(vResult,MaxXDec4); - // Scale by multiplication - vResult = _mm_mul_ps(vResult,ScaleXDec4); - // Convert to int - __m128i vResulti = _mm_cvttps_epi32(vResult); - // Mask off any fraction - vResulti = _mm_and_si128(vResulti,MaskXDec4); - // Do a horizontal or of 4 entries - __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); - // x = x|z, y = y|w - vResulti = _mm_or_si128(vResulti,vResulti2); - // Move Z to the x position - vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); - // Perform a single bit left shift on y|w - vResulti2 = _mm_add_epi32(vResulti2,vResulti2); - // i = x|y|z|w - vResulti = _mm_or_si128(vResulti,vResulti2); - _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); -#endif -} - -#pragma warning(pop) - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreUDecN4 -( - XMUDECN4* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - static const XMVECTORF32 Scale = {1023.0f, 1023.0f, 1023.0f, 3.0f}; - - XMVECTOR N = XMVectorSaturate(V); - N = XMVectorMultiply(N, Scale.v); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->v = ((uint32_t)tmp.w << 30) | - (((uint32_t)tmp.z & 0x3FF) << 20) | - (((uint32_t)tmp.y & 0x3FF) << 10) | - (((uint32_t)tmp.x & 0x3FF)); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 ScaleUDecN4 = {1023.0f,1023.0f*1024.0f*0.5f,1023.0f*1024.0f*1024.0f,3.0f*1024.0f*1024.0f*1024.0f*0.5f}; - static const XMVECTORI32 MaskUDecN4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; - float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0.f)); - vResult = vminq_f32(vResult,vdupq_n_f32(1.f)); - vResult = vmulq_f32(vResult,ScaleUDecN4); - uint32x4_t vResulti = vcvtq_u32_f32(vResult); - vResulti = vandq_u32(vResulti,MaskUDecN4); - // Do a horizontal or of 4 entries - uint32x2_t vTemp = vget_low_u32(vResulti); - uint32x2_t vTemp2 = vget_high_u32(vResulti); - vTemp = vorr_u32( vTemp, 
vTemp2 ); - // Perform a single bit left shift on y|w - vTemp2 = vdup_lane_u32( vTemp, 1 ); - vTemp2 = vadd_u32( vTemp2, vTemp2 ); - vTemp = vorr_u32( vTemp, vTemp2 ); - vst1_lane_u32( &pDestination->v, vTemp, 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 ScaleUDecN4 = {1023.0f,1023.0f*1024.0f*0.5f,1023.0f*1024.0f*1024.0f,3.0f*1024.0f*1024.0f*1024.0f*0.5f}; - static const XMVECTORI32 MaskUDecN4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,g_XMZero); - vResult = _mm_min_ps(vResult,g_XMOne); - // Scale by multiplication - vResult = _mm_mul_ps(vResult,ScaleUDecN4); - // Convert to int - __m128i vResulti = _mm_cvttps_epi32(vResult); - // Mask off any fraction - vResulti = _mm_and_si128(vResulti,MaskUDecN4); - // Do a horizontal or of 4 entries - __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); - // x = x|z, y = y|w - vResulti = _mm_or_si128(vResulti,vResulti2); - // Move Z to the x position - vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); - // Perform a left shift by one bit on y|w - vResulti2 = _mm_add_epi32(vResulti2,vResulti2); - // i = x|y|z|w - vResulti = _mm_or_si128(vResulti,vResulti2); - _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreUDecN4_XR -( - XMUDECN4* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - static const XMVECTORF32 Scale = { 510.0f, 510.0f, 510.0f, 3.0f }; - static const XMVECTORF32 Bias = { 384.0f, 384.0f, 384.0f, 0.0f }; - static const XMVECTORF32 C = { 1023.f, 1023.f, 1023.f, 3.f }; - - XMVECTOR N = XMVectorMultiplyAdd( V, Scale, Bias ); - N = XMVectorClamp( N, g_XMZero, C ); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->v = ((uint32_t)tmp.w << 30) - | (((uint32_t)tmp.z & 0x3FF) << 20) - | (((uint32_t)tmp.y & 0x3FF) << 10) - | (((uint32_t)tmp.x & 0x3FF)); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Shift = {1.0f,1024.0f*0.5f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f*0.5f}; - static const XMVECTORU32 MaskUDecN4 = {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; - static const XMVECTORF32 Scale = { 510.0f, 510.0f, 510.0f, 3.0f }; - static const XMVECTORF32 Bias = { 384.0f, 384.0f, 384.0f, 0.0f }; - static const XMVECTORF32 C = { 1023.f, 1023.f, 1023.f, 3.f }; - float32x4_t vResult = vmlaq_f32( Bias, V, Scale ); - vResult = vmaxq_f32(vResult,vdupq_n_f32(0.f)); - vResult = vminq_f32(vResult,C); - vResult = vmulq_f32(vResult,Shift); - uint32x4_t vResulti = vcvtq_u32_f32(vResult); - vResulti = vandq_u32(vResulti,MaskUDecN4); - // Do a horizontal or of 4 entries - uint32x2_t vTemp = vget_low_u32(vResulti); - uint32x2_t vTemp2 = vget_high_u32(vResulti); - vTemp = vorr_u32( vTemp, vTemp2 ); - // Perform a single bit left shift on y|w - vTemp2 = vdup_lane_u32( vTemp, 1 ); - vTemp2 = vadd_u32( vTemp2, vTemp2 ); - vTemp = vorr_u32( vTemp, vTemp2 ); - vst1_lane_u32( &pDestination->v, vTemp, 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Shift = {1.0f,1024.0f*0.5f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f*0.5f}; - static const XMVECTORU32 MaskUDecN4 = {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; - static const XMVECTORF32 Scale = { 510.0f, 510.0f, 510.0f, 3.0f }; - static const XMVECTORF32 Bias = { 384.0f, 384.0f, 384.0f, 0.0f }; - static const XMVECTORF32 C = { 1023.f, 
1023.f, 1023.f, 3.f }; - // Scale & bias - XMVECTOR vResult = _mm_mul_ps( V, Scale ); - vResult = _mm_add_ps( vResult, Bias ); - // Clamp to bounds - vResult = _mm_max_ps(vResult,g_XMZero); - vResult = _mm_min_ps(vResult,C); - // Scale by shift values - vResult = _mm_mul_ps(vResult,Shift); - // Convert to int - __m128i vResulti = _mm_cvttps_epi32(vResult); - // Mask off any fraction - vResulti = _mm_and_si128(vResulti,MaskUDecN4); - // Do a horizontal or of 4 entries - __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); - // x = x|z, y = y|w - vResulti = _mm_or_si128(vResulti,vResulti2); - // Move Z to the x position - vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); - // Perform a left shift by one bit on y|w - vResulti2 = _mm_add_epi32(vResulti2,vResulti2); - // i = x|y|z|w - vResulti = _mm_or_si128(vResulti,vResulti2); - _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreUDec4 -( - XMUDEC4* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - static const XMVECTORF32 Max = {1023.0f, 1023.0f, 1023.0f, 3.0f}; - - XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->v = ((uint32_t)tmp.w << 30) | - (((uint32_t)tmp.z & 0x3FF) << 20) | - (((uint32_t)tmp.y & 0x3FF) << 10) | - (((uint32_t)tmp.x & 0x3FF)); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 MaxUDec4 = { 1023.0f, 1023.0f, 1023.0f, 3.0f}; - static const XMVECTORF32 ScaleUDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f}; - static const XMVECTORI32 MaskUDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; - float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0.f)); - vResult = vminq_f32(vResult,MaxUDec4); - vResult = vmulq_f32(vResult,ScaleUDec4); - uint32x4_t vResulti = vcvtq_u32_f32(vResult); - vResulti = vandq_u32(vResulti,MaskUDec4); - // Do a horizontal or of 4 entries - uint32x2_t vTemp = vget_low_u32(vResulti); - uint32x2_t vTemp2 = vget_high_u32(vResulti); - vTemp = vorr_u32( vTemp, vTemp2 ); - // Perform a single bit left shift on y|w - vTemp2 = vdup_lane_u32( vTemp, 1 ); - vTemp2 = vadd_u32( vTemp2, vTemp2 ); - vTemp = vorr_u32( vTemp, vTemp2 ); - vst1_lane_u32( &pDestination->v, vTemp, 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 MaxUDec4 = { 1023.0f, 1023.0f, 1023.0f, 3.0f}; - static const XMVECTORF32 ScaleUDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f}; - static const XMVECTORI32 MaskUDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,g_XMZero); - vResult = _mm_min_ps(vResult,MaxUDec4); - // Scale by multiplication - vResult = _mm_mul_ps(vResult,ScaleUDec4); - // Convert to int - __m128i vResulti = _mm_cvttps_epi32(vResult); - // Mask off any fraction - vResulti = _mm_and_si128(vResulti,MaskUDec4); - // Do a horizontal or of 4 entries - __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); - // x = x|z, y = y|w - vResulti = _mm_or_si128(vResulti,vResulti2); - // Move Z to the x position - vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); - // Perform a left shift by one bit on y|w - vResulti2 = _mm_add_epi32(vResulti2,vResulti2); - // i = x|y|z|w - vResulti = _mm_or_si128(vResulti,vResulti2); - 
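// Note that y and w were scaled and masked one bit low on purpose (see Shift
// and MaskUDecN4 above); the _mm_add_epi32 doubles that lane to move them
// into their final bit positions before the OR collects x|y|z|w in lane 0.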
_mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); -#endif -} - -//------------------------------------------------------------------------------ -#pragma warning(push) -#pragma warning(disable : 4996) -// C4996: ignore deprecation warning - -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreDecN4 -( - XMDECN4* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - static const XMVECTORF32 Scale = {511.0f, 511.0f, 511.0f, 1.0f}; - - XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); - N = XMVectorMultiply(N, Scale.v); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->v = ((int32_t)tmp.w << 30) | - (((int32_t)tmp.z & 0x3FF) << 20) | - (((int32_t)tmp.y & 0x3FF) << 10) | - (((int32_t)tmp.x & 0x3FF)); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 ScaleDecN4 = {511.0f,511.0f*1024.0f,511.0f*1024.0f*1024.0f,1.0f*1024.0f*1024.0f*1024.0f}; - float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(-1.f)); - vResult = vminq_f32(vResult,vdupq_n_f32(1.f)); - vResult = vmulq_f32(vResult,ScaleDecN4); - int32x4_t vResulti = vcvtq_s32_f32(vResult); - vResulti = vandq_s32(vResulti,g_XMMaskDec4); - // Do a horizontal or of 4 entries - uint32x2_t vTemp = vget_low_u32(vreinterpret_u32_s32(vResulti)); - uint32x2_t vhi = vget_high_u32(vreinterpret_u32_s32(vResulti)); - vTemp = vorr_u32( vTemp, vhi ); - vTemp = vpadd_u32( vTemp, vTemp ); - vst1_lane_u32( &pDestination->v, vTemp, 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 ScaleDecN4 = {511.0f,511.0f*1024.0f,511.0f*1024.0f*1024.0f,1.0f*1024.0f*1024.0f*1024.0f}; - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); - vResult = _mm_min_ps(vResult,g_XMOne); - // Scale by multiplication - vResult = _mm_mul_ps(vResult,ScaleDecN4); - // Convert to int - __m128i vResulti = _mm_cvttps_epi32(vResult); - // Mask off any fraction - vResulti = _mm_and_si128(vResulti,g_XMMaskDec4); - // Do a horizontal or of 4 entries - __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); - // x = x|z, y = y|w - vResulti = _mm_or_si128(vResulti,vResulti2); - // Move Z to the x position - vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); - // i = x|y|z|w - vResulti = _mm_or_si128(vResulti,vResulti2); - _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreDec4 -( - XMDEC4* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - static const XMVECTORF32 Min = {-511.0f, -511.0f, -511.0f, -1.0f}; - static const XMVECTORF32 Max = {511.0f, 511.0f, 511.0f, 1.0f}; - - XMVECTOR N = XMVectorClamp(V, Min, Max); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->v = ((int32_t)tmp.w << 30) | - (((int32_t)tmp.z & 0x3FF) << 20) | - (((int32_t)tmp.y & 0x3FF) << 10) | - (((int32_t)tmp.x & 0x3FF)); - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 MinDec4 = {-511.0f,-511.0f,-511.0f,-1.0f}; - static const XMVECTORF32 MaxDec4 = { 511.0f, 511.0f, 511.0f, 1.0f}; - static const XMVECTORF32 ScaleDec4 = {1.0f,1024.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f}; - float32x4_t vResult = vmaxq_f32(V,MinDec4); - vResult = vminq_f32(vResult,MaxDec4); - vResult = vmulq_f32(vResult,ScaleDec4); - int32x4_t vResulti = vcvtq_s32_f32(vResult); - vResulti = 
vandq_s32(vResulti,g_XMMaskDec4); - // Do a horizontal or of all 4 entries - uint32x2_t vTemp = vget_low_u32(vreinterpret_u32_s32(vResulti)); - uint32x2_t vhi = vget_high_u32(vreinterpret_u32_s32(vResulti)); - vTemp = vorr_u32( vTemp, vhi ); - vTemp = vpadd_u32( vTemp, vTemp ); - vst1_lane_u32( &pDestination->v, vTemp, 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 MinDec4 = {-511.0f,-511.0f,-511.0f,-1.0f}; - static const XMVECTORF32 MaxDec4 = { 511.0f, 511.0f, 511.0f, 1.0f}; - static const XMVECTORF32 ScaleDec4 = {1.0f,1024.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f}; - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,MinDec4); - vResult = _mm_min_ps(vResult,MaxDec4); - // Scale by multiplication - vResult = _mm_mul_ps(vResult,ScaleDec4); - // Convert to int - __m128i vResulti = _mm_cvttps_epi32(vResult); - // Mask off any fraction - vResulti = _mm_and_si128(vResulti,g_XMMaskDec4); - // Do a horizontal or of 4 entries - __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); - // x = x|z, y = y|w - vResulti = _mm_or_si128(vResulti,vResulti2); - // Move Z to the x position - vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); - // i = x|y|z|w - vResulti = _mm_or_si128(vResulti,vResulti2); - _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); -#endif -} - -#pragma warning(pop) - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreUByteN4 -( - XMUBYTEN4* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR N = XMVectorSaturate(V); - N = XMVectorMultiply(N, g_UByteMax); - N = XMVectorTruncate(N); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->x = (uint8_t)tmp.x; - pDestination->y = (uint8_t)tmp.y; - pDestination->z = (uint8_t)tmp.z; - pDestination->w = (uint8_t)tmp.w; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0) ); - R = vminq_f32(R, vdupq_n_f32(1.0f)); - R = vmulq_n_f32( R, 255.0f ); - uint32x4_t vInt32 = vcvtq_u32_f32(R); - uint16x4_t vInt16 = vqmovn_u32( vInt32 ); - uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) ); - vst1_lane_u32( &pDestination->v, vreinterpret_u32_u8(vInt8), 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 ScaleUByteN4 = {255.0f,255.0f*256.0f*0.5f,255.0f*256.0f*256.0f,255.0f*256.0f*256.0f*256.0f*0.5f}; - static const XMVECTORI32 MaskUByteN4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)}; - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,g_XMZero); - vResult = _mm_min_ps(vResult,g_XMOne); - // Scale by multiplication - vResult = _mm_mul_ps(vResult,ScaleUByteN4); - // Convert to int - __m128i vResulti = _mm_cvttps_epi32(vResult); - // Mask off any fraction - vResulti = _mm_and_si128(vResulti,MaskUByteN4); - // Do a horizontal or of 4 entries - __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); - // x = x|z, y = y|w - vResulti = _mm_or_si128(vResulti,vResulti2); - // Move Z to the x position - vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); - // Perform a single bit left shift to fix y|w - vResulti2 = _mm_add_epi32(vResulti2,vResulti2); - // i = x|y|z|w - vResulti = _mm_or_si128(vResulti,vResulti2); - _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV 
PackedVector::XMStoreUByte4
-(
-    XMUBYTE4* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UByteMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N );
-
-    pDestination->x = (uint8_t)tmp.x;
-    pDestination->y = (uint8_t)tmp.y;
-    pDestination->z = (uint8_t)tmp.z;
-    pDestination->w = (uint8_t)tmp.w;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0) );
-    R = vminq_f32(R, vdupq_n_f32(255.0f));
-    uint32x4_t vInt32 = vcvtq_u32_f32(R);
-    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
-    uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) );
-    vst1_lane_u32( &pDestination->v, vreinterpret_u32_u8(vInt8), 0 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 ScaleUByte4 = {1.0f,256.0f*0.5f,256.0f*256.0f,256.0f*256.0f*256.0f*0.5f};
-    static const XMVECTORI32 MaskUByte4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)};
-    // Clamp to bounds
-    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
-    vResult = _mm_min_ps(vResult,g_UByteMax);
-    // Scale by multiplication
-    vResult = _mm_mul_ps(vResult,ScaleUByte4);
-    // Convert to int by rounding
-    __m128i vResulti = _mm_cvtps_epi32(vResult);
-    // Mask off any fraction
-    vResulti = _mm_and_si128(vResulti,MaskUByte4);
-    // Do a horizontal or of 4 entries
-    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
-    // x = x|z, y = y|w
-    vResulti = _mm_or_si128(vResulti,vResulti2);
-    // Move Z to the x position
-    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
-    // Perform a single bit left shift to fix y|w
-    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
-    // i = x|y|z|w
-    vResulti = _mm_or_si128(vResulti,vResulti2);
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),_mm_castsi128_ps(vResulti));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_
-inline void XM_CALLCONV PackedVector::XMStoreByteN4
-(
-    XMBYTEN4* pDestination,
-    FXMVECTOR V
-)
-{
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
-    N = XMVectorMultiply(N, g_ByteMax);
-    N = XMVectorTruncate(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N );
-
-    pDestination->x = (int8_t)tmp.x;
-    pDestination->y = (int8_t)tmp.y;
-    pDestination->z = (int8_t)tmp.z;
-    pDestination->w = (int8_t)tmp.w;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f) );
-    R = vminq_f32(R, vdupq_n_f32(1.0f));
-    R = vmulq_n_f32( R, 127.0f );
-    int32x4_t vInt32 = vcvtq_s32_f32(R);
-    int16x4_t vInt16 = vqmovn_s32( vInt32 );
-    int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) );
-    vst1_lane_u32( &pDestination->v, vreinterpret_u32_s8(vInt8), 0 );
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 ScaleByteN4 = {127.0f,127.0f*256.0f,127.0f*256.0f*256.0f,127.0f*256.0f*256.0f*256.0f};
-    static const XMVECTORI32 MaskByteN4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24};
-    // Clamp to bounds
-    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
-    vResult = _mm_min_ps(vResult,g_XMOne);
-    // Scale by multiplication
-    vResult = _mm_mul_ps(vResult,ScaleByteN4);
-    // Convert to int
-    __m128i vResulti = _mm_cvttps_epi32(vResult);
-    // Mask off any fraction
-    vResulti = _mm_and_si128(vResulti,MaskByteN4);
-    // Do a horizontal or of 4 entries
-    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
-    // x = x|z, y = y|w
-    vResulti = _mm_or_si128(vResulti,vResulti2);
-    // Move Z
to the x position - vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); - // i = x|y|z|w - vResulti = _mm_or_si128(vResulti,vResulti2); - _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreByte4 -( - XMBYTE4* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - - XMVECTOR N = XMVectorClamp(V, g_ByteMin, g_ByteMax); - N = XMVectorRound(N); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->x = (int8_t)tmp.x; - pDestination->y = (int8_t)tmp.y; - pDestination->z = (int8_t)tmp.z; - pDestination->w = (int8_t)tmp.w; - -#elif defined(_XM_ARM_NEON_INTRINSICS_) - float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-127.f) ); - R = vminq_f32(R, vdupq_n_f32(127.f)); - int32x4_t vInt32 = vcvtq_s32_f32(R); - int16x4_t vInt16 = vqmovn_s32( vInt32 ); - int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) ); - vst1_lane_u32( &pDestination->v, vreinterpret_u32_s8(vInt8), 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 ScaleByte4 = {1.0f,256.0f,256.0f*256.0f,256.0f*256.0f*256.0f}; - static const XMVECTORI32 MaskByte4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24}; - // Clamp to bounds - XMVECTOR vResult = _mm_max_ps(V,g_ByteMin); - vResult = _mm_min_ps(vResult,g_ByteMax); - // Scale by multiplication - vResult = _mm_mul_ps(vResult,ScaleByte4); - // Convert to int by rounding - __m128i vResulti = _mm_cvtps_epi32(vResult); - // Mask off any fraction - vResulti = _mm_and_si128(vResulti,MaskByte4); - // Do a horizontal or of 4 entries - __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); - // x = x|z, y = y|w - vResulti = _mm_or_si128(vResulti,vResulti2); - // Move Z to the x position - vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); - // i = x|y|z|w - vResulti = _mm_or_si128(vResulti,vResulti2); - _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreUNibble4 -( - XMUNIBBLE4* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - static const XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f}; - - XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v); - N = XMVectorRound(N); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->v = (((uint16_t)tmp.w & 0xF) << 12) | - (((uint16_t)tmp.z & 0xF) << 8) | - (((uint16_t)tmp.y & 0xF) << 4) | - (((uint16_t)tmp.x & 0xF)); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f}; - static const XMVECTORF32 Scale = {1.0f,16.f,16.f*16.f,16.f*16.f*16.f}; - static const XMVECTORU32 Mask = {0xF,0xF<<4,0xF<<8,0xF<<12}; - float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0)); - vResult = vminq_f32(vResult,Max); - vResult = vmulq_f32(vResult,Scale); - uint32x4_t vResulti = vcvtq_u32_f32(vResult); - vResulti = vandq_u32(vResulti,Mask); - // Do a horizontal or of 4 entries - uint32x2_t vTemp = vget_low_u32(vResulti); - uint32x2_t vhi = vget_high_u32(vResulti); - vTemp = vorr_u32( vTemp, vhi ); - vTemp = vpadd_u32( vTemp, vTemp ); - vst1_lane_u16( &pDestination->v, vreinterpret_u16_u32( vTemp ), 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f}; - // Bounds check - XMVECTOR vResult = 
_mm_max_ps(V,g_XMZero); - vResult = _mm_min_ps(vResult,Max); - // Convert to int with rounding - __m128i vInt = _mm_cvtps_epi32(vResult); - // No SSE operations will write to 16-bit values, so we have to extract them manually - uint16_t x = static_cast(_mm_extract_epi16(vInt,0)); - uint16_t y = static_cast(_mm_extract_epi16(vInt,2)); - uint16_t z = static_cast(_mm_extract_epi16(vInt,4)); - uint16_t w = static_cast(_mm_extract_epi16(vInt,6)); - pDestination->v = ((w & 0xF) << 12) | - ((z & 0xF) << 8) | - ((y & 0xF) << 4) | - ((x & 0xF)); -#endif -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline void XM_CALLCONV PackedVector::XMStoreU555 -( - XMU555* pDestination, - FXMVECTOR V -) -{ - assert(pDestination); -#if defined(_XM_NO_INTRINSICS_) - static const XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f}; - - XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v); - N = XMVectorRound(N); - - XMFLOAT4A tmp; - XMStoreFloat4A(&tmp, N ); - - pDestination->v = ((tmp.w > 0.f) ? 0x8000 : 0) | - (((uint16_t)tmp.z & 0x1F) << 10) | - (((uint16_t)tmp.y & 0x1F) << 5) | - (((uint16_t)tmp.x & 0x1F)); -#elif defined(_XM_ARM_NEON_INTRINSICS_) - static const XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f}; - static const XMVECTORF32 Scale = {1.0f,32.f/2.f,32.f*32.f,32.f*32.f*32.f/2.f}; - static const XMVECTORU32 Mask = {0x1F,0x1F<<(5-1),0x1F<<10,0x1<<(15-1)}; - float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0)); - vResult = vminq_f32(vResult,Max); - vResult = vmulq_f32(vResult,Scale); - uint32x4_t vResulti = vcvtq_u32_f32(vResult); - vResulti = vandq_u32(vResulti,Mask); - // Do a horizontal or of 4 entries - uint32x2_t vTemp = vget_low_u32(vResulti); - uint32x2_t vTemp2 = vget_high_u32(vResulti); - vTemp = vorr_u32( vTemp, vTemp2 ); - // Perform a single bit left shift on y|w - vTemp2 = vdup_lane_u32( vTemp, 1 ); - vTemp2 = vadd_s32( vTemp2, vTemp2 ); - vTemp = vorr_u32( vTemp, vTemp2 ); - vst1_lane_u16( &pDestination->v, vreinterpret_u16_u32( vTemp ), 0 ); -#elif defined(_XM_SSE_INTRINSICS_) - static const XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f}; - // Bounds check - XMVECTOR vResult = _mm_max_ps(V,g_XMZero); - vResult = _mm_min_ps(vResult,Max); - // Convert to int with rounding - __m128i vInt = _mm_cvtps_epi32(vResult); - // No SSE operations will write to 16-bit values, so we have to extract them manually - uint16_t x = static_cast(_mm_extract_epi16(vInt,0)); - uint16_t y = static_cast(_mm_extract_epi16(vInt,2)); - uint16_t z = static_cast(_mm_extract_epi16(vInt,4)); - uint16_t w = static_cast(_mm_extract_epi16(vInt,6)); - pDestination->v = ((w) ? 
0x8000 : 0) | - ((z & 0x1F) << 10) | - ((y & 0x1F) << 5) | - ((x & 0x1F)); -#endif -} - - -/**************************************************************************** - * - * XMCOLOR operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMCOLOR::XMCOLOR -( - float _r, - float _g, - float _b, - float _a -) -{ - XMStoreColor(this, XMVectorSet(_r, _g, _b, _a)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMCOLOR::XMCOLOR -( - const float* pArray -) -{ - XMStoreColor(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMHALF2 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMHALF2::XMHALF2 -( - float _x, - float _y -) -{ - x = XMConvertFloatToHalf(_x); - y = XMConvertFloatToHalf(_y); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMHALF2::XMHALF2 -( - const float* pArray -) -{ - assert( pArray != nullptr ); - x = XMConvertFloatToHalf(pArray[0]); - y = XMConvertFloatToHalf(pArray[1]); -} - -/**************************************************************************** - * - * XMSHORTN2 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMSHORTN2::XMSHORTN2 -( - float _x, - float _y -) -{ - XMStoreShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMSHORTN2::XMSHORTN2 -( - const float* pArray -) -{ - XMStoreShortN2(this, XMLoadFloat2(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMSHORT2 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMSHORT2::XMSHORT2 -( - float _x, - float _y -) -{ - XMStoreShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMSHORT2::XMSHORT2 -( - const float* pArray -) -{ - XMStoreShort2(this, XMLoadFloat2(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMUSHORTN2 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMUSHORTN2::XMUSHORTN2 -( - float _x, - float _y -) -{ - XMStoreUShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMUSHORTN2::XMUSHORTN2 -( - const float* pArray -) -{ - XMStoreUShortN2(this, XMLoadFloat2(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMUSHORT2 
operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMUSHORT2::XMUSHORT2 -( - float _x, - float _y -) -{ - XMStoreUShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMUSHORT2::XMUSHORT2 -( - const float* pArray -) -{ - XMStoreUShort2(this, XMLoadFloat2(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMBYTEN2 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMBYTEN2::XMBYTEN2 -( - float _x, - float _y -) -{ - XMStoreByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMBYTEN2::XMBYTEN2 -( - const float* pArray -) -{ - XMStoreByteN2(this, XMLoadFloat2(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMBYTE2 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMBYTE2::XMBYTE2 -( - float _x, - float _y -) -{ - XMStoreByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMBYTE2::XMBYTE2 -( - const float* pArray -) -{ - XMStoreByte2(this, XMLoadFloat2(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMUBYTEN2 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMUBYTEN2::XMUBYTEN2 -( - float _x, - float _y -) -{ - XMStoreUByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMUBYTEN2::XMUBYTEN2 -( - const float* pArray -) -{ - XMStoreUByteN2(this, XMLoadFloat2(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMUBYTE2 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMUBYTE2::XMUBYTE2 -( - float _x, - float _y -) -{ - XMStoreUByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMUBYTE2::XMUBYTE2 -( - const float* pArray -) -{ - XMStoreUByte2(this, XMLoadFloat2(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMU565 operators - * - ****************************************************************************/ - -inline PackedVector::XMU565::XMU565 -( - float _x, - float _y, - float _z -) -{ - XMStoreU565(this, XMVectorSet( _x, _y, _z, 0.0f )); -} - -_Use_decl_annotations_ -inline 
PackedVector::XMU565::XMU565 -( - const float *pArray -) -{ - XMStoreU565(this, XMLoadFloat3(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMFLOAT3PK operators - * - ****************************************************************************/ - -inline PackedVector::XMFLOAT3PK::XMFLOAT3PK -( - float _x, - float _y, - float _z -) -{ - XMStoreFloat3PK(this, XMVectorSet( _x, _y, _z, 0.0f )); -} - -_Use_decl_annotations_ -inline PackedVector::XMFLOAT3PK::XMFLOAT3PK -( - const float *pArray -) -{ - XMStoreFloat3PK(this, XMLoadFloat3(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMFLOAT3SE operators - * - ****************************************************************************/ - -inline PackedVector::XMFLOAT3SE::XMFLOAT3SE -( - float _x, - float _y, - float _z -) -{ - XMStoreFloat3SE(this, XMVectorSet( _x, _y, _z, 0.0f )); -} - -_Use_decl_annotations_ -inline PackedVector::XMFLOAT3SE::XMFLOAT3SE -( - const float *pArray -) -{ - XMStoreFloat3SE(this, XMLoadFloat3(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMHALF4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMHALF4::XMHALF4 -( - float _x, - float _y, - float _z, - float _w -) -{ - x = XMConvertFloatToHalf(_x); - y = XMConvertFloatToHalf(_y); - z = XMConvertFloatToHalf(_z); - w = XMConvertFloatToHalf(_w); -} - -//------------------------------------------------------------------------------ - -_Use_decl_annotations_ -inline PackedVector::XMHALF4::XMHALF4 -( - const float* pArray -) -{ - XMConvertFloatToHalfStream(&x, sizeof(HALF), pArray, sizeof(float), 4); -} - -/**************************************************************************** - * - * XMSHORTN4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMSHORTN4::XMSHORTN4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreShortN4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMSHORTN4::XMSHORTN4 -( - const float* pArray -) -{ - XMStoreShortN4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMSHORT4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMSHORT4::XMSHORT4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreShort4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMSHORT4::XMSHORT4 -( - const float* pArray -) -{ - XMStoreShort4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMUSHORTN4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - 
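These packed-type constructors simply forward to the corresponding XMStore* routine, so construction applies the same clamp, scale, and round as the store path. A minimal round-trip sketch (an illustration, not part of the source; assumes DirectXPackedVector.h is included, and the reloaded values shown are approximate):

    using namespace DirectX;
    using namespace DirectX::PackedVector;

    XMUSHORTN4 packed(0.5f, 0.25f, 1.0f, 0.0f); // each component stored as round(v * 65535)
    XMVECTOR v = XMLoadUShortN4(&packed);       // x reloads as 32768/65535 ~= 0.500008f
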
-inline PackedVector::XMUSHORTN4::XMUSHORTN4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreUShortN4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMUSHORTN4::XMUSHORTN4 -( - const float* pArray -) -{ - XMStoreUShortN4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMUSHORT4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMUSHORT4::XMUSHORT4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreUShort4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMUSHORT4::XMUSHORT4 -( - const float* pArray -) -{ - XMStoreUShort4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMXDECN4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMXDECN4::XMXDECN4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreXDecN4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMXDECN4::XMXDECN4 -( - const float* pArray -) -{ - XMStoreXDecN4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMXDEC4 operators - * - ****************************************************************************/ - -#pragma warning(push) -#pragma warning(disable : 4996) -// C4996: ignore deprecation warning - -//------------------------------------------------------------------------------ - -inline PackedVector::XMXDEC4::XMXDEC4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreXDec4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMXDEC4::XMXDEC4 -( - const float* pArray -) -{ - XMStoreXDec4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMDECN4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMDECN4::XMDECN4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreDecN4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMDECN4::XMDECN4 -( - const float* pArray -) -{ - XMStoreDecN4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMDEC4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMDEC4::XMDEC4 -( - float _x, - float _y, - float _z, 
- float _w -) -{ - XMStoreDec4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMDEC4::XMDEC4 -( - const float* pArray -) -{ - XMStoreDec4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -#pragma warning(pop) - -/**************************************************************************** - * - * XMUDECN4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMUDECN4::XMUDECN4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreUDecN4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMUDECN4::XMUDECN4 -( - const float* pArray -) -{ - XMStoreUDecN4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMUDEC4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMUDEC4::XMUDEC4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreUDec4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMUDEC4::XMUDEC4 -( - const float* pArray -) -{ - XMStoreUDec4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMBYTEN4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMBYTEN4::XMBYTEN4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreByteN4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMBYTEN4::XMBYTEN4 -( - const float* pArray -) -{ - XMStoreByteN4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMBYTE4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMBYTE4::XMBYTE4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreByte4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMBYTE4::XMBYTE4 -( - const float* pArray -) -{ - XMStoreByte4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMUBYTEN4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMUBYTEN4::XMUBYTEN4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreUByteN4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ 
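For the signed-normalized byte formats the scale is 127 rather than 255, and the matching loads map both -127 and -128 back to -1.0f, so the endpoints survive a round trip exactly. A minimal sketch (illustrative only; same header assumption as above, values approximate):

    using namespace DirectX;
    using namespace DirectX::PackedVector;

    XMBYTEN4 sn(-1.0f, 1.0f, 0.5f, 0.0f);  // stored as -127, 127, 64, 0
    XMVECTOR a = XMLoadByteN4(&sn);        // reloads as -1.0f, 1.0f, ~0.50394f, 0.0f

    XMUBYTEN4 un(1.0f, 0.5f, 0.25f, 0.0f); // stored as 255, 128, 64, 0
    XMVECTOR b = XMLoadUByteN4(&un);       // reloads as 1.0f, ~0.50196f, ~0.25098f, 0.0f
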
-_Use_decl_annotations_ -inline PackedVector::XMUBYTEN4::XMUBYTEN4 -( - const float* pArray -) -{ - XMStoreUByteN4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMUBYTE4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMUBYTE4::XMUBYTE4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreUByte4(this, XMVectorSet(_x, _y, _z, _w)); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMUBYTE4::XMUBYTE4 -( - const float* pArray -) -{ - XMStoreUByte4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMUNIBBLE4 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMUNIBBLE4::XMUNIBBLE4 -( - float _x, - float _y, - float _z, - float _w -) -{ - XMStoreUNibble4(this, XMVectorSet( _x, _y, _z, _w )); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMUNIBBLE4::XMUNIBBLE4 -( - const float *pArray -) -{ - XMStoreUNibble4(this, XMLoadFloat4(reinterpret_cast(pArray))); -} - -/**************************************************************************** - * - * XMU555 operators - * - ****************************************************************************/ - -//------------------------------------------------------------------------------ - -inline PackedVector::XMU555::XMU555 -( - float _x, - float _y, - float _z, - bool _w -) -{ - XMStoreU555(this, XMVectorSet(_x, _y, _z, ((_w) ? 1.0f : 0.0f) )); -} - -//------------------------------------------------------------------------------ -_Use_decl_annotations_ -inline PackedVector::XMU555::XMU555 -( - const float *pArray, - bool _w -) -{ - XMVECTOR V = XMLoadFloat3(reinterpret_cast(pArray)); - XMStoreU555(this, XMVectorSetW(V, ((_w) ? 1.0f : 0.0f) )); -} - - +//------------------------------------------------------------------------------------- +// DirectXPackedVector.inl -- SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//
+// http://go.microsoft.com/fwlink/?LinkID=615560
+//-------------------------------------------------------------------------------------
+
+#pragma once
+
+/****************************************************************************
+ *
+ * Data conversion
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline float PackedVector::XMConvertHalfToFloat
+(
+    HALF Value
+)
+{
+#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    __m128i V1 = _mm_cvtsi32_si128( static_cast<int>(Value) );
+    __m128 V2 = _mm_cvtph_ps( V1 );
+    return _mm_cvtss_f32( V2 );
+#else
+    uint32_t Mantissa = (uint32_t)(Value & 0x03FF);
+
+    uint32_t Exponent = (Value & 0x7C00);
+    if ( Exponent == 0x7C00 ) // INF/NAN
+    {
+        Exponent = (uint32_t)0x8f;
+    }
+    else if (Exponent != 0) // The value is normalized
+    {
+        Exponent = (uint32_t)((Value >> 10) & 0x1F);
+    }
+    else if (Mantissa != 0) // The value is denormalized
+    {
+        // Normalize the value in the resulting float
+        Exponent = 1;
+
+        do
+        {
+            Exponent--;
+            Mantissa <<= 1;
+        } while ((Mantissa & 0x0400) == 0);
+
+        Mantissa &= 0x03FF;
+    }
+    else // The value is zero
+    {
+        Exponent = (uint32_t)-112;
+    }
+
+    uint32_t Result = ((Value & 0x8000) << 16) | // Sign
+                      ((Exponent + 112) << 23) | // Exponent
+                      (Mantissa << 13);          // Mantissa
+
+    return reinterpret_cast<float*>(&Result)[0];
+#endif // !_XM_F16C_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+#pragma prefast(push)
+#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" )
+
+_Use_decl_annotations_
+inline float* PackedVector::XMConvertHalfToFloatStream
+(
+    float* pOutputStream,
+    size_t OutputStride,
+    const HALF* pInputStream,
+    size_t InputStride,
+    size_t HalfCount
+)
+{
+    assert(pOutputStream);
+    assert(pInputStream);
+
+    assert(InputStride >= sizeof(HALF));
+    _Analysis_assume_(InputStride >= sizeof(HALF));
+
+    assert(OutputStride >= sizeof(float));
+    _Analysis_assume_(OutputStride >= sizeof(float));
+
+#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
+    uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    size_t i = 0;
+    size_t four = HalfCount >> 2;
+    if ( four > 0 )
+    {
+        if (InputStride == sizeof(HALF))
+        {
+            if (OutputStride == sizeof(float))
+            {
+                if ( ((uintptr_t)pFloat & 0xF) == 0)
+                {
+                    // Packed input, aligned & packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
+                        pHalf += InputStride*4;
+
+                        __m128 FV = _mm_cvtph_ps( HV );
+
+                        XM_STREAM_PS( reinterpret_cast<float*>(pFloat), FV );
+                        pFloat += OutputStride*4;
+                        i += 4;
+                    }
+                }
+                else
+                {
+                    // Packed input, packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
+                        pHalf += InputStride*4;
+
+                        __m128 FV = _mm_cvtph_ps( HV );
+
+                        _mm_storeu_ps( reinterpret_cast<float*>(pFloat), FV );
+                        pFloat += OutputStride*4;
+                        i += 4;
+                    }
+                }
+            }
+            else
+            {
+                // Packed input, scattered output
+                for (size_t j = 0; j < four; ++j)
+                {
+                    __m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
+                    pHalf += InputStride*4;
+
+                    __m128 FV = _mm_cvtph_ps( HV );
+
+                    _mm_store_ss( reinterpret_cast<float*>(pFloat), FV );
+                    pFloat += OutputStride;
+                    *reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 1 );
+                    pFloat += OutputStride;
+                    *reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 2 );
+                    pFloat += OutputStride;
+                    *reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 3 );
+                    pFloat += OutputStride;
+                    i += 4;
+                }
+            }
+        }
+        else if (OutputStride == sizeof(float))
+        {
+            if ( ((uintptr_t)pFloat & 0xF) == 0)
+            {
+                // Scattered input, aligned & packed output
+                for (size_t j = 0; j < four; ++j)
+                {
+                    uint16_t H1 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+                    uint16_t H2 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+                    uint16_t H3 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+                    uint16_t H4 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+
+                    __m128i HV = _mm_setzero_si128();
+                    HV = _mm_insert_epi16( HV, H1, 0 );
+                    HV = _mm_insert_epi16( HV, H2, 1 );
+                    HV = _mm_insert_epi16( HV, H3, 2 );
+                    HV = _mm_insert_epi16( HV, H4, 3 );
+                    __m128 FV = _mm_cvtph_ps( HV );
+
+                    XM_STREAM_PS( reinterpret_cast<float*>(pFloat), FV );
+                    pFloat += OutputStride*4;
+                    i += 4;
+                }
+            }
+            else
+            {
+                // Scattered input, packed output
+                for (size_t j = 0; j < four; ++j)
+                {
+                    uint16_t H1 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+                    uint16_t H2 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+                    uint16_t H3 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+                    uint16_t H4 = *reinterpret_cast<const uint16_t*>(pHalf);
+                    pHalf += InputStride;
+
+                    __m128i HV = _mm_setzero_si128();
+                    HV = _mm_insert_epi16( HV, H1, 0 );
+                    HV = _mm_insert_epi16( HV, H2, 1 );
+                    HV = _mm_insert_epi16( HV, H3, 2 );
+                    HV = _mm_insert_epi16( HV, H4, 3 );
+                    __m128 FV = _mm_cvtph_ps( HV );
+
+                    _mm_storeu_ps( reinterpret_cast<float*>(pFloat), FV );
+                    pFloat += OutputStride*4;
+                    i += 4;
+                }
+            }
+        }
+        else
+        {
+            // Scattered input, scattered output
+            for (size_t j = 0; j < four; ++j)
+            {
+                uint16_t H1 = *reinterpret_cast<const uint16_t*>(pHalf);
+                pHalf += InputStride;
+                uint16_t H2 = *reinterpret_cast<const uint16_t*>(pHalf);
+                pHalf += InputStride;
+                uint16_t H3 = *reinterpret_cast<const uint16_t*>(pHalf);
+                pHalf += InputStride;
+                uint16_t H4 = *reinterpret_cast<const uint16_t*>(pHalf);
+                pHalf += InputStride;
+
+                __m128i HV = _mm_setzero_si128();
+                HV = _mm_insert_epi16(HV, H1, 0);
+                HV = _mm_insert_epi16(HV, H2, 1);
+                HV = _mm_insert_epi16(HV, H3, 2);
+                HV = _mm_insert_epi16(HV, H4, 3);
+                __m128 FV = _mm_cvtph_ps(HV);
+
+                _mm_store_ss(reinterpret_cast<float*>(pFloat), FV);
+                pFloat += OutputStride;
+                *reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 1);
+                pFloat += OutputStride;
+                *reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 2);
+                pFloat += OutputStride;
+                *reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 3);
+                pFloat += OutputStride;
+                i += 4;
+            }
+        }
+    }
+
+    for (; i < HalfCount; ++i)
+    {
+        *reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
+        pHalf += InputStride;
+        pFloat += OutputStride;
+    }
+
+    XM_SFENCE();
+
+    return pOutputStream;
+#else
+    const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
+    uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    for (size_t i = 0; i < HalfCount; i++)
+    {
+        *reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
+        pHalf += InputStride;
+        pFloat += OutputStride;
+    }
+
+    return pOutputStream;
+#endif // !_XM_F16C_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::HALF PackedVector::XMConvertFloatToHalf
+(
+    float Value
+)
+{
+#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    __m128 V1 = _mm_set_ss( Value );
+    __m128i V2 = _mm_cvtps_ph( V1, 0 );
+    return static_cast<HALF>( _mm_cvtsi128_si32(V2) );
+#else
+    uint32_t Result;
+
+    uint32_t IValue = reinterpret_cast<uint32_t*>(&Value)[0];
+    uint32_t Sign = (IValue & 0x80000000U) >> 16U;
+    IValue = IValue & 0x7FFFFFFFU; // Hack off the sign
+
+    if (IValue > 0x477FE000U)
+    {
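+        // 0x477FE000 is the bit pattern of 65504.0f, the largest finite half-precision
+        // value; anything strictly greater is saturated to INF (or mapped to NAN below).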
+        // The number is too large to be represented as a half. Saturate to infinity.
+        if (((IValue & 0x7F800000) == 0x7F800000) && ((IValue & 0x7FFFFF ) != 0))
+        {
+            Result = 0x7FFF; // NAN
+        }
+        else
+        {
+            Result = 0x7C00U; // INF
+        }
+    }
+    else
+    {
+        if (IValue < 0x38800000U)
+        {
+            // The number is too small to be represented as a normalized half.
+            // Convert it to a denormalized value.
+            uint32_t Shift = 113U - (IValue >> 23U);
+            IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift;
+        }
+        else
+        {
+            // Rebias the exponent to represent the value as a normalized half.
+            IValue += 0xC8000000U;
+        }
+
+        Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U)&0x7FFFU;
+    }
+    return (HALF)(Result|Sign);
+#endif // !_XM_F16C_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::HALF* PackedVector::XMConvertFloatToHalfStream
+(
+    HALF* pOutputStream,
+    size_t OutputStride,
+    const float* pInputStream,
+    size_t InputStride,
+    size_t FloatCount
+)
+{
+    assert(pOutputStream);
+    assert(pInputStream);
+
+    assert(InputStride >= sizeof(float));
+    _Analysis_assume_(InputStride >= sizeof(float));
+
+    assert(OutputStride >= sizeof(HALF));
+    _Analysis_assume_(OutputStride >= sizeof(HALF));
+
+#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
+    uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
+
+    size_t i = 0;
+    size_t four = FloatCount >> 2;
+    if (four > 0)
+    {
+        if (InputStride == sizeof(float))
+        {
+            if (OutputStride == sizeof(HALF))
+            {
+                if ( ((uintptr_t)pFloat & 0xF) == 0)
+                {
+                    // Aligned and packed input, packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
+                        pFloat += InputStride*4;
+
+                        __m128i HV = _mm_cvtps_ph( FV, 0 );
+
+                        _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
+                        pHalf += OutputStride*4;
+                        i += 4;
+                    }
+                }
+                else
+                {
+                    // Packed input, packed output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
+                        pFloat += InputStride*4;
+
+                        __m128i HV = _mm_cvtps_ph( FV, 0 );
+
+                        _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
+                        pHalf += OutputStride*4;
+                        i += 4;
+                    }
+                }
+            }
+            else
+            {
+                if ( ((uintptr_t)pFloat & 0xF) == 0)
+                {
+                    // Aligned & packed input, scattered output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
+                        pFloat += InputStride*4;
+
+                        __m128i HV = _mm_cvtps_ph( FV, 0 );
+
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
+                        pHalf += OutputStride;
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
+                        pHalf += OutputStride;
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
+                        pHalf += OutputStride;
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
+                        pHalf += OutputStride;
+                        i += 4;
+                    }
+                }
+                else
+                {
+                    // Packed input, scattered output
+                    for (size_t j = 0; j < four; ++j)
+                    {
+                        __m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
+                        pFloat += InputStride*4;
+
+                        __m128i HV = _mm_cvtps_ph( FV, 0 );
+
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
+                        pHalf += OutputStride;
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
+                        pHalf += OutputStride;
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
+                        pHalf += OutputStride;
+                        *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
+                        pHalf +=
OutputStride; + i += 4; + } + } + } + } + else if (OutputStride == sizeof(HALF)) + { + // Scattered input, packed output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss( reinterpret_cast(pFloat) ); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps( FV1, FV2, 0x2 ); + __m128 FT = _mm_blend_ps( FV3, FV4, 0x8 ); + FV = _mm_blend_ps( FV, FT, 0xC ); + + __m128i HV = _mm_cvtps_ph( FV, 0 ); + + _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV ); + pHalf += OutputStride*4; + i += 4; + } + } + else + { + // Scattered input, scattered output + for (size_t j = 0; j < four; ++j) + { + __m128 FV1 = _mm_load_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV2 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV3 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV4 = _mm_broadcast_ss(reinterpret_cast(pFloat)); + pFloat += InputStride; + + __m128 FV = _mm_blend_ps(FV1, FV2, 0x2); + __m128 FT = _mm_blend_ps(FV3, FV4, 0x8); + FV = _mm_blend_ps(FV, FT, 0xC); + + __m128i HV = _mm_cvtps_ph(FV, 0); + + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 0)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 1)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 2)); + pHalf += OutputStride; + *reinterpret_cast(pHalf) = static_cast(_mm_extract_epi16(HV, 3)); + pHalf += OutputStride; + i += 4; + } + } + } + + for (; i < FloatCount; ++i) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + + return pOutputStream; +#else + const uint8_t* pFloat = reinterpret_cast(pInputStream); + uint8_t* pHalf = reinterpret_cast(pOutputStream); + + for (size_t i = 0; i < FloatCount; i++) + { + *reinterpret_cast(pHalf) = XMConvertFloatToHalf(reinterpret_cast(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + return pOutputStream; +#endif // !_XM_F16C_INTRINSICS_ +} + +#pragma prefast(pop) + +/**************************************************************************** + * + * Vector and matrix load operations + * + ****************************************************************************/ +#pragma prefast(push) +#pragma prefast(disable:28931, "PREfast noise: Esp:1266") + +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadColor +( + const XMCOLOR* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + // int32_t -> Float conversions are done in one instruction. + // uint32_t -> Float calls a runtime function. 
Keep in int32_t + int32_t iColor = (int32_t)(pSource->c); + XMVECTORF32 vColor = { + (float)((iColor >> 16) & 0xFF) * (1.0f/255.0f), + (float)((iColor >> 8) & 0xFF) * (1.0f/255.0f), + (float)(iColor & 0xFF) * (1.0f/255.0f), + (float)((iColor >> 24) & 0xFF) * (1.0f/255.0f) + }; + return vColor.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32_t bgra = pSource->c; + uint32_t rgba = (bgra & 0xFF00FF00) | ((bgra >> 16) & 0xFF) | ((bgra << 16) & 0xFF0000); + uint32x2_t vInt8 = vdup_n_u32(rgba); + uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u32(vInt8) ); + uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) ); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_n_f32( R, 1.0f/255.0f ); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + __m128i vInt = _mm_set1_epi32(pSource->c); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vInt = _mm_and_si128(vInt,g_XMMaskA8R8G8B8); + // a is unsigned! Flip the bit to convert the order to signed + vInt = _mm_xor_si128(vInt,g_XMFlipA8R8G8B8); + // Convert to floating point numbers + XMVECTOR vTemp = _mm_cvtepi32_ps(vInt); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp,g_XMFixAA8R8G8B8); + // Convert 0-255 to 0.0f-1.0f + return _mm_mul_ps(vTemp,g_XMNormalizeA8R8G8B8); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadHalf2 +( + const XMHALF2* pSource +) +{ + assert(pSource); +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128 V = _mm_load_ss( reinterpret_cast(pSource) ); + return _mm_cvtph_ps( _mm_castps_si128( V ) ); +#else + XMVECTORF32 vResult = { + XMConvertHalfToFloat(pSource->x), + XMConvertHalfToFloat(pSource->y), + 0.0f, + 0.0f + }; + return vResult.v; +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadShortN2 +( + const XMSHORTN2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (pSource->x == -32768) ? -1.f : ((float)pSource->x * (1.0f/32767.0f)), + (pSource->y == -32768) ? -1.f : ((float)pSource->y * (1.0f/32767.0f)), + 0.0f, + 0.0f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast( pSource ) ); + int32x4_t vInt = vmovl_s16( vreinterpret_s16_u32(vInt16) ); + vInt = vandq_s32( vInt, g_XMMaskXY ); + float32x4_t R = vcvtq_f32_s32(vInt); + R = vmulq_n_f32( R, 1.0f/32767.0f ); + return vmaxq_f32( R, vdupq_n_f32(-1.f) ); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); + // x needs to be sign extended + vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x - 0x8000 to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16); + // Convert -1.0f - 1.0f + vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16); + // Clamp result (for case of -32768) + return _mm_max_ps( vTemp, g_XMNegativeOne ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadShort2 +( + const XMSHORT2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (float)pSource->x, + (float)pSource->y, + 0.f, + 0.f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast( pSource ) ); + int32x4_t vInt = vmovl_s16( vreinterpret_s16_u32(vInt16) ); + vInt = vandq_s32( vInt, g_XMMaskXY ); + return vcvtq_f32_s32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); + // x needs to be sign extended + vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x - 0x8000 to undo the signed order. + vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16); + // Y is 65536 too large + return _mm_mul_ps(vTemp,g_XMFixupY16); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUShortN2 +( + const XMUSHORTN2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (float)pSource->x / 65535.0f, + (float)pSource->y / 65535.0f, + 0.f, + 0.f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast( pSource ) ); + uint32x4_t vInt = vmovl_u16( vreinterpret_u16_u32(vInt16) ); + vInt = vandq_u32( vInt, g_XMMaskXY ); + float32x4_t R = vcvtq_f32_u32(vInt); + R = vmulq_n_f32( R, 1.0f/65535.0f ); + return vmaxq_f32( R, vdupq_n_f32(-1.f) ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixupY16 = {1.0f/65535.0f,1.0f/(65535.0f*65536.0f),0.0f,0.0f}; + static const XMVECTORF32 FixaddY16 = {0,32768.0f*65536.0f,0,0}; + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); + // y needs to be sign flipped + vTemp = _mm_xor_ps(vTemp,g_XMFlipY); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // y + 0x8000 to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp,FixaddY16); + // Y is 65536 times too large + vTemp = _mm_mul_ps(vTemp,FixupY16); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUShort2 +( + const XMUSHORT2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (float)pSource->x, + (float)pSource->y, + 0.f, + 0.f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast( pSource ) ); + uint32x4_t vInt = vmovl_u16( vreinterpret_u16_u32(vInt16) ); + vInt = vandq_u32( vInt, g_XMMaskXY ); + return vcvtq_f32_u32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixaddY16 = {0,32768.0f,0,0}; + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); + // y needs to be sign flipped + vTemp = _mm_xor_ps(vTemp,g_XMFlipY); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // Y is 65536 times too large + vTemp = _mm_mul_ps(vTemp,g_XMFixupY16); + // y + 0x8000 to undo the signed order. + vTemp = _mm_add_ps(vTemp,FixaddY16); + return vTemp; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadByteN2 +( + const XMBYTEN2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (pSource->x == -128) ? -1.f : ((float)pSource->x * (1.0f/127.0f)), + (pSource->y == -128) ? -1.f : ((float)pSource->y * (1.0f/127.0f)), + 0.0f, + 0.0f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast( pSource ) ); + int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u16(vInt8) ); + int32x4_t vInt = vmovl_s16( vget_low_s16( vInt16 ) ); + vInt = vandq_s32( vInt, g_XMMaskXY ); + float32x4_t R = vcvtq_f32_s32(vInt); + R = vmulq_n_f32( R, 1.0f/127.0f ); + return vmaxq_f32( R, vdupq_n_f32(-1.f) ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {1.0f/127.0f,1.0f/(127.0f*256.0f),0,0}; + static const XMVECTORU32 Mask = {0xFF,0xFF00,0,0}; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask + vTemp = _mm_and_ps(vTemp,Mask); + // x,y and z are unsigned! 
Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddByte4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp,Scale); + // Clamp result (for case of -128) + return _mm_max_ps( vTemp, g_XMNegativeOne ); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadByte2 +( + const XMBYTE2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (float)pSource->x, + (float)pSource->y, + 0.0f, + 0.0f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast( pSource ) ); + int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u16(vInt8) ); + int32x4_t vInt = vmovl_s16( vget_low_s16(vInt16) ); + vInt = vandq_s32( vInt, g_XMMaskXY ); + return vcvtq_f32_s32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)}; + static const XMVECTORU32 Mask = {0xFF,0xFF00,0,0}; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask + vTemp = _mm_and_ps(vTemp,Mask); + // x,y and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddByte4); + // Fix y, z and w because they are too large + return _mm_mul_ps(vTemp,Scale); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUByteN2 +( + const XMUBYTEN2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (float)pSource->x * (1.0f/255.0f), + (float)pSource->y * (1.0f/255.0f), + 0.0f, + 0.0f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast( pSource ) ); + uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u16(vInt8) ); + uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) ); + vInt = vandq_u32( vInt, g_XMMaskXY ); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_n_f32( R, 1.0f/255.0f ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {1.0f/255.0f,1.0f/(255.0f*256.0f),0,0}; + static const XMVECTORU32 Mask = {0xFF,0xFF00,0,0}; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask + vTemp = _mm_and_ps(vTemp,Mask); + // w is signed! 
Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp,g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); + // Fix y, z and w because they are too large + return _mm_mul_ps(vTemp,Scale); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUByte2 +( + const XMUBYTE2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (float)pSource->x, + (float)pSource->y, + 0.0f, + 0.0f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast( pSource ) ); + uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u32(vInt8) ); + uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) ); + vInt = vandq_s32( vInt, g_XMMaskXY ); + return vcvtq_f32_u32(vInt); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {1.0f,1.0f/256.0f,0,0}; + static const XMVECTORU32 Mask = {0xFF,0xFF00,0,0}; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast(&pSource->x)); + // Mask + vTemp = _mm_and_ps(vTemp,Mask); + // w is signed! Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp,g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); + // Fix y, z and w because they are too large + return _mm_mul_ps(vTemp,Scale); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadU565 +( + const XMU565* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + float(pSource->v & 0x1F), + float((pSource->v >> 5) & 0x3F), + float((pSource->v >> 11) & 0x1F), + 0.f, + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORI32 U565And = {0x1F,0x3F<<5,0x1F<<11,0}; + static const XMVECTORF32 U565Mul = {1.0f,1.0f/32.0f,1.0f/2048.f,0}; + uint16x4_t vInt16 = vld1_dup_u16( reinterpret_cast( pSource ) ); + uint32x4_t vInt = vmovl_u16( vInt16 ); + vInt = vandq_u32(vInt,U565And); + float32x4_t R = vcvtq_f32_u32(vInt); + return vmulq_f32(R,U565Mul); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORI32 U565And = {0x1F,0x3F<<5,0x1F<<11,0}; + static const XMVECTORF32 U565Mul = {1.0f,1.0f/32.0f,1.0f/2048.f,0}; + // Get the 32 bit value and splat it + XMVECTOR vResult = _mm_load_ps1(reinterpret_cast(&pSource->v)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult,U565And); + // Convert to float + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Normalize x, y, and z + vResult = _mm_mul_ps(vResult,U565Mul); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadFloat3PK +( + const XMFLOAT3PK* pSource +) +{ + assert(pSource); + + __declspec(align(16)) uint32_t Result[4]; + uint32_t Mantissa; + uint32_t Exponent; + + // X Channel (6-bit mantissa) + Mantissa = pSource->xm; + + if ( pSource->xe == 0x1f ) // INF or NAN + { + Result[0] = 0x7f800000 | (pSource->xm << 17); + } + else + { + if ( pSource->xe != 0 ) // The value is normalized + { + Exponent = pSource->xe; 
+ } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x40) == 0); + + Mantissa &= 0x3F; + } + else // The value is zero + { + Exponent = (uint32_t)-112; + } + + Result[0] = ((Exponent + 112) << 23) | (Mantissa << 17); + } + + // Y Channel (6-bit mantissa) + Mantissa = pSource->ym; + + if ( pSource->ye == 0x1f ) // INF or NAN + { + Result[1] = 0x7f800000 | (pSource->ym << 17); + } + else + { + if ( pSource->ye != 0 ) // The value is normalized + { + Exponent = pSource->ye; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x40) == 0); + + Mantissa &= 0x3F; + } + else // The value is zero + { + Exponent = (uint32_t)-112; + } + + Result[1] = ((Exponent + 112) << 23) | (Mantissa << 17); + } + + // Z Channel (5-bit mantissa) + Mantissa = pSource->zm; + + if ( pSource->ze == 0x1f ) // INF or NAN + { + Result[2] = 0x7f800000 | (pSource->zm << 17); + } + else + { + if ( pSource->ze != 0 ) // The value is normalized + { + Exponent = pSource->ze; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x20) == 0); + + Mantissa &= 0x1F; + } + else // The value is zero + { + Exponent = (uint32_t)-112; + } + + Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18); + } + + return XMLoadFloat3A( reinterpret_cast(&Result) ); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadFloat3SE +( + const XMFLOAT3SE* pSource +) +{ + assert(pSource); + + union { float f; int32_t i; } fi; + fi.i = 0x33800000 + (pSource->e << 23); + float Scale = fi.f; + + XMVECTORF32 v = { + Scale * float( pSource->xm ), + Scale * float( pSource->ym ), + Scale * float( pSource->zm ), + 1.0f }; + return v; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadHalf4 +( + const XMHALF4* pSource +) +{ + assert(pSource); +#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __m128i V = _mm_loadl_epi64( reinterpret_cast(pSource) ); + return _mm_cvtph_ps( V ); +#else + XMVECTORF32 vResult = { + XMConvertHalfToFloat(pSource->x), + XMConvertHalfToFloat(pSource->y), + XMConvertHalfToFloat(pSource->z), + XMConvertHalfToFloat(pSource->w) + }; + return vResult.v; +#endif // !_XM_F16C_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XM_CALLCONV PackedVector::XMLoadShortN4 +( + const XMSHORTN4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (pSource->x == -32768) ? -1.f : ((float)pSource->x * (1.0f/32767.0f)), + (pSource->y == -32768) ? -1.f : ((float)pSource->y * (1.0f/32767.0f)), + (pSource->z == -32768) ? -1.f : ((float)pSource->z * (1.0f/32767.0f)), + (pSource->w == -32768) ? 
-1.f : ((float)pSource->w * (1.0f/32767.0f))
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    int16x4_t vInt = vld1_s16( (const int16_t*)pSource );
+    int32x4_t V = vmovl_s16( vInt );
+    V = vcvtq_f32_s32( V );
+    V = vmulq_n_f32( V, 1.0f/32767.0f );
+    return vmaxq_f32( V, vdupq_n_f32(-1.f) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Splat the value in all four entries (x,z,y,w)
+    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double*>(&pSource->x));
+    // Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
+    __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16);
+    // x and z are unsigned! Flip the bits to convert the order to signed
+    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x and z - 0x8000 to complete the conversion
+    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
+    // Convert to -1.0f - 1.0f
+    vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16Z16W16);
+    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
+    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0));
+    // Clamp result (for case of -32768)
+    return _mm_max_ps( vTemp, g_XMNegativeOne );
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadShort4
+(
+    const XMSHORT4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = {
+        (float)pSource->x,
+        (float)pSource->y,
+        (float)pSource->z,
+        (float)pSource->w
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    int16x4_t vInt = vld1_s16( (const int16_t*)pSource );
+    int32x4_t V = vmovl_s16( vInt );
+    return vcvtq_f32_s32( V );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Splat the value in all four entries (x,z,y,w)
+    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double*>(&pSource->x));
+    // Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
+    __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16);
+    // x and z are unsigned! Flip the bits to convert the order to signed
+    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x and z - 0x8000 to complete the conversion
+    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
+    // Fix y and w because they are 65536 too large
+    vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
+    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
+    return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUShortN4
+(
+    const XMUSHORTN4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = {
+        (float)pSource->x / 65535.0f,
+        (float)pSource->y / 65535.0f,
+        (float)pSource->z / 65535.0f,
+        (float)pSource->w / 65535.0f
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint16x4_t vInt = vld1_u16( (const uint16_t*)pSource );
+    uint32x4_t V = vmovl_u16( vInt );
+    V = vcvtq_f32_u32( V );
+    return vmulq_n_f32( V, 1.0f/65535.0f );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 FixupY16W16 = {1.0f/65535.0f,1.0f/65535.0f,1.0f/(65535.0f*65536.0f),1.0f/(65535.0f*65536.0f)};
+    static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f*65536.0f,32768.0f*65536.0f};
+    // Splat the value in all four entries (x,z,y,w)
+    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double*>(&pSource->x));
+    // Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
+    __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16);
+    // y and w are signed! Flip the bits to convert the order to unsigned
+    vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // y and w + 0x8000 to complete the conversion
+    vTemp = _mm_add_ps(vTemp,FixaddY16W16);
+    // Fix y and w because they are 65536 too large
+    vTemp = _mm_mul_ps(vTemp,FixupY16W16);
+    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
+    return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUShort4
+(
+    const XMUSHORT4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = {
+        (float)pSource->x,
+        (float)pSource->y,
+        (float)pSource->z,
+        (float)pSource->w
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint16x4_t vInt = vld1_u16( (const uint16_t*)pSource );
+    uint32x4_t V = vmovl_u16( vInt );
+    return vcvtq_f32_u32( V );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f,32768.0f};
+    // Splat the value in all four entries (x,z,y,w)
+    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double*>(&pSource->x));
+    // Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
+    __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16);
+    // y and w are signed! Flip the bits to convert the order to unsigned
+    vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // Fix y and w because they are 65536 too large
+    vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
+    // y and w + 0x8000 to complete the conversion
+    vTemp = _mm_add_ps(vTemp,FixaddY16W16);
+    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
+    return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadXDecN4
+(
+    const XMXDECN4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00};
+
+    uint32_t ElementX = pSource->v & 0x3FF;
+    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
+    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
+
+    XMVECTORF32 vResult = {
+        (ElementX == 0x200) ? -1.f : ((float)(int16_t)(ElementX | SignExtend[ElementX >> 9]) / 511.0f),
+        (ElementY == 0x200) ? -1.f : ((float)(int16_t)(ElementY | SignExtend[ElementY >> 9]) / 511.0f),
+        (ElementZ == 0x200) ? -1.f : ((float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]) / 511.0f),
+        (float)(pSource->v >> 30) / 3.0f
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    vInt = vandq_u32(vInt,g_XMMaskA2B10G10R10);
+    vInt = veorq_u32(vInt,g_XMFlipA2B10G10R10);
+    float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) );
+    R = vaddq_f32(R,g_XMFixAA2B10G10R10);
+    R = vmulq_f32(R,g_XMNormalizeA2B10G10R10);
+    return vmaxq_f32( R, vdupq_n_f32(-1.0f) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Splat the value in all four entries
+    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Mask x (bits 0-9), y (10-19), z (20-29) and w (30-31)
+    vTemp = _mm_and_ps(vTemp,g_XMMaskA2B10G10R10);
+    // w is unsigned! Flip the bit to convert the order to signed
+    vTemp = _mm_xor_ps(vTemp,g_XMFlipA2B10G10R10);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x,y,z + 0, w + 0x80000000.f to undo the signed order
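+    // [Editorial note, not part of the original source: a worked example of
+    //  this flip/fix pair, assuming the usual DirectXMath constant values.
+    //  For the unsigned 2-bit w field, w = 3 splats as 0xC0000000, which the
+    //  signed conversion above would read as negative; the XOR flipped it to
+    //  0x40000000 (2^30), the add below contributes 2^31 to restore 3*2^30,
+    //  and the final multiply by 1/(3*2^30) yields w = 1.0f.]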
+    vTemp = _mm_add_ps(vTemp,g_XMFixAA2B10G10R10);
+    // Normalize x,y,z to -1.0f..1.0f and w to 0.0f..1.0f
+    vTemp = _mm_mul_ps(vTemp,g_XMNormalizeA2B10G10R10);
+    // Clamp result (for case of -512)
+    return _mm_max_ps( vTemp, g_XMNegativeOne );
+#endif
+}
+
+//------------------------------------------------------------------------------
+#pragma warning(push)
+#pragma warning(disable : 4996)
+// C4996: ignore deprecation warning
+
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadXDec4
+(
+    const XMXDEC4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00};
+
+    uint32_t ElementX = pSource->v & 0x3FF;
+    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
+    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
+
+    XMVECTORF32 vResult = {
+        (float)(int16_t)(ElementX | SignExtend[ElementX >> 9]),
+        (float)(int16_t)(ElementY | SignExtend[ElementY >> 9]),
+        (float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]),
+        (float)(pSource->v >> 30)
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORU32 XDec4Xor = {0x200, 0x200<<10, 0x200<<20, 0x80000000};
+    static const XMVECTORF32 XDec4Add = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,32768*65536.0f};
+    uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    vInt = vandq_u32(vInt,g_XMMaskDec4);
+    vInt = veorq_u32(vInt,XDec4Xor);
+    float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) );
+    R = vaddq_f32(R,XDec4Add);
+    return vmulq_f32(R,g_XMMulDec4);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORU32 XDec4Xor = {0x200, 0x200<<10, 0x200<<20, 0x80000000};
+    static const XMVECTORF32 XDec4Add = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,32768*65536.0f};
+    // Splat the value in all four entries
+    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Mask x (bits 0-9), y (10-19), z (20-29) and w (30-31)
+    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
+    // w is unsigned! Flip the bit to convert the order to signed
+    vTemp = _mm_xor_ps(vTemp,XDec4Xor);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x,y,z + 0, w + 0x80000000.f to undo the signed order
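+    // [Editorial note, not part of the original source: worked example for
+    //  the signed x field. x = -1 is stored as the 10-bit pattern 0x3FF; the
+    //  XOR with 0x200 gives 0x1FF = 511, and the add below contributes -512,
+    //  recovering -1.0f after the integer-to-float conversion.]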
+    vTemp = _mm_add_ps(vTemp,XDec4Add);
+    // Undo the implicit scale of the upper fields
+    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
+    return vTemp;
+#endif
+}
+
+#pragma warning(pop)
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUDecN4
+(
+    const XMUDECN4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+
+    uint32_t ElementX = pSource->v & 0x3FF;
+    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
+    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
+
+    XMVECTORF32 vResult = {
+        (float)ElementX / 1023.0f,
+        (float)ElementY / 1023.0f,
+        (float)ElementZ / 1023.0f,
+        (float)(pSource->v >> 30) / 3.0f
+    };
+    return vResult.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 UDecN4Mul = {1.0f/1023.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)};
+    uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    vInt = vandq_u32(vInt,g_XMMaskDec4);
+    float32x4_t R = vcvtq_f32_u32( vInt );
+    return vmulq_f32(R,UDecN4Mul);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 UDecN4Mul = {1.0f/1023.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)};
+    // Splat the value in all four entries
+    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Mask x (bits 0-9), y (10-19), z (20-29) and w (30-31)
+    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
+    // w is unsigned! Flip the bit to convert the order to signed
+    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x,y,z + 0, w + 0x80000000.f to undo the signed order
+    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
+    // Normalize to 0.0f..1.0f
+    vTemp = _mm_mul_ps(vTemp,UDecN4Mul);
+    return vTemp;
+#endif
+}
+
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUDecN4_XR
+(
+    const XMUDECN4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+
+    int32_t ElementX = pSource->v & 0x3FF;
+    int32_t ElementY = (pSource->v >> 10) & 0x3FF;
+    int32_t ElementZ = (pSource->v >> 20) & 0x3FF;
+
+    XMVECTORF32 vResult = {
+        (float)(ElementX - 0x180) / 510.0f,
+        (float)(ElementY - 0x180) / 510.0f,
+        (float)(ElementZ - 0x180) / 510.0f,
+        (float)(pSource->v >> 30) / 3.0f
+    };
+
+    return vResult.v;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 XRMul = {1.0f/510.0f,1.0f/(510.0f*1024.0f),1.0f/(510.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)};
+    static const XMVECTORI32 XRBias = { 0x180, 0x180*1024, 0x180*1024*1024, 0 };
+    uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    vInt = vandq_u32(vInt,g_XMMaskDec4);
+    int32x4_t vTemp = vsubq_s32( vreinterpretq_s32_u32(vInt), XRBias );
+    vTemp = veorq_u32( vTemp, g_XMFlipW );
+    float32x4_t R = vcvtq_f32_s32( vTemp );
+    R = vaddq_f32(R,g_XMAddUDec4);
+    return vmulq_f32(R,XRMul);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 XRMul = {1.0f/510.0f,1.0f/(510.0f*1024.0f),1.0f/(510.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)};
+    static const XMVECTORI32 XRBias = { 0x180, 0x180*1024, 0x180*1024*1024, 0 };
+    // Splat the value in all four entries
+    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Mask channels
+    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
+    // Subtract bias
+    vTemp = _mm_castsi128_ps( _mm_sub_epi32( _mm_castps_si128(vTemp), XRBias ) );
+    // w is unsigned! Flip the bit to convert the order to signed
+    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x,y,z + 0, w + 0x80000000.f to undo the signed order
+    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
+    // Convert to 0.0f-1.0f
+    return _mm_mul_ps(vTemp,XRMul);
+#endif
+}
+
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUDec4
+(
+    const XMUDEC4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    uint32_t ElementX = pSource->v & 0x3FF;
+    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
+    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
+
+    XMVECTORF32 vResult = {
+        (float)ElementX,
+        (float)ElementY,
+        (float)ElementZ,
+        (float)(pSource->v >> 30)
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    vInt = vandq_u32(vInt,g_XMMaskDec4);
+    float32x4_t R = vcvtq_f32_u32( vInt );
+    return vmulq_f32(R,g_XMMulDec4);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Splat the value in all four entries
+    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Mask x (bits 0-9), y (10-19), z (20-29) and w (30-31)
+    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
+    // w is unsigned! Flip the bit to convert the order to signed
+    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x,y,z + 0, w + 0x80000000.f to undo the signed order
+    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
+    // Undo the implicit scale of the upper fields
+    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
+    return vTemp;
+#endif
+}
+
+//------------------------------------------------------------------------------
+#pragma warning(push)
+#pragma warning(disable : 4996)
+// C4996: ignore deprecation warning
+
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadDecN4
+(
+    const XMDECN4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00};
+    static const uint32_t SignExtendW[] = {0x00000000, 0xFFFFFFFC};
+
+    uint32_t ElementX = pSource->v & 0x3FF;
+    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
+    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
+    uint32_t ElementW = pSource->v >> 30;
+
+    XMVECTORF32 vResult = {
+        (ElementX == 0x200) ? -1.f : ((float)(int16_t)(ElementX | SignExtend[ElementX >> 9]) / 511.0f),
+        (ElementY == 0x200) ? -1.f : ((float)(int16_t)(ElementY | SignExtend[ElementY >> 9]) / 511.0f),
+        (ElementZ == 0x200) ? -1.f : ((float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]) / 511.0f),
+        (ElementW == 0x2) ? -1.f : ((float)(int16_t)(ElementW | SignExtendW[(ElementW >> 1) & 1]))
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 DecN4Mul = {1.0f/511.0f,1.0f/(511.0f*1024.0f),1.0f/(511.0f*1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)};
+    uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    vInt = vandq_u32(vInt,g_XMMaskDec4);
+    vInt = veorq_u32(vInt,g_XMXorDec4);
+    float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) );
+    R = vaddq_f32(R,g_XMAddDec4);
+    R = vmulq_f32(R,DecN4Mul);
+    return vmaxq_f32( R, vdupq_n_f32(-1.0f) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 DecN4Mul = {1.0f/511.0f,1.0f/(511.0f*1024.0f),1.0f/(511.0f*1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)};
+    // Splat the value in all four entries
+    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Mask x (bits 0-9), y (10-19), z (20-29) and w (30-31)
+    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
+    // Flip the sign bits of the 10-bit fields to biased form (w needs no flip)
+    vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // Subtract the field biases to complete the conversion
+    vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
+    // Normalize to -1.0f..1.0f
+    vTemp = _mm_mul_ps(vTemp,DecN4Mul);
+    // Clamp result (for case of -512/-1)
+    return _mm_max_ps( vTemp, g_XMNegativeOne );
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadDec4
+(
+    const XMDEC4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00};
+    static const uint32_t SignExtendW[] = {0x00000000, 0xFFFFFFFC};
+
+    uint32_t ElementX = pSource->v & 0x3FF;
+    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
+    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
+    uint32_t ElementW = pSource->v >> 30;
+
+    XMVECTORF32 vResult = {
+        (float)(int16_t)(ElementX | SignExtend[ElementX >> 9]),
+        (float)(int16_t)(ElementY | SignExtend[ElementY >> 9]),
+        (float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]),
+        (float)(int16_t)(ElementW | SignExtendW[ElementW >> 1])
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    vInt = vandq_u32(vInt,g_XMMaskDec4);
+    vInt = veorq_u32(vInt,g_XMXorDec4);
+    float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) );
+    R = vaddq_f32(R,g_XMAddDec4);
+    return vmulq_f32(R,g_XMMulDec4);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Splat the value in all four entries
+    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Mask x (bits 0-9), y (10-19), z (20-29) and w (30-31)
+    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
+    // Flip the sign bits of the 10-bit fields to biased form (w needs no flip)
+    vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // Subtract the field biases to complete the conversion
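+    // [Editorial note, not part of the original source: worked example for
+    //  the y field, which sits at bits 10-19 and is therefore read 1024x too
+    //  large, assuming the usual DirectXMath constant values. y = 2 masks to
+    //  2048; the XOR with 0x200<<10 gives 526336, the add below contributes
+    //  -512*1024 = -524288 leaving 2048, and the final multiply by 1/1024
+    //  recovers 2.0f.]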
+    vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
+    // Undo the implicit scale of the upper fields
+    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
+    return vTemp;
+#endif
+}
+
+#pragma warning(pop)
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUByteN4
+(
+    const XMUBYTEN4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = {
+        (float)pSource->x / 255.0f,
+        (float)pSource->y / 255.0f,
+        (float)pSource->z / 255.0f,
+        (float)pSource->w / 255.0f
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u32(vInt8) );
+    uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) );
+    float32x4_t R = vcvtq_f32_u32(vInt);
+    return vmulq_n_f32( R, 1.0f/255.0f );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 LoadUByteN4Mul = {1.0f/255.0f,1.0f/(255.0f*256.0f),1.0f/(255.0f*65536.0f),1.0f/(255.0f*65536.0f*256.0f)};
+    // Splat the value in all four entries
+    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
+    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
+    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
+    // w is signed! Flip the bits to convert the order to unsigned
+    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // w + 0x80 to complete the conversion
+    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
+    // Fix y, z and w because they are too large
+    vTemp = _mm_mul_ps(vTemp,LoadUByteN4Mul);
+    return vTemp;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUByte4
+(
+    const XMUBYTE4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = {
+        (float)pSource->x,
+        (float)pSource->y,
+        (float)pSource->z,
+        (float)pSource->w
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u32(vInt8) );
+    uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) );
+    return vcvtq_f32_u32(vInt);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 LoadUByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
+    // Splat the value in all four entries
+    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
+    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
+    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
+    // w is signed! Flip the bits to convert the order to unsigned
+    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // w + 0x80 to complete the conversion
+    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
+    // Fix y, z and w because they are too large
+    vTemp = _mm_mul_ps(vTemp,LoadUByte4Mul);
+    return vTemp;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadByteN4
+(
+    const XMBYTEN4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = {
+        (pSource->x == -128) ? -1.f : ((float)pSource->x / 127.0f),
+        (pSource->y == -128) ? -1.f : ((float)pSource->y / 127.0f),
+        (pSource->z == -128) ? -1.f : ((float)pSource->z / 127.0f),
+        (pSource->w == -128) ? -1.f : ((float)pSource->w / 127.0f)
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u32(vInt8) );
+    int32x4_t vInt = vmovl_s16( vget_low_s16(vInt16) );
+    float32x4_t R = vcvtq_f32_s32(vInt);
+    R = vmulq_n_f32( R, 1.0f/127.0f );
+    return vmaxq_f32( R, vdupq_n_f32(-1.f) );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 LoadByteN4Mul = {1.0f/127.0f,1.0f/(127.0f*256.0f),1.0f/(127.0f*65536.0f),1.0f/(127.0f*65536.0f*256.0f)};
+    // Splat the value in all four entries
+    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
+    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
+    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
+    // x,y and z are unsigned! Flip the bits to convert the order to signed
+    vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x, y and z - 0x80 to complete the conversion
+    vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
+    // Fix y, z and w because they are too large
+    vTemp = _mm_mul_ps(vTemp,LoadByteN4Mul);
+    // Clamp result (for case of -128)
+    return _mm_max_ps( vTemp, g_XMNegativeOne );
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadByte4
+(
+    const XMBYTE4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = {
+        (float)pSource->x,
+        (float)pSource->y,
+        (float)pSource->z,
+        (float)pSource->w
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
+    int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u32(vInt8) );
+    int32x4_t vInt = vmovl_s16( vget_low_s16(vInt16) );
+    return vcvtq_f32_s32(vInt);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 LoadByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
+    // Splat the value in all four entries
+    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
+    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
+    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
+    // x,y and z are unsigned! Flip the bits to convert the order to signed
+    vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
+    // Convert to floating point numbers
+    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
+    // x, y and z - 0x80 to complete the conversion
+    vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
+    // Fix y, z and w because they are too large
+    vTemp = _mm_mul_ps(vTemp,LoadByte4Mul);
+    return vTemp;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadUNibble4
+(
+    const XMUNIBBLE4* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = {
+        float(pSource->v & 0xF),
+        float((pSource->v >> 4) & 0xF),
+        float((pSource->v >> 8) & 0xF),
+        float((pSource->v >> 12) & 0xF)
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORI32 UNibble4And = {0xF,0xF0,0xF00,0xF000};
+    static const XMVECTORF32 UNibble4Mul = {1.0f,1.0f/16.f,1.0f/256.f,1.0f/4096.f};
+    uint16x4_t vInt16 = vld1_dup_u16( reinterpret_cast<const uint16_t*>( pSource ) );
+    uint32x4_t vInt = vmovl_u16( vInt16 );
+    vInt = vandq_u32(vInt,UNibble4And);
+    float32x4_t R = vcvtq_f32_u32(vInt);
+    return vmulq_f32(R,UNibble4Mul);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORI32 UNibble4And = {0xF,0xF0,0xF00,0xF000};
+    static const XMVECTORF32 UNibble4Mul = {1.0f,1.0f/16.f,1.0f/256.f,1.0f/4096.f};
+    // Get the 32 bit value and splat it
+    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Mask off x, y, z and w
+    vResult = _mm_and_ps(vResult,UNibble4And);
+    // Convert to float
+    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
+    // Normalize x, y, z and w
+    vResult = _mm_mul_ps(vResult,UNibble4Mul);
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XM_CALLCONV PackedVector::XMLoadU555
+(
+    const XMU555* pSource
+)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTORF32 vResult = {
+        float(pSource->v & 0x1F),
+        float((pSource->v >> 5) & 0x1F),
+        float((pSource->v >> 10) & 0x1F),
+        float((pSource->v >> 15) & 0x1)
+    };
+    return vResult.v;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORI32 U555And = {0x1F,0x1F<<5,0x1F<<10,0x8000};
+    static const XMVECTORF32 U555Mul = {1.0f,1.0f/32.f,1.0f/1024.f,1.0f/32768.f};
+    uint16x4_t vInt16 = vld1_dup_u16( reinterpret_cast<const uint16_t*>( pSource ) );
+    uint32x4_t vInt = vmovl_u16( vInt16 );
+    vInt = vandq_u32(vInt,U555And);
+    float32x4_t R = vcvtq_f32_u32(vInt);
+    return vmulq_f32(R,U555Mul);
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORI32 U555And = {0x1F,0x1F<<5,0x1F<<10,0x8000};
+    static const XMVECTORF32 U555Mul = {1.0f,1.0f/32.f,1.0f/1024.f,1.0f/32768.f};
+    // Get the 32 bit value and splat it
+    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
+    // Mask off x, y, z and w
+    vResult = _mm_and_ps(vResult,U555And);
+    // Convert to float
+    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
+    // Normalize x, y, z and w
+    vResult = _mm_mul_ps(vResult,U555Mul);
+    return vResult;
+#endif
+}
+
+#pragma prefast(pop)
+
+/****************************************************************************
+ *
+ * Vector and matrix store operations
+ *
+ ****************************************************************************/
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreColor
+(
+    XMCOLOR* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
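+    // [Editorial worked example, not part of the original source: for
+    //  V = (1.0f, 0.5f, 0.0f, 1.0f) the saturate/scale/round below produces
+    //  the bytes (255, 128, 0, 255), which pack into c = 0xFFFF8000 in ARGB
+    //  order (w in bits 24-31, x in 16-23, y in 8-15, z in 0-7).]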
+    XMVECTOR N = XMVectorSaturate(V);
+    N = XMVectorMultiply(N, g_UByteMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A( &tmp, N );
+
+    pDestination->c = ((uint32_t)tmp.w << 24) |
+                      ((uint32_t)tmp.x << 16) |
+                      ((uint32_t)tmp.y << 8) |
+                      ((uint32_t)tmp.z);
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0) );
+    R = vminq_f32(R, vdupq_n_f32(1.0f));
+    R = vmulq_n_f32( R, 255.0f );
+    R = XMVectorRound(R);
+    uint32x4_t vInt32 = vcvtq_u32_f32(R);
+    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
+    uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) );
+    uint32_t rgba = vget_lane_u32( vreinterpret_u32_u8(vInt8), 0 );
+    pDestination->c = (rgba & 0xFF00FF00) | ((rgba >> 16) & 0xFF) | ((rgba << 16) & 0xFF0000);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Set <0 to 0
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    // Set >1 to 1
+    vResult = _mm_min_ps(vResult,g_XMOne);
+    // Convert to 0-255
+    vResult = _mm_mul_ps(vResult,g_UByteMax);
+    // Shuffle RGBA to ARGB
+    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
+    // Convert to int
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // Mash to shorts
+    vInt = _mm_packs_epi32(vInt,vInt);
+    // Mash to bytes
+    vInt = _mm_packus_epi16(vInt,vInt);
+    // Store the color
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->c),_mm_castsi128_ps(vInt));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreHalf2
+(
+    XMHALF2* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    __m128i V1 = _mm_cvtps_ph( V, 0 );
+    _mm_store_ss( reinterpret_cast<float*>(pDestination), _mm_castsi128_ps(V1) );
+#else
+    pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
+    pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
+#endif // !_XM_F16C_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreShortN2
+(
+    XMSHORTN2* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+    N = XMVectorMultiply(N, g_ShortMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A( &tmp, N );
+
+    pDestination->x = (int16_t)tmp.x;
+    pDestination->y = (int16_t)tmp.y;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f) );
+    R = vminq_f32(R, vdupq_n_f32(1.0f));
+    R = vmulq_n_f32( R, 32767.0f );
+    int32x4_t vInt32 = vcvtq_s32_f32(R);
+    int16x4_t vInt16 = vqmovn_s32( vInt32 );
+    vst1_lane_u32( &pDestination->v, vreinterpret_u32_s16(vInt16), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+    vResult = _mm_mul_ps(vResult,g_ShortMax);
+    __m128i vResulti = _mm_cvtps_epi32(vResult);
+    vResulti = _mm_packs_epi32(vResulti,vResulti);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->x),_mm_castsi128_ps(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreShort2
+(
+    XMSHORT2* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_ShortMin, g_ShortMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A( &tmp, N );
+
+    pDestination->x = (int16_t)tmp.x;
+    pDestination->y = (int16_t)tmp.y;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-32767.f) );
+    R = vminq_f32(R, vdupq_n_f32(32767.0f));
+    int32x4_t vInt32 = vcvtq_s32_f32(R);
+    int16x4_t vInt16 = vqmovn_s32( vInt32 );
+    vst1_lane_u32( &pDestination->v, vreinterpret_u32_s16(vInt16), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V,g_ShortMin);
+    vResult = _mm_min_ps(vResult,g_ShortMax);
+    // Convert to int with rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // Pack the ints into shorts
+    vInt = _mm_packs_epi32(vInt,vInt);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->x),_mm_castsi128_ps(vInt));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreUShortN2
+(
+    XMUSHORTN2* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorSaturate(V);
+    N = XMVectorMultiplyAdd(N, g_UShortMax, g_XMOneHalf.v);
+    N = XMVectorTruncate(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A( &tmp, N );
+
+    pDestination->x = (uint16_t)tmp.x;
+    pDestination->y = (uint16_t)tmp.y;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) );
+    R = vminq_f32(R, vdupq_n_f32(1.0f));
+    R = vmulq_n_f32( R, 65535.0f );
+    R = vaddq_f32( R, g_XMOneHalf );
+    uint32x4_t vInt32 = vcvtq_u32_f32(R);
+    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
+    vst1_lane_u32( &pDestination->v, vreinterpret_u32_u16(vInt16), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+    vResult = _mm_mul_ps(vResult,g_UShortMax);
+    vResult = _mm_add_ps(vResult,g_XMOneHalf);
+    // Convert to int
+    __m128i vInt = _mm_cvttps_epi32(vResult);
+    // Since the SSE pack instruction clamps using signed rules,
+    // manually extract the values to store them to memory
+    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreUShort2
+(
+    XMUSHORT2* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UShortMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A( &tmp, N );
+
+    pDestination->x = (uint16_t)tmp.x;
+    pDestination->y = (uint16_t)tmp.y;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) );
+    R = vminq_f32(R, vdupq_n_f32(65535.0f));
+    uint32x4_t vInt32 = vcvtq_u32_f32(R);
+    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
+    vst1_lane_u32( &pDestination->v, vreinterpret_u32_u16(vInt16), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,g_UShortMax);
+    // Convert to int with rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // Since the SSE pack instruction clamps using signed rules,
+    // manually extract the values to store them to memory
+    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreByteN2
+(
+    XMBYTEN2* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+    N = XMVectorMultiply(N, g_ByteMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A( &tmp, N );
+
+    pDestination->x = (int8_t)tmp.x;
+    pDestination->y = (int8_t)tmp.y;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f) );
+    R = vminq_f32(R, vdupq_n_f32(1.0f));
+    R = vmulq_n_f32( R, 127.0f );
+    int32x4_t vInt32 = vcvtq_s32_f32(R);
+    int16x4_t vInt16 = vqmovn_s32( vInt32 );
+    int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) );
+    vst1_lane_u16( reinterpret_cast<uint16_t*>( pDestination ), vreinterpret_u16_s8(vInt8), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult,g_ByteMax);
+    // Convert to int by rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // No SSE operations will write to 16-bit values, so we have to extract them manually
+    uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+    pDestination->v = ((y & 0xFF) << 8) | (x & 0xFF);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreByte2
+(
+    XMBYTE2* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_ByteMin, g_ByteMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A( &tmp, N );
+
+    pDestination->x = (int8_t)tmp.x;
+    pDestination->y = (int8_t)tmp.y;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-127.f) );
+    R = vminq_f32(R, vdupq_n_f32(127.0f));
+    int32x4_t vInt32 = vcvtq_s32_f32(R);
+    int16x4_t vInt16 = vqmovn_s32( vInt32 );
+    int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) );
+    vst1_lane_u16( reinterpret_cast<uint16_t*>( pDestination ), vreinterpret_u16_s8(vInt8), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,g_ByteMin);
+    vResult = _mm_min_ps(vResult,g_ByteMax);
+    // Convert to int by rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // No SSE operations will write to 16-bit values, so we have to extract them manually
+    uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+    pDestination->v = ((y & 0xFF) << 8) | (x & 0xFF);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreUByteN2
+(
+    XMUBYTEN2* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorSaturate(V);
+    N = XMVectorMultiplyAdd(N, g_UByteMax, g_XMOneHalf.v);
+    N = XMVectorTruncate(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A( &tmp, N );
+
+    pDestination->x = (uint8_t)tmp.x;
+    pDestination->y = (uint8_t)tmp.y;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) );
+    R = vminq_f32(R, vdupq_n_f32(1.0f));
+    R = vmulq_n_f32( R, 255.0f );
+    R = vaddq_f32( R, g_XMOneHalf );
+    uint32x4_t vInt32 = vcvtq_u32_f32(R);
+    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
+    uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) );
+    vst1_lane_u16( reinterpret_cast<uint16_t*>( pDestination ), vreinterpret_u16_u8(vInt8), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult,g_UByteMax);
+    vResult = _mm_add_ps(vResult,g_XMOneHalf);
+    // Convert to int
+    __m128i vInt = _mm_cvttps_epi32(vResult);
+    // No SSE operations will write to 16-bit values, so we have to extract them manually
+    uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+    pDestination->v = ((y & 0xFF) << 8) | (x & 0xFF);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreUByte2
+(
+    XMUBYTE2* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UByteMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A( &tmp, N );
+
+    pDestination->x = (uint8_t)tmp.x;
+    pDestination->y = (uint8_t)tmp.y;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) );
+    R = vminq_f32(R, vdupq_n_f32(255.0f));
+    uint32x4_t vInt32 = vcvtq_u32_f32(R);
+    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
+    uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) );
+    vst1_lane_u16( reinterpret_cast<uint16_t*>( pDestination ), vreinterpret_u16_u8(vInt8), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,g_UByteMax);
+    // Convert to int by rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // No SSE operations will write to 16-bit values, so we have to extract them manually
+    uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+    pDestination->v = ((y & 0xFF) << 8) | (x & 0xFF);
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreU565
+(
+    XMU565* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    static const XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f};
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A( &tmp, N );
+
+    pDestination->v = (((uint16_t)tmp.z & 0x1F) << 11) |
+                      (((uint16_t)tmp.y & 0x3F) << 5) |
+                      (((uint16_t)tmp.x & 0x1F));
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f};
+    static const XMVECTORF32 Scale = {1.0f,32.f,32.f*64.f, 0.f };
+    static const XMVECTORU32 Mask = {0x1F,0x3F<<5,0x1F<<11,0};
+    float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0));
+    vResult = vminq_f32(vResult,Max);
+    vResult = vmulq_f32(vResult,Scale);
+    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
+    vResulti = vandq_u32(vResulti,Mask);
+    // Do a horizontal or of 4 entries
+    uint32x2_t vTemp = vget_low_u32(vResulti);
+    uint32x2_t vhi = vget_high_u32(vResulti);
+    vTemp = vorr_u32( vTemp, vhi );
+    vTemp = vpadd_u32( vTemp, vTemp );
+    vst1_lane_u16( &pDestination->v, vreinterpret_u16_u32( vTemp ), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f};
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,Max);
+    // Convert to int with rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // No SSE operations will write to 16-bit values, so we have to extract them manually
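+    // [Editorial note, not part of the original source: _mm_extract_epi16
+    //  indexes 16-bit lanes, so the even indices 0/2/4 below pick the low
+    //  half of the 32-bit x/y/z lanes; the values always fit in 16 bits
+    //  because they were clamped to 31/63/31 above.]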
+    uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+    uint16_t z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4));
+    pDestination->v = ((z & 0x1F) << 11) |
+                      ((y & 0x3F) << 5) |
+                      ((x & 0x1F));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreFloat3PK
+(
+    XMFLOAT3PK* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+
+    __declspec(align(16)) uint32_t IValue[4];
+    XMStoreFloat3A( reinterpret_cast<XMFLOAT3A*>(&IValue), V );
+
+    uint32_t Result[3];
+
+    // X & Y Channels (5-bit exponent, 6-bit mantissa)
+    for(uint32_t j=0; j < 2; ++j)
+    {
+        uint32_t Sign = IValue[j] & 0x80000000;
+        uint32_t I = IValue[j] & 0x7FFFFFFF;
+
+        if ((I & 0x7F800000) == 0x7F800000)
+        {
+            // INF or NAN
+            Result[j] = 0x7c0;
+            if (( I & 0x7FFFFF ) != 0)
+            {
+                Result[j] = 0x7c0 | (((I>>17)|(I>>11)|(I>>6)|(I))&0x3f);
+            }
+            else if ( Sign )
+            {
+                // -INF is clamped to 0 since 3PK is positive only
+                Result[j] = 0;
+            }
+        }
+        else if ( Sign )
+        {
+            // 3PK is positive only, so clamp to zero
+            Result[j] = 0;
+        }
+        else if (I > 0x477E0000U)
+        {
+            // The number is too large to be represented as a float11, set to max
+            Result[j] = 0x7BF;
+        }
+        else
+        {
+            if (I < 0x38800000U)
+            {
+                // The number is too small to be represented as a normalized float11
+                // Convert it to a denormalized value.
+                uint32_t Shift = 113U - (I >> 23U);
+                I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
+            }
+            else
+            {
+                // Rebias the exponent to represent the value as a normalized float11
+                I += 0xC8000000U;
+            }
+
+            Result[j] = ((I + 0xFFFFU + ((I >> 17U) & 1U)) >> 17U)&0x7ffU;
+        }
+    }
+
+    // Z Channel (5-bit exponent, 5-bit mantissa)
+    uint32_t Sign = IValue[2] & 0x80000000;
+    uint32_t I = IValue[2] & 0x7FFFFFFF;
+
+    if ((I & 0x7F800000) == 0x7F800000)
+    {
+        // INF or NAN
+        Result[2] = 0x3e0;
+        if ( I & 0x7FFFFF )
+        {
+            Result[2] = 0x3e0 | (((I>>18)|(I>>13)|(I>>3)|(I))&0x1f);
+        }
+        else if ( Sign )
+        {
+            // -INF is clamped to 0 since 3PK is positive only
+            Result[2] = 0;
+        }
+    }
+    else if ( Sign )
+    {
+        // 3PK is positive only, so clamp to zero
+        Result[2] = 0;
+    }
+    else if (I > 0x477C0000U)
+    {
+        // The number is too large to be represented as a float10, set to max
+        Result[2] = 0x3df;
+    }
+    else
+    {
+        if (I < 0x38800000U)
+        {
+            // The number is too small to be represented as a normalized float10
+            // Convert it to a denormalized value.
+            uint32_t Shift = 113U - (I >> 23U);
+            I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
+        }
+        else
+        {
+            // Rebias the exponent to represent the value as a normalized float10
+            I += 0xC8000000U;
+        }
+
+        Result[2] = ((I + 0x1FFFFU + ((I >> 18U) & 1U)) >> 18U)&0x3ffU;
+    }
+
+    // Pack Result into memory
+    pDestination->v = (Result[0] & 0x7ff)
+                      | ( (Result[1] & 0x7ff) << 11 )
+                      | ( (Result[2] & 0x3ff) << 22 );
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreFloat3SE
+(
+    XMFLOAT3SE* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+
+    XMFLOAT3A tmp;
+    XMStoreFloat3A( &tmp, V );
+
+    static const float maxf9 = float(0x1FF << 7);
+    static const float minf9 = float(1.f / (1 << 16));
+
+    float x = (tmp.x >= 0.f) ? ( (tmp.x > maxf9) ? maxf9 : tmp.x ) : 0.f;
+    float y = (tmp.y >= 0.f) ? ( (tmp.y > maxf9) ? maxf9 : tmp.y ) : 0.f;
+    float z = (tmp.z >= 0.f) ? ( (tmp.z > maxf9) ? maxf9 : tmp.z ) : 0.f;
+
+    const float max_xy = (x > y) ? x : y;
+    const float max_xyz = (max_xy > z) ? max_xy : z;
+
+    const float maxColor = (max_xyz > minf9) ? max_xyz : minf9;
+
+    union { float f; int32_t i; } fi;
+    fi.f = maxColor;
+    fi.i += 0x00004000; // round up leaving 9 bits in fraction (including assumed 1)
+
+    uint32_t exp = fi.i >> 23;
+    pDestination->e = exp - 0x6f;
+
+    fi.i = 0x83000000 - (exp << 23);
+    float ScaleR = fi.f;
+
+#ifdef _XM_NO_ROUNDF_
+    pDestination->xm = static_cast<uint32_t>( Internal::round_to_nearest(x * ScaleR) );
+    pDestination->ym = static_cast<uint32_t>( Internal::round_to_nearest(y * ScaleR) );
+    pDestination->zm = static_cast<uint32_t>( Internal::round_to_nearest(z * ScaleR) );
+#else
+    pDestination->xm = static_cast<uint32_t>( lroundf(x * ScaleR) );
+    pDestination->ym = static_cast<uint32_t>( lroundf(y * ScaleR) );
+    pDestination->zm = static_cast<uint32_t>( lroundf(z * ScaleR) );
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreHalf4
+(
+    XMHALF4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
+    __m128i V1 = _mm_cvtps_ph( V, 0 );
+    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), V1 );
+#else
+    XMFLOAT4A t;
+    XMStoreFloat4A(&t, V );
+
+    pDestination->x = XMConvertFloatToHalf(t.x);
+    pDestination->y = XMConvertFloatToHalf(t.y);
+    pDestination->z = XMConvertFloatToHalf(t.z);
+    pDestination->w = XMConvertFloatToHalf(t.w);
+#endif // !_XM_F16C_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreShortN4
+(
+    XMSHORTN4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+    N = XMVectorMultiply(N, g_ShortMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->x = (int16_t)tmp.x;
+    pDestination->y = (int16_t)tmp.y;
+    pDestination->z = (int16_t)tmp.z;
+    pDestination->w = (int16_t)tmp.w;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmaxq_f32( V, vdupq_n_f32(-1.f) );
+    vResult = vminq_f32( vResult, vdupq_n_f32(1.0f) );
+    vResult = vmulq_n_f32( vResult, 32767.0f );
+    vResult = vcvtq_s32_f32( vResult );
+    int16x4_t vInt = vmovn_s32( vResult );
+    vst1_s16( reinterpret_cast<int16_t*>(pDestination), vInt );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+    vResult = _mm_mul_ps(vResult,g_ShortMax);
+    __m128i vResulti = _mm_cvtps_epi32(vResult);
+    vResulti = _mm_packs_epi32(vResulti,vResulti);
+    _mm_store_sd(reinterpret_cast<double*>(&pDestination->x),_mm_castsi128_pd(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreShort4
+(
+    XMSHORT4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_ShortMin, g_ShortMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->x = (int16_t)tmp.x;
+    pDestination->y = (int16_t)tmp.y;
+    pDestination->z = (int16_t)tmp.z;
+    pDestination->w = (int16_t)tmp.w;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmaxq_f32( V, g_ShortMin );
+    vResult = vminq_f32( vResult, g_ShortMax );
+    vResult = vcvtq_s32_f32( vResult );
+    int16x4_t vInt = vmovn_s32( vResult );
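+    // [Editorial note, not part of the original source: vmovn_s32 narrows by
+    //  truncating each 32-bit lane to its low 16 bits without saturating;
+    //  that is only safe here because of the preceding clamp to the
+    //  [-32767, 32767] range.]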
+    vst1_s16( reinterpret_cast<int16_t*>(pDestination), vInt );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V,g_ShortMin);
+    vResult = _mm_min_ps(vResult,g_ShortMax);
+    // Convert to int with rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // Pack the ints into shorts
+    vInt = _mm_packs_epi32(vInt,vInt);
+    _mm_store_sd(reinterpret_cast<double*>(&pDestination->x),_mm_castsi128_pd(vInt));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreUShortN4
+(
+    XMUSHORTN4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorSaturate(V);
+    N = XMVectorMultiplyAdd(N, g_UShortMax, g_XMOneHalf.v);
+    N = XMVectorTruncate(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->x = (uint16_t)tmp.x;
+    pDestination->y = (uint16_t)tmp.y;
+    pDestination->z = (uint16_t)tmp.z;
+    pDestination->w = (uint16_t)tmp.w;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmaxq_f32( V, vdupq_n_f32(0) );
+    vResult = vminq_f32( vResult, vdupq_n_f32(1.0f) );
+    vResult = vmulq_n_f32( vResult, 65535.0f );
+    vResult = vaddq_f32( vResult, g_XMOneHalf );
+    vResult = vcvtq_u32_f32( vResult );
+    uint16x4_t vInt = vmovn_u32( vResult );
+    vst1_u16( reinterpret_cast<uint16_t*>(pDestination), vInt );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+    vResult = _mm_mul_ps(vResult,g_UShortMax);
+    vResult = _mm_add_ps(vResult,g_XMOneHalf);
+    // Convert to int
+    __m128i vInt = _mm_cvttps_epi32(vResult);
+    // Since the SSE pack instruction clamps using signed rules,
+    // manually extract the values to store them to memory
+    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+    pDestination->z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4));
+    pDestination->w = static_cast<uint16_t>(_mm_extract_epi16(vInt,6));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreUShort4
+(
+    XMUSHORT4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UShortMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->x = (uint16_t)tmp.x;
+    pDestination->y = (uint16_t)tmp.y;
+    pDestination->z = (uint16_t)tmp.z;
+    pDestination->w = (uint16_t)tmp.w;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t vResult = vmaxq_f32( V, vdupq_n_f32(0) );
+    vResult = vminq_f32( vResult, g_UShortMax );
+    vResult = vcvtq_u32_f32( vResult );
+    uint16x4_t vInt = vmovn_u32( vResult );
+    vst1_u16( reinterpret_cast<uint16_t*>(pDestination), vInt );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,g_UShortMax);
+    // Convert to int with rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // Since the SSE pack instruction clamps using signed rules,
+    // manually extract the values to store them to memory
+    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+    pDestination->z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4));
+    pDestination->w = static_cast<uint16_t>(_mm_extract_epi16(vInt,6));
+#endif
+}
+
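+// [Editorial usage sketch, not part of the original source: round-tripping a
+//  normalized vector through XMUSHORTN4. The store scales by 65535 and adds
+//  0.5 before truncating, so 0.5f packs as 32768 and reloads as 32768/65535,
+//  roughly 0.500008f:
+//
+//      using namespace DirectX;
+//      using namespace DirectX::PackedVector;
+//
+//      XMUSHORTN4 packed;
+//      XMStoreUShortN4(&packed, XMVectorSet(0.f, 0.25f, 0.5f, 1.f));
+//      XMVECTOR v = XMLoadUShortN4(&packed); // ~(0.f, 0.250004f, 0.500008f, 1.f)
+// ]
+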
+//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV PackedVector::XMStoreXDecN4 +( + XMXDECN4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f}; + static const XMVECTORF32 Scale = {511.0f, 511.0f, 511.0f, 3.0f}; + + XMVECTOR N = XMVectorClamp(V, Min.v, g_XMOne.v); + N = XMVectorMultiply(N, Scale.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = ((uint32_t)tmp.w << 30) | + (((int32_t)tmp.z & 0x3FF) << 20) | + (((int32_t)tmp.y & 0x3FF) << 10) | + (((int32_t)tmp.x & 0x3FF)); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f}; + static const XMVECTORF32 Scale = {511.0f, 511.0f*1024.0f, 511.0f*1048576.0f,3.0f*536870912.0f}; + static const XMVECTORI32 ScaleMask = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<29}; + float32x4_t vResult = vmaxq_f32(V,Min); + vResult = vminq_f32(vResult,vdupq_n_f32(1.0f)); + vResult = vmulq_f32(vResult,Scale); + int32x4_t vResulti = vcvtq_s32_f32(vResult); + vResulti = vandq_s32(vResulti,ScaleMask); + int32x4_t vResultw = vandq_s32(vResulti,g_XMMaskW); + vResulti = vaddq_s32(vResulti,vResultw); + // Do a horizontal or of all 4 entries + uint32x2_t vTemp = vget_low_u32(vreinterpret_u32_s32(vResulti)); + uint32x2_t vhi = vget_high_u32(vreinterpret_u32_s32(vResulti)); + vTemp = vorr_u32( vTemp, vhi ); + vTemp = vpadd_u32( vTemp, vTemp ); + vst1_lane_u32( &pDestination->v, vTemp, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f}; + static const XMVECTORF32 Scale = {511.0f, 511.0f*1024.0f, 511.0f*1048576.0f,3.0f*536870912.0f}; + static const XMVECTORI32 ScaleMask = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<29}; + XMVECTOR vResult = _mm_max_ps(V,Min); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,Scale); + // Convert to int (W is unsigned) + __m128i vResulti = _mm_cvtps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,ScaleMask); + // To fix W, add itself to shift it up to <<30 instead of <<29 + __m128i vResultw = _mm_and_si128(vResulti,g_XMMaskW); + vResulti = _mm_add_epi32(vResulti,vResultw); + // Do a horizontal or of all 4 entries + vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vResulti),_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult)); + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult)); + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult)); + _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +#pragma warning(push) +#pragma warning(disable : 4996) +// C4996: ignore deprecation warning + +_Use_decl_annotations_ +inline void XM_CALLCONV PackedVector::XMStoreXDec4 +( + XMXDEC4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 Min = {-511.0f, -511.0f, -511.0f, 0.0f}; + static const XMVECTORF32 Max = {511.0f, 511.0f, 511.0f, 3.0f}; + + XMVECTOR N = XMVectorClamp(V, Min, Max); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = ((uint32_t)tmp.w << 30) | + (((int32_t)tmp.z & 0x3FF) << 20) | + (((int32_t)tmp.y & 0x3FF) 
<< 10) | + (((int32_t)tmp.x & 0x3FF)); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 MinXDec4 = {-511.0f,-511.0f,-511.0f, 0.0f}; + static const XMVECTORF32 MaxXDec4 = { 511.0f, 511.0f, 511.0f, 3.0f}; + static const XMVECTORF32 ScaleXDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f}; + static const XMVECTORI32 MaskXDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; + float32x4_t vResult = vmaxq_f32(V,MinXDec4); + vResult = vminq_f32(vResult,MaxXDec4); + vResult = vmulq_f32(vResult,ScaleXDec4); + int32x4_t vResulti = vcvtq_s32_f32(vResult); + vResulti = vandq_s32(vResulti,MaskXDec4); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vreinterpret_u32_s32(vResulti)); + uint32x2_t vTemp2 = vget_high_u32(vreinterpret_u32_s32(vResulti)); + vTemp = vorr_u32( vTemp, vTemp2 ); + // Perform a single bit left shift on y|w + vTemp2 = vdup_lane_u32( vTemp, 1 ); + vTemp2 = vadd_s32( vTemp2, vTemp2 ); + vTemp = vorr_u32( vTemp, vTemp2 ); + vst1_lane_u32( &pDestination->v, vTemp, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 MinXDec4 = {-511.0f,-511.0f,-511.0f, 0.0f}; + static const XMVECTORF32 MaxXDec4 = { 511.0f, 511.0f, 511.0f, 3.0f}; + static const XMVECTORF32 ScaleXDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f}; + static const XMVECTORI32 MaskXDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,MinXDec4); + vResult = _mm_min_ps(vResult,MaxXDec4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleXDec4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskXDec4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // Perform a single bit left shift on y|w + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); +#endif +} + +#pragma warning(pop) + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV PackedVector::XMStoreUDecN4 +( + XMUDECN4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 Scale = {1023.0f, 1023.0f, 1023.0f, 3.0f}; + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiply(N, Scale.v); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = ((uint32_t)tmp.w << 30) | + (((uint32_t)tmp.z & 0x3FF) << 20) | + (((uint32_t)tmp.y & 0x3FF) << 10) | + (((uint32_t)tmp.x & 0x3FF)); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 ScaleUDecN4 = {1023.0f,1023.0f*1024.0f*0.5f,1023.0f*1024.0f*1024.0f,3.0f*1024.0f*1024.0f*1024.0f*0.5f}; + static const XMVECTORI32 MaskUDecN4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; + float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0.f)); + vResult = vminq_f32(vResult,vdupq_n_f32(1.f)); + vResult = vmulq_f32(vResult,ScaleUDecN4); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti,MaskUDecN4); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vTemp2 = vget_high_u32(vResulti); + vTemp = vorr_u32( vTemp, 
vTemp2 ); + // Perform a single bit left shift on y|w + vTemp2 = vdup_lane_u32( vTemp, 1 ); + vTemp2 = vadd_u32( vTemp2, vTemp2 ); + vTemp = vorr_u32( vTemp, vTemp2 ); + vst1_lane_u32( &pDestination->v, vTemp, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleUDecN4 = {1023.0f,1023.0f*1024.0f*0.5f,1023.0f*1024.0f*1024.0f,3.0f*1024.0f*1024.0f*1024.0f*0.5f}; + static const XMVECTORI32 MaskUDecN4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleUDecN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskUDecN4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // Perform a left shift by one bit on y|w + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast(&pDestination->v),_mm_castsi128_ps(vResulti)); +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XM_CALLCONV PackedVector::XMStoreUDecN4_XR +( + XMUDECN4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 Scale = { 510.0f, 510.0f, 510.0f, 3.0f }; + static const XMVECTORF32 Bias = { 384.0f, 384.0f, 384.0f, 0.0f }; + static const XMVECTORF32 C = { 1023.f, 1023.f, 1023.f, 3.f }; + + XMVECTOR N = XMVectorMultiplyAdd( V, Scale, Bias ); + N = XMVectorClamp( N, g_XMZero, C ); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = ((uint32_t)tmp.w << 30) + | (((uint32_t)tmp.z & 0x3FF) << 20) + | (((uint32_t)tmp.y & 0x3FF) << 10) + | (((uint32_t)tmp.x & 0x3FF)); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Shift = {1.0f,1024.0f*0.5f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f*0.5f}; + static const XMVECTORU32 MaskUDecN4 = {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; + static const XMVECTORF32 Scale = { 510.0f, 510.0f, 510.0f, 3.0f }; + static const XMVECTORF32 Bias = { 384.0f, 384.0f, 384.0f, 0.0f }; + static const XMVECTORF32 C = { 1023.f, 1023.f, 1023.f, 3.f }; + float32x4_t vResult = vmlaq_f32( Bias, V, Scale ); + vResult = vmaxq_f32(vResult,vdupq_n_f32(0.f)); + vResult = vminq_f32(vResult,C); + vResult = vmulq_f32(vResult,Shift); + uint32x4_t vResulti = vcvtq_u32_f32(vResult); + vResulti = vandq_u32(vResulti,MaskUDecN4); + // Do a horizontal or of 4 entries + uint32x2_t vTemp = vget_low_u32(vResulti); + uint32x2_t vTemp2 = vget_high_u32(vResulti); + vTemp = vorr_u32( vTemp, vTemp2 ); + // Perform a single bit left shift on y|w + vTemp2 = vdup_lane_u32( vTemp, 1 ); + vTemp2 = vadd_u32( vTemp2, vTemp2 ); + vTemp = vorr_u32( vTemp, vTemp2 ); + vst1_lane_u32( &pDestination->v, vTemp, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Shift = {1.0f,1024.0f*0.5f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f*0.5f}; + static const XMVECTORU32 MaskUDecN4 = {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; + static const XMVECTORF32 Scale = { 510.0f, 510.0f, 510.0f, 3.0f }; + static const XMVECTORF32 Bias = { 384.0f, 384.0f, 384.0f, 0.0f }; + static const XMVECTORF32 C = { 1023.f, 
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreUDec4
+(
+    XMUDEC4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    static const XMVECTORF32 Max = {1023.0f, 1023.0f, 1023.0f, 3.0f};
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->v = ((uint32_t)tmp.w << 30) |
+                      (((uint32_t)tmp.z & 0x3FF) << 20) |
+                      (((uint32_t)tmp.y & 0x3FF) << 10) |
+                      (((uint32_t)tmp.x & 0x3FF));
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 MaxUDec4 = { 1023.0f, 1023.0f, 1023.0f, 3.0f};
+    static const XMVECTORF32 ScaleUDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f};
+    static const XMVECTORI32 MaskUDec4 = {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};
+    float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0.f));
+    vResult = vminq_f32(vResult,MaxUDec4);
+    vResult = vmulq_f32(vResult,ScaleUDec4);
+    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
+    vResulti = vandq_u32(vResulti,MaskUDec4);
+    // Do a horizontal or of 4 entries
+    uint32x2_t vTemp = vget_low_u32(vResulti);
+    uint32x2_t vTemp2 = vget_high_u32(vResulti);
+    vTemp = vorr_u32( vTemp, vTemp2 );
+    // Perform a single bit left shift on y|w
+    vTemp2 = vdup_lane_u32( vTemp, 1 );
+    vTemp2 = vadd_u32( vTemp2, vTemp2 );
+    vTemp = vorr_u32( vTemp, vTemp2 );
+    vst1_lane_u32( &pDestination->v, vTemp, 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 MaxUDec4 = { 1023.0f, 1023.0f, 1023.0f, 3.0f};
+    static const XMVECTORF32 ScaleUDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f};
+    static const XMVECTORI32 MaskUDec4 = {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,MaxUDec4);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult,ScaleUDec4);
+    // Convert to int
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti,MaskUDec4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+    // Perform a single bit left shift on y|w
+    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),_mm_castsi128_ps(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+#pragma warning(push)
+#pragma warning(disable : 4996)
+// C4996: ignore deprecation warning
+
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreDecN4
+(
+    XMDECN4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    static const XMVECTORF32 Scale = {511.0f, 511.0f, 511.0f, 1.0f};
+
+    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+    N = XMVectorMultiply(N, Scale.v);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->v = ((int32_t)tmp.w << 30) |
+                      (((int32_t)tmp.z & 0x3FF) << 20) |
+                      (((int32_t)tmp.y & 0x3FF) << 10) |
+                      (((int32_t)tmp.x & 0x3FF));
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 ScaleDecN4 = {511.0f,511.0f*1024.0f,511.0f*1024.0f*1024.0f,1.0f*1024.0f*1024.0f*1024.0f};
+    float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(-1.f));
+    vResult = vminq_f32(vResult,vdupq_n_f32(1.f));
+    vResult = vmulq_f32(vResult,ScaleDecN4);
+    int32x4_t vResulti = vcvtq_s32_f32(vResult);
+    vResulti = vandq_s32(vResulti,g_XMMaskDec4);
+    // Do a horizontal or of 4 entries
+    uint32x2_t vTemp = vget_low_u32(vreinterpretq_u32_s32(vResulti));
+    uint32x2_t vhi = vget_high_u32(vreinterpretq_u32_s32(vResulti));
+    vTemp = vorr_u32( vTemp, vhi );
+    vTemp = vpadd_u32( vTemp, vTemp );
+    vst1_lane_u32( &pDestination->v, vTemp, 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 ScaleDecN4 = {511.0f,511.0f*1024.0f,511.0f*1024.0f*1024.0f,1.0f*1024.0f*1024.0f*1024.0f};
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult,ScaleDecN4);
+    // Convert to int
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti,g_XMMaskDec4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),_mm_castsi128_ps(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreDec4
+(
+    XMDEC4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    static const XMVECTORF32 Min = {-511.0f, -511.0f, -511.0f, -1.0f};
+    static const XMVECTORF32 Max = {511.0f, 511.0f, 511.0f, 1.0f};
+
+    XMVECTOR N = XMVectorClamp(V, Min, Max);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->v = ((int32_t)tmp.w << 30) |
+                      (((int32_t)tmp.z & 0x3FF) << 20) |
+                      (((int32_t)tmp.y & 0x3FF) << 10) |
+                      (((int32_t)tmp.x & 0x3FF));
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 MinDec4 = {-511.0f,-511.0f,-511.0f,-1.0f};
+    static const XMVECTORF32 MaxDec4 = { 511.0f, 511.0f, 511.0f, 1.0f};
+    static const XMVECTORF32 ScaleDec4 = {1.0f,1024.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f};
+    float32x4_t vResult = vmaxq_f32(V,MinDec4);
+    vResult = vminq_f32(vResult,MaxDec4);
+    vResult = vmulq_f32(vResult,ScaleDec4);
+    int32x4_t vResulti = vcvtq_s32_f32(vResult);
+    vResulti = vandq_s32(vResulti,g_XMMaskDec4);
+    // Do a horizontal or of all 4 entries
+    uint32x2_t vTemp = vget_low_u32(vreinterpretq_u32_s32(vResulti));
+    uint32x2_t vhi = vget_high_u32(vreinterpretq_u32_s32(vResulti));
+    vTemp = vorr_u32( vTemp, vhi );
+    vTemp = vpadd_u32( vTemp, vTemp );
+    vst1_lane_u32( &pDestination->v, vTemp, 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 MinDec4 = {-511.0f,-511.0f,-511.0f,-1.0f};
+    static const XMVECTORF32 MaxDec4 = { 511.0f, 511.0f, 511.0f, 1.0f};
+    static const XMVECTORF32 ScaleDec4 = {1.0f,1024.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f};
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,MinDec4);
+    vResult = _mm_min_ps(vResult,MaxDec4);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult,ScaleDec4);
+    // Convert to int
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti,g_XMMaskDec4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),_mm_castsi128_ps(vResulti));
+#endif
+}
+
+#pragma warning(pop)
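+
+// Signed-field example (illustrative): XMStoreDecN4 maps -1.0f to -511, and
+// masking with 0x3FF keeps the low ten bits of the two's-complement value, so
+// the x field holds 0x201; consumers must sign-extend each 10-bit field when
+// unpacking.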
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreUByteN4
+(
+    XMUBYTEN4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorSaturate(V);
+    N = XMVectorMultiply(N, g_UByteMax);
+    N = XMVectorTruncate(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->x = (uint8_t)tmp.x;
+    pDestination->y = (uint8_t)tmp.y;
+    pDestination->z = (uint8_t)tmp.z;
+    pDestination->w = (uint8_t)tmp.w;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0) );
+    R = vminq_f32(R, vdupq_n_f32(1.0f));
+    R = vmulq_n_f32( R, 255.0f );
+    uint32x4_t vInt32 = vcvtq_u32_f32(R);
+    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
+    uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) );
+    vst1_lane_u32( &pDestination->v, vreinterpret_u32_u8(vInt8), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 ScaleUByteN4 = {255.0f,255.0f*256.0f*0.5f,255.0f*256.0f*256.0f,255.0f*256.0f*256.0f*256.0f*0.5f};
+    static const XMVECTORI32 MaskUByteN4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)};
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult,ScaleUByteN4);
+    // Convert to int
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti,MaskUByteN4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+    // Perform a single bit left shift to fix y|w
+    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),_mm_castsi128_ps(vResulti));
+#endif
+}
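+
+// UNORM byte example (illustrative): 0.5f scales to 0.5 * 255 = 127.5 and the
+// truncating conversion stores 127 (0x7F); the NEON path narrows with
+// saturating vqmovn steps instead, which clamps out-of-range lanes for free.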
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreUByte4
+(
+    XMUBYTE4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UByteMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->x = (uint8_t)tmp.x;
+    pDestination->y = (uint8_t)tmp.y;
+    pDestination->z = (uint8_t)tmp.z;
+    pDestination->w = (uint8_t)tmp.w;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0) );
+    R = vminq_f32(R, vdupq_n_f32(255.0f));
+    uint32x4_t vInt32 = vcvtq_u32_f32(R);
+    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
+    uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) );
+    vst1_lane_u32( &pDestination->v, vreinterpret_u32_u8(vInt8), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 ScaleUByte4 = {1.0f,256.0f*0.5f,256.0f*256.0f,256.0f*256.0f*256.0f*0.5f};
+    static const XMVECTORI32 MaskUByte4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)};
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,g_UByteMax);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult,ScaleUByte4);
+    // Convert to int by rounding
+    __m128i vResulti = _mm_cvtps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti,MaskUByte4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+    // Perform a single bit left shift to fix y|w
+    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),_mm_castsi128_ps(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreByteN4
+(
+    XMBYTEN4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
+    N = XMVectorMultiply(N, g_ByteMax);
+    N = XMVectorTruncate(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->x = (int8_t)tmp.x;
+    pDestination->y = (int8_t)tmp.y;
+    pDestination->z = (int8_t)tmp.z;
+    pDestination->w = (int8_t)tmp.w;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f) );
+    R = vminq_f32(R, vdupq_n_f32(1.0f));
+    R = vmulq_n_f32( R, 127.0f );
+    int32x4_t vInt32 = vcvtq_s32_f32(R);
+    int16x4_t vInt16 = vqmovn_s32( vInt32 );
+    int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) );
+    vst1_lane_u32( &pDestination->v, vreinterpret_u32_s8(vInt8), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 ScaleByteN4 = {127.0f,127.0f*256.0f,127.0f*256.0f*256.0f,127.0f*256.0f*256.0f*256.0f};
+    static const XMVECTORI32 MaskByteN4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24};
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
+    vResult = _mm_min_ps(vResult,g_XMOne);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult,ScaleByteN4);
+    // Convert to int
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti,MaskByteN4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),_mm_castsi128_ps(vResulti));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreByte4
+(
+    XMBYTE4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR N = XMVectorClamp(V, g_ByteMin, g_ByteMax);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->x = (int8_t)tmp.x;
+    pDestination->y = (int8_t)tmp.y;
+    pDestination->z = (int8_t)tmp.z;
+    pDestination->w = (int8_t)tmp.w;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-127.f) );
+    R = vminq_f32(R, vdupq_n_f32(127.f));
+    int32x4_t vInt32 = vcvtq_s32_f32(R);
+    int16x4_t vInt16 = vqmovn_s32( vInt32 );
+    int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) );
+    vst1_lane_u32( &pDestination->v, vreinterpret_u32_s8(vInt8), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 ScaleByte4 = {1.0f,256.0f,256.0f*256.0f,256.0f*256.0f*256.0f};
+    static const XMVECTORI32 MaskByte4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24};
+    // Clamp to bounds
+    XMVECTOR vResult = _mm_max_ps(V,g_ByteMin);
+    vResult = _mm_min_ps(vResult,g_ByteMax);
+    // Scale by multiplication
+    vResult = _mm_mul_ps(vResult,ScaleByte4);
+    // Convert to int by rounding
+    __m128i vResulti = _mm_cvtps_epi32(vResult);
+    // Mask off any fraction
+    vResulti = _mm_and_si128(vResulti,MaskByte4);
+    // Do a horizontal or of 4 entries
+    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
+    // x = x|z, y = y|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    // Move Z to the x position
+    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
+    // i = x|y|z|w
+    vResulti = _mm_or_si128(vResulti,vResulti2);
+    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),_mm_castsi128_ps(vResulti));
+#endif
+}
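+
+// Rounding note (illustrative): the normalized stores truncate after scaling
+// (_mm_cvttps_epi32) while the integer stores round to nearest
+// (_mm_cvtps_epi32); for example XMStoreUByte4 maps 126.6f to 127, but
+// XMStoreUByteN4 maps 126.6f / 255 to 126 because 126.6 truncates.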
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreUNibble4
+(
+    XMUNIBBLE4* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    static const XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f};
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->v = (((uint16_t)tmp.w & 0xF) << 12) |
+                      (((uint16_t)tmp.z & 0xF) << 8) |
+                      (((uint16_t)tmp.y & 0xF) << 4) |
+                      (((uint16_t)tmp.x & 0xF));
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f};
+    static const XMVECTORF32 Scale = {1.0f,16.f,16.f*16.f,16.f*16.f*16.f};
+    static const XMVECTORU32 Mask = {0xF,0xF<<4,0xF<<8,0xF<<12};
+    float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0));
+    vResult = vminq_f32(vResult,Max);
+    vResult = vmulq_f32(vResult,Scale);
+    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
+    vResulti = vandq_u32(vResulti,Mask);
+    // Do a horizontal or of 4 entries
+    uint32x2_t vTemp = vget_low_u32(vResulti);
+    uint32x2_t vhi = vget_high_u32(vResulti);
+    vTemp = vorr_u32( vTemp, vhi );
+    vTemp = vpadd_u32( vTemp, vTemp );
+    vst1_lane_u16( &pDestination->v, vreinterpret_u16_u32( vTemp ), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f};
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,Max);
+    // Convert to int with rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // No SSE operations will write to 16-bit values, so we have to extract them manually
+    uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+    uint16_t z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4));
+    uint16_t w = static_cast<uint16_t>(_mm_extract_epi16(vInt,6));
+    pDestination->v = ((w & 0xF) << 12) |
+                      ((z & 0xF) << 8) |
+                      ((y & 0xF) << 4) |
+                      ((x & 0xF));
+#endif
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XM_CALLCONV PackedVector::XMStoreU555
+(
+    XMU555* pDestination,
+    FXMVECTOR V
+)
+{
+    assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+    static const XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f};
+
+    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v);
+    N = XMVectorRound(N);
+
+    XMFLOAT4A tmp;
+    XMStoreFloat4A(&tmp, N );
+
+    pDestination->v = ((tmp.w > 0.f) ? 0x8000 : 0) |
+                      (((uint16_t)tmp.z & 0x1F) << 10) |
+                      (((uint16_t)tmp.y & 0x1F) << 5) |
+                      (((uint16_t)tmp.x & 0x1F));
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f};
+    static const XMVECTORF32 Scale = {1.0f,32.f/2.f,32.f*32.f,32.f*32.f*32.f/2.f};
+    static const XMVECTORU32 Mask = {0x1F,0x1F<<(5-1),0x1F<<10,0x1<<(15-1)};
+    float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0));
+    vResult = vminq_f32(vResult,Max);
+    vResult = vmulq_f32(vResult,Scale);
+    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
+    vResulti = vandq_u32(vResulti,Mask);
+    // Do a horizontal or of 4 entries
+    uint32x2_t vTemp = vget_low_u32(vResulti);
+    uint32x2_t vTemp2 = vget_high_u32(vResulti);
+    vTemp = vorr_u32( vTemp, vTemp2 );
+    // Perform a single bit left shift on y|w
+    vTemp2 = vdup_lane_u32( vTemp, 1 );
+    vTemp2 = vadd_u32( vTemp2, vTemp2 );
+    vTemp = vorr_u32( vTemp, vTemp2 );
+    vst1_lane_u16( &pDestination->v, vreinterpret_u16_u32( vTemp ), 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    static const XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f};
+    // Bounds check
+    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
+    vResult = _mm_min_ps(vResult,Max);
+    // Convert to int with rounding
+    __m128i vInt = _mm_cvtps_epi32(vResult);
+    // No SSE operations will write to 16-bit values, so we have to extract them manually
+    uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
+    uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
+    uint16_t z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4));
+    uint16_t w = static_cast<uint16_t>(_mm_extract_epi16(vInt,6));
+    pDestination->v = ((w) ? 0x8000 : 0) |
+                      ((z & 0x1F) << 10) |
+                      ((y & 0x1F) << 5) |
+                      ((x & 0x1F));
+#endif
+}
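+
+// Extraction note (illustrative): after _mm_cvtps_epi32 the four results are
+// small non-negative 32-bit values, so _mm_extract_epi16 with lane indices
+// 0, 2, 4 and 6 reads each value's low 16 bits directly; SSE2 has no packed
+// 16-bit store of this shape, hence the scalar repack.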
+
+
+/****************************************************************************
+ *
+ * XMCOLOR operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMCOLOR::XMCOLOR
+(
+    float _r,
+    float _g,
+    float _b,
+    float _a
+)
+{
+    XMStoreColor(this, XMVectorSet(_r, _g, _b, _a));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMCOLOR::XMCOLOR
+(
+    const float* pArray
+)
+{
+    XMStoreColor(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMHALF2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMHALF2::XMHALF2
+(
+    float _x,
+    float _y
+)
+{
+    x = XMConvertFloatToHalf(_x);
+    y = XMConvertFloatToHalf(_y);
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMHALF2::XMHALF2
+(
+    const float* pArray
+)
+{
+    assert( pArray != nullptr );
+    x = XMConvertFloatToHalf(pArray[0]);
+    y = XMConvertFloatToHalf(pArray[1]);
+}
+
+/****************************************************************************
+ *
+ * XMSHORTN2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMSHORTN2::XMSHORTN2
+(
+    float _x,
+    float _y
+)
+{
+    XMStoreShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMSHORTN2::XMSHORTN2
+(
+    const float* pArray
+)
+{
+    XMStoreShortN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMSHORT2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMSHORT2::XMSHORT2
+(
+    float _x,
+    float _y
+)
+{
+    XMStoreShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMSHORT2::XMSHORT2
+(
+    const float* pArray
+)
+{
+    XMStoreShort2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUSHORTN2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUSHORTN2::XMUSHORTN2
+(
+    float _x,
+    float _y
+)
+{
+    XMStoreUShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUSHORTN2::XMUSHORTN2
+(
+    const float* pArray
+)
+{
+    XMStoreUShortN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
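+
+// Usage sketch (illustrative): these constructors simply forward to the Store
+// functions; e.g. XMCOLOR(1.0f, 0.0f, 0.0f, 1.0f) packs opaque red as
+// 0xFFFF0000 (8:8:8:8 ARGB), and XMSHORTN2(1.0f, -1.0f) stores x = 32767,
+// y = -32767.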
+
+/****************************************************************************
+ *
+ * XMUSHORT2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUSHORT2::XMUSHORT2
+(
+    float _x,
+    float _y
+)
+{
+    XMStoreUShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUSHORT2::XMUSHORT2
+(
+    const float* pArray
+)
+{
+    XMStoreUShort2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMBYTEN2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMBYTEN2::XMBYTEN2
+(
+    float _x,
+    float _y
+)
+{
+    XMStoreByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMBYTEN2::XMBYTEN2
+(
+    const float* pArray
+)
+{
+    XMStoreByteN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMBYTE2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMBYTE2::XMBYTE2
+(
+    float _x,
+    float _y
+)
+{
+    XMStoreByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMBYTE2::XMBYTE2
+(
+    const float* pArray
+)
+{
+    XMStoreByte2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUBYTEN2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUBYTEN2::XMUBYTEN2
+(
+    float _x,
+    float _y
+)
+{
+    XMStoreUByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUBYTEN2::XMUBYTEN2
+(
+    const float* pArray
+)
+{
+    XMStoreUByteN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUBYTE2 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUBYTE2::XMUBYTE2
+(
+    float _x,
+    float _y
+)
+{
+    XMStoreUByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUBYTE2::XMUBYTE2
+(
+    const float* pArray
+)
+{
+    XMStoreUByte2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMU565 operators
+ *
+ ****************************************************************************/
+
+inline PackedVector::XMU565::XMU565
+(
+    float _x,
+    float _y,
+    float _z
+)
+{
+    XMStoreU565(this, XMVectorSet( _x, _y, _z, 0.0f ));
+}
+
+_Use_decl_annotations_
+inline PackedVector::XMU565::XMU565
+(
+    const float *pArray
+)
+{
+    XMStoreU565(this, XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMFLOAT3PK operators
+ *
+ ****************************************************************************/
+
+inline PackedVector::XMFLOAT3PK::XMFLOAT3PK
+(
+    float _x,
+    float _y,
+    float _z
+)
+{
+    XMStoreFloat3PK(this, XMVectorSet( _x, _y, _z, 0.0f ));
+}
+
+_Use_decl_annotations_
+inline PackedVector::XMFLOAT3PK::XMFLOAT3PK
+(
+    const float *pArray
+)
+{
+    XMStoreFloat3PK(this, XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMFLOAT3SE operators
+ *
+ ****************************************************************************/
+
+inline PackedVector::XMFLOAT3SE::XMFLOAT3SE
+(
+    float _x,
+    float _y,
+    float _z
+)
+{
+    XMStoreFloat3SE(this, XMVectorSet( _x, _y, _z, 0.0f ));
+}
+
+_Use_decl_annotations_
+inline PackedVector::XMFLOAT3SE::XMFLOAT3SE
+(
+    const float *pArray
+)
+{
+    XMStoreFloat3SE(this, XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMHALF4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMHALF4::XMHALF4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    x = XMConvertFloatToHalf(_x);
+    y = XMConvertFloatToHalf(_y);
+    z = XMConvertFloatToHalf(_z);
+    w = XMConvertFloatToHalf(_w);
+}
+
+//------------------------------------------------------------------------------
+
+_Use_decl_annotations_
+inline PackedVector::XMHALF4::XMHALF4
+(
+    const float* pArray
+)
+{
+    XMConvertFloatToHalfStream(&x, sizeof(HALF), pArray, sizeof(float), 4);
+}
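+
+// Half-precision note (illustrative): XMConvertFloatToHalf(1.0f) yields
+// 0x3C00, and the stream conversion above writes all four halves in one pass
+// using the given output (sizeof(HALF)) and input (sizeof(float)) strides.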
+
+/****************************************************************************
+ *
+ * XMSHORTN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMSHORTN4::XMSHORTN4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreShortN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMSHORTN4::XMSHORTN4
+(
+    const float* pArray
+)
+{
+    XMStoreShortN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMSHORT4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMSHORT4::XMSHORT4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreShort4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMSHORT4::XMSHORT4
+(
+    const float* pArray
+)
+{
+    XMStoreShort4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUSHORTN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUSHORTN4::XMUSHORTN4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreUShortN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUSHORTN4::XMUSHORTN4
+(
+    const float* pArray
+)
+{
+    XMStoreUShortN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUSHORT4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUSHORT4::XMUSHORT4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreUShort4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUSHORT4::XMUSHORT4
+(
+    const float* pArray
+)
+{
+    XMStoreUShort4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMXDECN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMXDECN4::XMXDECN4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreXDecN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMXDECN4::XMXDECN4
+(
+    const float* pArray
+)
+{
+    XMStoreXDecN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMXDEC4 operators
+ *
+ ****************************************************************************/
+
+#pragma warning(push)
+#pragma warning(disable : 4996)
+// C4996: ignore deprecation warning
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMXDEC4::XMXDEC4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreXDec4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMXDEC4::XMXDEC4
+(
+    const float* pArray
+)
+{
+    XMStoreXDec4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMDECN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMDECN4::XMDECN4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreDecN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMDECN4::XMDECN4
+(
+    const float* pArray
+)
+{
+    XMStoreDecN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMDEC4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMDEC4::XMDEC4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreDec4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMDEC4::XMDEC4
+(
+    const float* pArray
+)
+{
+    XMStoreDec4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+#pragma warning(pop)
+
+/****************************************************************************
+ *
+ * XMUDECN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUDECN4::XMUDECN4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreUDecN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUDECN4::XMUDECN4
+(
+    const float* pArray
+)
+{
+    XMStoreUDecN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUDEC4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUDEC4::XMUDEC4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreUDec4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUDEC4::XMUDEC4
+(
+    const float* pArray
+)
+{
+    XMStoreUDec4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMBYTEN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMBYTEN4::XMBYTEN4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreByteN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMBYTEN4::XMBYTEN4
+(
+    const float* pArray
+)
+{
+    XMStoreByteN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMBYTE4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMBYTE4::XMBYTE4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreByte4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMBYTE4::XMBYTE4
+(
+    const float* pArray
+)
+{
+    XMStoreByte4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUBYTEN4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUBYTEN4::XMUBYTEN4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreUByteN4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUBYTEN4::XMUBYTEN4
+(
+    const float* pArray
+)
+{
+    XMStoreUByteN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUBYTE4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUBYTE4::XMUBYTE4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreUByte4(this, XMVectorSet(_x, _y, _z, _w));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUBYTE4::XMUBYTE4
+(
+    const float* pArray
+)
+{
+    XMStoreUByte4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMUNIBBLE4 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMUNIBBLE4::XMUNIBBLE4
+(
+    float _x,
+    float _y,
+    float _z,
+    float _w
+)
+{
+    XMStoreUNibble4(this, XMVectorSet( _x, _y, _z, _w ));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMUNIBBLE4::XMUNIBBLE4
+(
+    const float *pArray
+)
+{
+    XMStoreUNibble4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
+}
+
+/****************************************************************************
+ *
+ * XMU555 operators
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+inline PackedVector::XMU555::XMU555
+(
+    float _x,
+    float _y,
+    float _z,
+    bool _w
+)
+{
+    XMStoreU555(this, XMVectorSet(_x, _y, _z, ((_w) ? 1.0f : 0.0f) ));
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PackedVector::XMU555::XMU555
+(
+    const float *pArray,
+    bool _w
+)
+{
+    XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray));
+    XMStoreU555(this, XMVectorSetW(V, ((_w) ? 1.0f : 0.0f) ));
+}
+
+
diff --git a/MIT.txt b/MIT.txt
index 96e5e14..1abfa2b 100644
--- a/MIT.txt
+++ b/MIT.txt
@@ -1,21 +1,21 @@
- The MIT License (MIT)
-
-Copyright (c) 2016 Microsoft Corp
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this
-software and associated documentation files (the "Software"), to deal in the Software
-without restriction, including without limitation the rights to use, copy, modify,
-merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be included in all copies
-or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
-PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
-CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
-OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- + The MIT License (MIT) + +Copyright (c) 2016 Microsoft Corp + +Permission is hereby granted, free of charge, to any person obtaining a copy of this +software and associated documentation files (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, copy, modify, +merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be included in all copies +or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE +OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + diff --git a/ReadMe.txt b/ReadMe.txt index efd995b..49a4730 100644 --- a/ReadMe.txt +++ b/ReadMe.txt @@ -1,131 +1,131 @@ ------------ -DirectXMath ------------ - -Copyright (c) Microsoft Corporation. All rights reserved. - -June 2016 - -This package contains the DirectXMath library, an all inline SIMD C++ linear algebra library +----------- +DirectXMath +----------- + +Copyright (c) Microsoft Corporation. All rights reserved. + +June 2016 + +This package contains the DirectXMath library, an all inline SIMD C++ linear algebra library for use in games and graphics apps - - -This code is designed to build with Visual Studio 2013 or 2015. It is recommended that you -make use of VS 2013 Update 5 or VS 2015 Update 2. - -These components are designed to work without requiring any content from the DirectX SDK. For details, -see "Where is the DirectX SDK?" . - -Inc\ - DirectXMath Files (in the DirectX C++ namespace) - DirectXMath.h - Core library - DirectXPackedVector.h - Load/Store functions and types for working with various compressed GPU formats - DirectXColors.h - .NET-style Color defines in sRGB color space - DirectXCollision.h - Bounding volume collision library - -Extentions\ - Advanced instruction set variants for guarded codepaths - DirectXMathSSE3.h - SSE3 - DirectXMathBE.h - Supplemental SSE3 (SSSE3) - DirectXMathSSE4.h - SSE4.1 - DirectXMathAVX.h - Advanced Vector Extensions (AVX) - DirectXMathAVX2.h - Advanced Vector Extensions 2 (AVX2) - DirectXMathF16C.h - Half-precision conversions (F16C) - DirectXMathFMA3.h - Fused multiply-accumulate (FMA3) - DirectXMathFMA4.h - Fused multiply-accumulate (FMA4) - -SHMath\ - Spherical Harmonics math functions - DirectXSH.h - Header for SHMath functions - DirectXSH.cpp, DirectXSHD3D11.cpp - Implementation - -XDSP\ - XDSP.h - Digital Signal Processing helper functions - -All content and source code for this package are subject to the terms of the MIT License. -. - -Documentation is available at . - -For the latest version of DirectXMath, bug reports, etc. please visit the project site. - - -This project has adopted the Microsoft Open Source Code of Conduct. For more information see the -Code of Conduct FAQ or contact opencode@microsoft.com with any additional questions or comments. 
- -https://opensource.microsoft.com/codeofconduct/ - - ---------------- -RELEASE HISTORY ---------------- - -June 2016 (3.09) - Includes support for additional optimizations when built with /arch:AVX or /arch:AVX2 - Added use of constexpr for type constructors, XMConvertToRadians, and XMConvertToDegrees - Marked __vector4i, XMXDEC4, XMDECN4, XMDEC4, and associated Load & Store functions as deprecated. - These are vestiges of Xbox 360 support and will be removed in a future release - Renamed parameter in XMMatrixPerspectiveFov* to reduce user confusion when relying on IntelliSense - XMU565, XMUNIBBLE4 constructors take uint8_t instead of int8_t - -May 2016 - DirectXMath 3.08 released under the MIT license - -November 2015 (3.08) - Added use of _mm_sfence for Stream methods - Fixed bug with non-uniform scaling transforms for BoundingOrientedBox - Added asserts for Near/FarZ in XMMatrix* methods - Added use of =default for PODs with VS 2013/2015 - Additional SSE and ARM-NEON optimizations for PackedVector functions - -April 2015 (3.07) - Fix customer reported bugs in BoundingBox methods - Fix customer reported bug in XMStoreFloat3SE - Fix customer reported bug in XMVectorATan2, XMVectorATan2Est - Fix customer reported bug in XMVectorRound - -October 2013 (3.06) - Fixed load/store of XMFLOAT3SE to properly match the DXGI_FORMAT_R9G9B9E5_SHAREDEXP - Added XMLoadUDecN4_XR and XMStoreUDecN4_XR to match DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM - Added XMColorRGBToSRGB and XMColorSRGBToRGB to convert linear RGB <-> sRGB - -July 2013 (3.05) - Use x86/x64 __vectorcall calling-convention when available (XM_CALLCONV, HXMVECTOR, FXMMATRIX introduced) - Fixed bug with XMVectorFloor and XMVectorCeiling when given whole odd numbers (i.e. 105.0) - Improved XMVectorRound algorithm - ARM-NEON optimizations for XMVectorExp2, XMVectorLog2, XMVectorExpE, and XMVectorLogE - ARM-NEON code paths use multiply-by-scalar intrinsics when supported - Additional optimizations for ARM-NEON Stream functions - Fixed potential warning C4723 using operator/ or operator/= - -March 2013 (3.04) - XMVectorExp2, XMVectorLog2, XMVectorExpE, and XMVectorLogE functions added to provide base-e support in addition to the existing base-2 support - XMVectorExp and XMVectorLog are now aliases for XMVectorExp2 and XMVectorLog2 - Additional optimizations for Stream functions - XMVector3Cross now ensures w component is zero on ARM - XMConvertHalfToFloat and XMConvertFloatToHalf now use IEEE 754 standard float16 behavior for INF/QNAN - Updated matrix version Transform for BoundingOrientedBox and BoundingFrustum to handle scaling - -March 2012 (3.03) - Breaking change: Removed union members from XMMATRIX type to make it a fully 'opaque' type - Marked single-parameter C++ constructors for XMFLOAT2, XMFLOAT2A, XMFLOAT3, XMFLOAT3A, XMFLOAT4, and XMFLOAT4A explicit - -February 2012 (3.02) - ARM-NEON intrinsics (selected by default for the ARM platform) - reworked XMVectorPermute, change of XM_PERMUTE_ defines, removal of XMVectorPermuteControl - Addition of XM_SWIZZLE_ defines - Optimizations for transcendental functions - Template forms for permute, swizzle, shift-left, rotate-left, rotation-right, and insert - Removal of deprecated types and functions - (XM_CACHE_LINE_SIZE define, XMVectorExpEst, XMVectorLogEst, XMVectorPowEst, XMVectorSinHEs, XMVectorCosHEst, XMVectorTanHEst, - XMVector2InBoundsR, XMVector3InBoundsR, XMVector4InBoundsR) - Removed XM_STRICT_VECTOR4; XMVECTOR in NO-INTRINSICS always defined without .x, .y, .z, .w, .v, or .u - 
Additional bounding types
-   SAL fixes and improvements
-
-September 2011 (3.00)
-   Renamed and reorganized the headers
-   Introduced C++ namespaces
-   Removed the Xbox 360-specific GPU types
-     (HENDN3, XMHEND3, XMUHENDN3, XMUHEND3, XMDHENN3, XMDHEN3,
-      XMUDHENN3, XMUDHEN3, XMXICON4, XMXICO4, XMICON4, XMICO4, XMUICON4, XMUICO4 )
+-----------
+DirectXMath
+-----------
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+
+June 2016
+
+This package contains the DirectXMath library, an all inline SIMD C++ linear algebra library
+for use in games and graphics apps.
+
+
+This code is designed to build with Visual Studio 2013 or 2015. It is recommended that you
+make use of VS 2013 Update 5 or VS 2015 Update 2.
+
+These components are designed to work without requiring any content from the DirectX SDK. For details,
+see "Where is the DirectX SDK?" <https://msdn.microsoft.com/en-us/library/windows/desktop/ee663275.aspx>.
+
+Inc\
+    DirectXMath Files (in the DirectX C++ namespace)
+        DirectXMath.h - Core library
+        DirectXPackedVector.h - Load/Store functions and types for working with various compressed GPU formats
+        DirectXColors.h - .NET-style Color defines in sRGB color space
+        DirectXCollision.h - Bounding volume collision library
+
+Extensions\
+    Advanced instruction set variants for guarded codepaths
+        DirectXMathSSE3.h - SSE3
+        DirectXMathBE.h - Supplemental SSE3 (SSSE3)
+        DirectXMathSSE4.h - SSE4.1
+        DirectXMathAVX.h - Advanced Vector Extensions (AVX)
+        DirectXMathAVX2.h - Advanced Vector Extensions 2 (AVX2)
+        DirectXMathF16C.h - Half-precision conversions (F16C)
+        DirectXMathFMA3.h - Fused multiply-accumulate (FMA3)
+        DirectXMathFMA4.h - Fused multiply-accumulate (FMA4)
+
+SHMath\
+    Spherical Harmonics math functions
+        DirectXSH.h - Header for SHMath functions
+        DirectXSH.cpp, DirectXSHD3D11.cpp - Implementation
+
+XDSP\
+    XDSP.h - Digital Signal Processing helper functions
+
+All content and source code for this package are subject to the terms of the MIT License.
+<http://opensource.org/licenses/MIT>.
+
+Documentation is available at <https://msdn.microsoft.com/en-us/library/windows/desktop/hh437833.aspx>.
+
+For the latest version of DirectXMath, bug reports, etc. please visit the project site.
+
+
+This project has adopted the Microsoft Open Source Code of Conduct. For more information see the
+Code of Conduct FAQ or contact opencode@microsoft.com with any additional questions or comments.
+
+https://opensource.microsoft.com/codeofconduct/
+
+
+---------------
+RELEASE HISTORY
+---------------
+
+June 2016 (3.09)
+    Includes support for additional optimizations when built with /arch:AVX or /arch:AVX2
+    Added use of constexpr for type constructors, XMConvertToRadians, and XMConvertToDegrees
+    Marked __vector4i, XMXDEC4, XMDECN4, XMDEC4, and associated Load & Store functions as deprecated.
+      These are vestiges of Xbox 360 support and will be removed in a future release
+    Renamed parameter in XMMatrixPerspectiveFov* to reduce user confusion when relying on IntelliSense
+    XMU565, XMUNIBBLE4 constructors take uint8_t instead of int8_t
+
+May 2016
+    DirectXMath 3.08 released under the MIT license
+
+November 2015 (3.08)
+    Added use of _mm_sfence for Stream methods
+    Fixed bug with non-uniform scaling transforms for BoundingOrientedBox
+    Added asserts for Near/FarZ in XMMatrix* methods
+    Added use of =default for PODs with VS 2013/2015
+    Additional SSE and ARM-NEON optimizations for PackedVector functions
+
+April 2015 (3.07)
+    Fix customer reported bugs in BoundingBox methods
+    Fix customer reported bug in XMStoreFloat3SE
+    Fix customer reported bug in XMVectorATan2, XMVectorATan2Est
+    Fix customer reported bug in XMVectorRound
+
+October 2013 (3.06)
+    Fixed load/store of XMFLOAT3SE to properly match the DXGI_FORMAT_R9G9B9E5_SHAREDEXP
+    Added XMLoadUDecN4_XR and XMStoreUDecN4_XR to match DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM
+    Added XMColorRGBToSRGB and XMColorSRGBToRGB to convert linear RGB <-> sRGB
+
+July 2013 (3.05)
+    Use x86/x64 __vectorcall calling-convention when available (XM_CALLCONV, HXMVECTOR, FXMMATRIX introduced)
+    Fixed bug with XMVectorFloor and XMVectorCeiling when given whole odd numbers (e.g. 105.0)
+    Improved XMVectorRound algorithm
+    ARM-NEON optimizations for XMVectorExp2, XMVectorLog2, XMVectorExpE, and XMVectorLogE
+    ARM-NEON code paths use multiply-by-scalar intrinsics when supported
+    Additional optimizations for ARM-NEON Stream functions
+    Fixed potential warning C4723 using operator/ or operator/=
+
+March 2013 (3.04)
+    XMVectorExp2, XMVectorLog2, XMVectorExpE, and XMVectorLogE functions added to provide base-e support in addition to the existing base-2 support
+    XMVectorExp and XMVectorLog are now aliases for XMVectorExp2 and XMVectorLog2
+    Additional optimizations for Stream functions
+    XMVector3Cross now ensures w component is zero on ARM
+    XMConvertHalfToFloat and XMConvertFloatToHalf now use IEEE 754 standard float16 behavior for INF/QNAN
+    Updated matrix version Transform for BoundingOrientedBox and BoundingFrustum to handle scaling
+
+March 2012 (3.03)
+    Breaking change: Removed union members from XMMATRIX type to make it a fully 'opaque' type
+    Marked single-parameter C++ constructors for XMFLOAT2, XMFLOAT2A, XMFLOAT3, XMFLOAT3A, XMFLOAT4, and XMFLOAT4A explicit
+
+February 2012 (3.02)
+    ARM-NEON intrinsics (selected by default for the ARM platform)
+    Reworked XMVectorPermute, change of XM_PERMUTE_ defines, removal of XMVectorPermuteControl
+    Addition of XM_SWIZZLE_ defines
+    Optimizations for transcendental functions
+    Template forms for permute, swizzle, shift-left, rotate-left, rotate-right, and insert
+    Removal of deprecated types and functions
+      (XM_CACHE_LINE_SIZE define, XMVectorExpEst, XMVectorLogEst, XMVectorPowEst, XMVectorSinHEst, XMVectorCosHEst, XMVectorTanHEst,
+       XMVector2InBoundsR, XMVector3InBoundsR, XMVector4InBoundsR)
+    Removed XM_STRICT_VECTOR4; XMVECTOR in NO-INTRINSICS always defined without .x, .y, .z, .w, .v, or .u
+    Additional bounding types
+    SAL fixes and improvements
+
+September 2011 (3.00)
+    Renamed and reorganized the headers
+    Introduced C++ namespaces
+    Removed the Xbox 360-specific GPU types
+      (HENDN3, XMHEND3, XMUHENDN3, XMUHEND3, XMDHENN3, XMDHEN3,
+       XMUDHENN3, XMUDHEN3, XMXICON4, XMXICO4, XMICON4, XMICO4, XMUICON4, XMUICO4 )
diff --git a/SHMath/DirectXSH.cpp
index d66a35a..c4191b7
100644 --- a/SHMath/DirectXSH.cpp +++ b/SHMath/DirectXSH.cpp @@ -1,4868 +1,4868 @@ -//------------------------------------------------------------------------------------- -// DirectXSH.cpp -- C++ Spherical Harmonics Math Library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// http://go.microsoft.com/fwlink/p/?LinkId=262885 -//------------------------------------------------------------------------------------- - -#include "DirectXSH.h" -#include - -using namespace DirectX; - -#pragma warning( disable : 4619 4456 ) - -namespace -{ - #pragma prefast(disable:246, "generated code by maple (nested const variable definitions)") - - static const float fExtraNormFac[XM_SH_MAXORDER] = { 2.0f*sqrtf(XM_PI), 2.0f/3.0f*sqrtf(3.0f*XM_PI), 2.0f/5.0f*sqrtf(5.0f*XM_PI), 2.0f/7.0f*sqrtf(7.0f*XM_PI), 2.0f/3.0f*sqrtf(XM_PI), 2.0f/11.0f*sqrtf(11.0f*XM_PI) }; - - // computes the integral of a constant function over a solid angular - // extent. No error checking - only used internaly. This function - // only returns the Yl0 coefficients, since the rest are zero for - // circularly symmetric functions. - static const float ComputeCapInt_t1 = sqrtf(0.3141593E1f); - static const float ComputeCapInt_t5 = sqrtf(3.0f); - static const float ComputeCapInt_t11 = sqrtf(5.0f); - static const float ComputeCapInt_t18 = sqrtf(7.0f); - static const float ComputeCapInt_t32 = sqrtf(11.0f); - - static inline void ComputeCapInt(const size_t order, float angle, float *pR) - { - const float t2 = cosf(angle); - const float t3 = ComputeCapInt_t1*t2; - const float t7 = sinf(angle); - const float t8 = t7*t7; - - - pR[0] = -t3+ComputeCapInt_t1; - pR[1] = ComputeCapInt_t5*ComputeCapInt_t1*t8/2.0f; - - if (order > 2) - { - const float t13 = t2*t2; - - pR[2] = -ComputeCapInt_t11*ComputeCapInt_t1*t2*(t13-1.0f)/2.0f; - if (order > 3) - { - const float t19 = ComputeCapInt_t18*ComputeCapInt_t1; - const float t20 = t13*t13; - - pR[3] = -5.0f/8.0f*t19*t20+3.0f/4.0f*t19*t13-t19/8.0f; - if (order > 4) - { - - - pR[4] = -3.0f/8.0f*t3*(7.0f*t20-10.0f*t13+3.0f); - if (order > 5) - { - const float t33 = ComputeCapInt_t32*ComputeCapInt_t1; - pR[5] = -21.0f/16.0f*t33*t20*t13+35.0f/16.0f*t33*t20-15.0f/16.0f*t33*t13+t33/16.0f; - } - } - } - } - } - - // input pF only consists of Yl0 values, normalizes coefficients for directional - // lights. - static inline float CosWtInt(const size_t order) - { - const float fCW0 = 0.25f; - const float fCW1 = 0.5f; - const float fCW2 = 5.0f/16.0f; - //const float fCW3 = 0.0f; - const float fCW4 = -3.0f/32.0f; - //const float fCW5 = 0.0f; - - // order has to be at least linear... - - float fRet = fCW0 + fCW1; - - if (order > 2) fRet += fCW2; - if (order > 4) fRet += fCW4; - - // odd degrees >= 3 evaluate to zero integrated against cosine... - - return fRet; - } - - static const float SHEvalHemisphereLight_fSqrtPi = sqrtf(XM_PI); - static const float SHEvalHemisphereLight_fSqrtPi3 = sqrtf(XM_PI/3.0f); - - typedef float REAL; - #define CONSTANT(x) (x ## f) - - // routine generated programmatically for evaluating SH basis for degree 1 - // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) - // output is vector b with SH basis evaluated at (x,y,z). 
- // - inline static void sh_eval_basis_1(REAL x,REAL y,REAL z,REAL b[4]) - { - /* m=0 */ - - // l=0 - const REAL p_0_0 = CONSTANT(0.282094791773878140); - b[ 0] = p_0_0; // l=0,m=0 - // l=1 - const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; - b[ 2] = p_1_0; // l=1,m=0 - - - /* m=1 */ - - const REAL s1 = y; - const REAL c1 = x; - - // l=1 - const REAL p_1_1 = CONSTANT(-0.488602511902919920); - b[ 1] = p_1_1*s1; // l=1,m=-1 - b[ 3] = p_1_1*c1; // l=1,m=+1 - } - - // routine generated programmatically for evaluating SH basis for degree 2 - // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) - // output is vector b with SH basis evaluated at (x,y,z). - // - inline static void sh_eval_basis_2(REAL x,REAL y,REAL z,REAL b[9]) - { - const REAL z2 = z*z; - - - /* m=0 */ - - // l=0 - const REAL p_0_0 = CONSTANT(0.282094791773878140); - b[ 0] = p_0_0; // l=0,m=0 - // l=1 - const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; - b[ 2] = p_1_0; // l=1,m=0 - // l=2 - const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); - b[ 6] = p_2_0; // l=2,m=0 - - - /* m=1 */ - - const REAL s1 = y; - const REAL c1 = x; - - // l=1 - const REAL p_1_1 = CONSTANT(-0.488602511902919920); - b[ 1] = p_1_1*s1; // l=1,m=-1 - b[ 3] = p_1_1*c1; // l=1,m=+1 - // l=2 - const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; - b[ 5] = p_2_1*s1; // l=2,m=-1 - b[ 7] = p_2_1*c1; // l=2,m=+1 - - - /* m=2 */ - - const REAL s2 = x*s1 + y*c1; - const REAL c2 = x*c1 - y*s1; - - // l=2 - const REAL p_2_2 = CONSTANT(0.546274215296039590); - b[ 4] = p_2_2*s2; // l=2,m=-2 - b[ 8] = p_2_2*c2; // l=2,m=+2 - } - - // routine generated programmatically for evaluating SH basis for degree 3 - // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) - // output is vector b with SH basis evaluated at (x,y,z). 
- // - static void sh_eval_basis_3(REAL x,REAL y,REAL z,REAL b[16]) - { - const REAL z2 = z*z; - - - /* m=0 */ - - // l=0 - const REAL p_0_0 = CONSTANT(0.282094791773878140); - b[ 0] = p_0_0; // l=0,m=0 - // l=1 - const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; - b[ 2] = p_1_0; // l=1,m=0 - // l=2 - const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); - b[ 6] = p_2_0; // l=2,m=0 - // l=3 - const REAL p_3_0 = z*(CONSTANT(1.865881662950577000)*z2 + CONSTANT(-1.119528997770346200)); - b[ 12] = p_3_0; // l=3,m=0 - - - /* m=1 */ - - const REAL s1 = y; - const REAL c1 = x; - - // l=1 - const REAL p_1_1 = CONSTANT(-0.488602511902919920); - b[ 1] = p_1_1*s1; // l=1,m=-1 - b[ 3] = p_1_1*c1; // l=1,m=+1 - // l=2 - const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; - b[ 5] = p_2_1*s1; // l=2,m=-1 - b[ 7] = p_2_1*c1; // l=2,m=+1 - // l=3 - const REAL p_3_1 = CONSTANT(-2.285228997322328800)*z2 + CONSTANT(0.457045799464465770); - b[ 11] = p_3_1*s1; // l=3,m=-1 - b[ 13] = p_3_1*c1; // l=3,m=+1 - - - /* m=2 */ - - const REAL s2 = x*s1 + y*c1; - const REAL c2 = x*c1 - y*s1; - - // l=2 - const REAL p_2_2 = CONSTANT(0.546274215296039590); - b[ 4] = p_2_2*s2; // l=2,m=-2 - b[ 8] = p_2_2*c2; // l=2,m=+2 - // l=3 - const REAL p_3_2 = CONSTANT(1.445305721320277100)*z; - b[ 10] = p_3_2*s2; // l=3,m=-2 - b[ 14] = p_3_2*c2; // l=3,m=+2 - - - /* m=3 */ - - const REAL s3 = x*s2 + y*c2; - const REAL c3 = x*c2 - y*s2; - - // l=3 - const REAL p_3_3 = CONSTANT(-0.590043589926643520); - b[ 9] = p_3_3*s3; // l=3,m=-3 - b[ 15] = p_3_3*c3; // l=3,m=+3 - } - - // routine generated programmatically for evaluating SH basis for degree 4 - // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) - // output is vector b with SH basis evaluated at (x,y,z). 
- // - static void sh_eval_basis_4(REAL x,REAL y,REAL z,REAL b[25]) - { - const REAL z2 = z*z; - - - /* m=0 */ - - // l=0 - const REAL p_0_0 = CONSTANT(0.282094791773878140); - b[ 0] = p_0_0; // l=0,m=0 - // l=1 - const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; - b[ 2] = p_1_0; // l=1,m=0 - // l=2 - const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); - b[ 6] = p_2_0; // l=2,m=0 - // l=3 - const REAL p_3_0 = z*(CONSTANT(1.865881662950577000)*z2 + CONSTANT(-1.119528997770346200)); - b[ 12] = p_3_0; // l=3,m=0 - // l=4 - const REAL p_4_0 = CONSTANT(1.984313483298443000)*z*p_3_0 + CONSTANT(-1.006230589874905300)*p_2_0; - b[ 20] = p_4_0; // l=4,m=0 - - - /* m=1 */ - - const REAL s1 = y; - const REAL c1 = x; - - // l=1 - const REAL p_1_1 = CONSTANT(-0.488602511902919920); - b[ 1] = p_1_1*s1; // l=1,m=-1 - b[ 3] = p_1_1*c1; // l=1,m=+1 - // l=2 - const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; - b[ 5] = p_2_1*s1; // l=2,m=-1 - b[ 7] = p_2_1*c1; // l=2,m=+1 - // l=3 - const REAL p_3_1 = CONSTANT(-2.285228997322328800)*z2 + CONSTANT(0.457045799464465770); - b[ 11] = p_3_1*s1; // l=3,m=-1 - b[ 13] = p_3_1*c1; // l=3,m=+1 - // l=4 - const REAL p_4_1 = z*(CONSTANT(-4.683325804901024000)*z2 + CONSTANT(2.007139630671867200)); - b[ 19] = p_4_1*s1; // l=4,m=-1 - b[ 21] = p_4_1*c1; // l=4,m=+1 - - - /* m=2 */ - - const REAL s2 = x*s1 + y*c1; - const REAL c2 = x*c1 - y*s1; - - // l=2 - const REAL p_2_2 = CONSTANT(0.546274215296039590); - b[ 4] = p_2_2*s2; // l=2,m=-2 - b[ 8] = p_2_2*c2; // l=2,m=+2 - // l=3 - const REAL p_3_2 = CONSTANT(1.445305721320277100)*z; - b[ 10] = p_3_2*s2; // l=3,m=-2 - b[ 14] = p_3_2*c2; // l=3,m=+2 - // l=4 - const REAL p_4_2 = CONSTANT(3.311611435151459800)*z2 + CONSTANT(-0.473087347878779980); - b[ 18] = p_4_2*s2; // l=4,m=-2 - b[ 22] = p_4_2*c2; // l=4,m=+2 - - - /* m=3 */ - - const REAL s3 = x*s2 + y*c2; - const REAL c3 = x*c2 - y*s2; - - // l=3 - const REAL p_3_3 = CONSTANT(-0.590043589926643520); - b[ 9] = p_3_3*s3; // l=3,m=-3 - b[ 15] = p_3_3*c3; // l=3,m=+3 - // l=4 - const REAL p_4_3 = CONSTANT(-1.770130769779930200)*z; - b[ 17] = p_4_3*s3; // l=4,m=-3 - b[ 23] = p_4_3*c3; // l=4,m=+3 - - - /* m=4 */ - - const REAL s4 = x*s3 + y*c3; - const REAL c4 = x*c3 - y*s3; - - // l=4 - const REAL p_4_4 = CONSTANT(0.625835735449176030); - b[ 16] = p_4_4*s4; // l=4,m=-4 - b[ 24] = p_4_4*c4; // l=4,m=+4 - } - - // routine generated programmatically for evaluating SH basis for degree 5 - // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) - // output is vector b with SH basis evaluated at (x,y,z). 
- // - static void sh_eval_basis_5(REAL x,REAL y,REAL z,REAL b[36]) - { - const REAL z2 = z*z; - - - /* m=0 */ - - // l=0 - const REAL p_0_0 = CONSTANT(0.282094791773878140); - b[ 0] = p_0_0; // l=0,m=0 - // l=1 - const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; - b[ 2] = p_1_0; // l=1,m=0 - // l=2 - const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); - b[ 6] = p_2_0; // l=2,m=0 - // l=3 - const REAL p_3_0 = z*(CONSTANT(1.865881662950577000)*z2 + CONSTANT(-1.119528997770346200)); - b[ 12] = p_3_0; // l=3,m=0 - // l=4 - const REAL p_4_0 = CONSTANT(1.984313483298443000)*z*p_3_0 + CONSTANT(-1.006230589874905300)*p_2_0; - b[ 20] = p_4_0; // l=4,m=0 - // l=5 - const REAL p_5_0 = CONSTANT(1.989974874213239700)*z*p_4_0 + CONSTANT(-1.002853072844814000)*p_3_0; - b[ 30] = p_5_0; // l=5,m=0 - - - /* m=1 */ - - const REAL s1 = y; - const REAL c1 = x; - - // l=1 - const REAL p_1_1 = CONSTANT(-0.488602511902919920); - b[ 1] = p_1_1*s1; // l=1,m=-1 - b[ 3] = p_1_1*c1; // l=1,m=+1 - // l=2 - const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; - b[ 5] = p_2_1*s1; // l=2,m=-1 - b[ 7] = p_2_1*c1; // l=2,m=+1 - // l=3 - const REAL p_3_1 = CONSTANT(-2.285228997322328800)*z2 + CONSTANT(0.457045799464465770); - b[ 11] = p_3_1*s1; // l=3,m=-1 - b[ 13] = p_3_1*c1; // l=3,m=+1 - // l=4 - const REAL p_4_1 = z*(CONSTANT(-4.683325804901024000)*z2 + CONSTANT(2.007139630671867200)); - b[ 19] = p_4_1*s1; // l=4,m=-1 - b[ 21] = p_4_1*c1; // l=4,m=+1 - // l=5 - const REAL p_5_1 = CONSTANT(2.031009601158990200)*z*p_4_1 + CONSTANT(-0.991031208965114650)*p_3_1; - b[ 29] = p_5_1*s1; // l=5,m=-1 - b[ 31] = p_5_1*c1; // l=5,m=+1 - - - /* m=2 */ - - const REAL s2 = x*s1 + y*c1; - const REAL c2 = x*c1 - y*s1; - - // l=2 - const REAL p_2_2 = CONSTANT(0.546274215296039590); - b[ 4] = p_2_2*s2; // l=2,m=-2 - b[ 8] = p_2_2*c2; // l=2,m=+2 - // l=3 - const REAL p_3_2 = CONSTANT(1.445305721320277100)*z; - b[ 10] = p_3_2*s2; // l=3,m=-2 - b[ 14] = p_3_2*c2; // l=3,m=+2 - // l=4 - const REAL p_4_2 = CONSTANT(3.311611435151459800)*z2 + CONSTANT(-0.473087347878779980); - b[ 18] = p_4_2*s2; // l=4,m=-2 - b[ 22] = p_4_2*c2; // l=4,m=+2 - // l=5 - const REAL p_5_2 = z*(CONSTANT(7.190305177459987500)*z2 + CONSTANT(-2.396768392486662100)); - b[ 28] = p_5_2*s2; // l=5,m=-2 - b[ 32] = p_5_2*c2; // l=5,m=+2 - - - /* m=3 */ - - const REAL s3 = x*s2 + y*c2; - const REAL c3 = x*c2 - y*s2; - - // l=3 - const REAL p_3_3 = CONSTANT(-0.590043589926643520); - b[ 9] = p_3_3*s3; // l=3,m=-3 - b[ 15] = p_3_3*c3; // l=3,m=+3 - // l=4 - const REAL p_4_3 = CONSTANT(-1.770130769779930200)*z; - b[ 17] = p_4_3*s3; // l=4,m=-3 - b[ 23] = p_4_3*c3; // l=4,m=+3 - // l=5 - const REAL p_5_3 = CONSTANT(-4.403144694917253700)*z2 + CONSTANT(0.489238299435250430); - b[ 27] = p_5_3*s3; // l=5,m=-3 - b[ 33] = p_5_3*c3; // l=5,m=+3 - - - /* m=4 */ - - const REAL s4 = x*s3 + y*c3; - const REAL c4 = x*c3 - y*s3; - - // l=4 - const REAL p_4_4 = CONSTANT(0.625835735449176030); - b[ 16] = p_4_4*s4; // l=4,m=-4 - b[ 24] = p_4_4*c4; // l=4,m=+4 - // l=5 - const REAL p_5_4 = CONSTANT(2.075662314881041100)*z; - b[ 26] = p_5_4*s4; // l=5,m=-4 - b[ 34] = p_5_4*c4; // l=5,m=+4 - - - /* m=5 */ - - const REAL s5 = x*s4 + y*c4; - const REAL c5 = x*c4 - y*s4; - - // l=5 - const REAL p_5_5 = CONSTANT(-0.656382056840170150); - b[ 25] = p_5_5*s5; // l=5,m=-5 - b[ 35] = p_5_5*c5; // l=5,m=+5 - } - - static const REAL M_PIjs = (REAL) (4.0*atan(1.0)); - static const REAL maxang = (REAL) (M_PIjs/2); - static const int NSH0 = 1; - static const int NSH1 = 4; 
- static const int NSH2 = 9; - static const int NSH3 = 16; - static const int NSH4 = 25; - static const int NSH5 = 36; - static const int NSH6 = 49; - static const int NSH7 = 64; - static const int NSH8 = 81; - static const int NSH9 = 100; - static const int NL0 = 1; - static const int NL1 = 3; - static const int NL2 = 5; - static const int NL3 = 7; - static const int NL4 = 9; - static const int NL5 = 11; - static const int NL6 = 13; - static const int NL7 = 15; - static const int NL8 = 17; - static const int NL9 = 19; - - static inline void rot(REAL ct,REAL st,REAL x,REAL y,REAL &xout,REAL &yout) - { - xout = x*ct - y*st; - yout = y*ct + x*st; - } - - static inline void rot_inv(REAL ct,REAL st,REAL x,REAL y,REAL &xout,REAL &yout) - { - xout = x*ct + y*st; - yout = y*ct - x*st; - } - - static inline void rot_1(REAL ct,REAL st,REAL ctm[1],REAL stm[1]) - { - ctm[0] = ct; - stm[0] = st; - } - - static inline void rot_2(REAL ct,REAL st,REAL ctm[2],REAL stm[2]) - { - REAL ct2 = CONSTANT(2.0)*ct; - ctm[0] = ct; - stm[0] = st; - ctm[1] = ct2*ct-CONSTANT(1.0); - stm[1] = ct2*st; - } - - static inline void rot_3(REAL ct,REAL st,REAL ctm[3],REAL stm[3]) - { - REAL ct2 = CONSTANT(2.0)*ct; - ctm[0] = ct; - stm[0] = st; - ctm[1] = ct2*ct-CONSTANT(1.0); - stm[1] = ct2*st; - ctm[2] = ct2*ctm[1] - ct; - stm[2] = ct2*stm[1] - st; - } - - static inline void rot_4(REAL ct,REAL st,REAL ctm[4],REAL stm[4]) - { - REAL ct2 = CONSTANT(2.0)*ct; - ctm[0] = ct; - stm[0] = st; - ctm[1] = ct2*ct-CONSTANT(1.0); - stm[1] = ct2*st; - ctm[2] = ct2*ctm[1] - ct; - stm[2] = ct2*stm[1] - st; - ctm[3] = ct2*ctm[2] - ctm[1]; - stm[3] = ct2*stm[2] - stm[1]; - } - - static inline void rot_5(REAL ct,REAL st,REAL ctm[5],REAL stm[5]) - { - REAL ct2 = CONSTANT(2.0)*ct; - ctm[0] = ct; - stm[0] = st; - ctm[1] = ct2*ct-CONSTANT(1.0); - stm[1] = ct2*st; - ctm[2] = ct2*ctm[1] - ct; - stm[2] = ct2*stm[1] - st; - ctm[3] = ct2*ctm[2] - ctm[1]; - stm[3] = ct2*stm[2] - stm[1]; - ctm[4] = ct2*ctm[3] - ctm[2]; - stm[4] = ct2*stm[3] - stm[2]; - } - - static inline void sh_rotz_1(REAL ctm[1],REAL stm[1],REAL y[NL1],REAL yr[NL1]) - { - yr[1] = y[1]; - rot_inv(ctm[0],stm[0],y[0],y[2],yr[0],yr[2]); - } - - static inline void sh_rotz_2(REAL ctm[2],REAL stm[2],REAL y[NL2],REAL yr[NL2]) - { - yr[2] = y[2]; - rot_inv(ctm[0],stm[0],y[1],y[3],yr[1],yr[3]); - rot_inv(ctm[1],stm[1],y[0],y[4],yr[0],yr[4]); - } - - static inline void sh_rotz_3(REAL ctm[3],REAL stm[3],REAL y[NL3],REAL yr[NL3]) - { - yr[3] = y[3]; - rot_inv(ctm[0],stm[0],y[2],y[4],yr[2],yr[4]); - rot_inv(ctm[1],stm[1],y[1],y[5],yr[1],yr[5]); - rot_inv(ctm[2],stm[2],y[0],y[6],yr[0],yr[6]); - } - - static inline void sh_rotz_4(REAL ctm[4],REAL stm[4],REAL y[NL4],REAL yr[NL4]) - { - yr[4] = y[4]; - rot_inv(ctm[0],stm[0],y[3],y[5],yr[3],yr[5]); - rot_inv(ctm[1],stm[1],y[2],y[6],yr[2],yr[6]); - rot_inv(ctm[2],stm[2],y[1],y[7],yr[1],yr[7]); - rot_inv(ctm[3],stm[3],y[0],y[8],yr[0],yr[8]); - } - - static inline void sh_rotz_5(REAL ctm[5],REAL stm[5],REAL y[NL5],REAL yr[NL5]) - { - yr[5] = y[5]; - rot_inv(ctm[0],stm[0],y[4],y[6],yr[4],yr[6]); - rot_inv(ctm[1],stm[1],y[3],y[7],yr[3],yr[7]); - rot_inv(ctm[2],stm[2],y[2],y[8],yr[2],yr[8]); - rot_inv(ctm[3],stm[3],y[1],y[9],yr[1],yr[9]); - rot_inv(ctm[4],stm[4],y[0],y[10],yr[0],yr[10]); - } - - // rotation code generated programmatically by rotatex (2000x4000 samples, eps=1e-008) - - static REAL fx_1_001 = (REAL) ( sqrt(1.0)/1.0); // 1 - static REAL fx_1_002 = (REAL) (-sqrt(1.0)/1.0); // -1.00000030843 - - static inline void sh_rotx90_1(REAL y[],REAL yr[]) - 
{ - yr[ 0] = fx_1_001*y[ 1]; - yr[ 1] = fx_1_002*y[ 0]; - yr[ 2] = fx_1_001*y[ 2]; - }; - - static inline void sh_rotx90_inv_1(REAL y[],REAL yr[]) - { - yr[ 0] = fx_1_002*y[ 1]; - yr[ 1] = fx_1_001*y[ 0]; - yr[ 2] = fx_1_001*y[ 2]; - } - - static REAL fx_2_001 = (REAL) ( sqrt(4.0)/2.0); // 1 - static REAL fx_2_002 = (REAL) (-sqrt(4.0)/2.0); // -1 - static REAL fx_2_003 = (REAL) (-sqrt(1.0)/2.0); // -0.500000257021 - static REAL fx_2_004 = (REAL) (-sqrt(3.0)/2.0); // -0.866025848959 - static REAL fx_2_005 = (REAL) ( sqrt(1.0)/2.0); // 0.5 - - static inline void sh_rotx90_2(REAL y[],REAL yr[]) - { - yr[ 0] = fx_2_001*y[ 3]; - yr[ 1] = fx_2_002*y[ 1]; - yr[ 2] = fx_2_003*y[ 2]+fx_2_004*y[ 4]; - yr[ 3] = fx_2_002*y[ 0]; - yr[ 4] = fx_2_004*y[ 2]+fx_2_005*y[ 4]; - }; - - static inline void sh_rotx90_inv_2(REAL y[],REAL yr[]) - { - yr[ 0] = fx_2_002*y[ 3]; - yr[ 1] = fx_2_002*y[ 1]; - yr[ 2] = fx_2_003*y[ 2]+fx_2_004*y[ 4]; - yr[ 3] = fx_2_001*y[ 0]; - yr[ 4] = fx_2_004*y[ 2]+fx_2_005*y[ 4]; - } - - static REAL fx_3_001 = (REAL) (-sqrt(10.0)/4.0); // -0.790569415042 - static REAL fx_3_002 = (REAL) ( sqrt(6.0)/4.0); // 0.612372435696 - static REAL fx_3_003 = (REAL) (-sqrt(16.0)/4.0); // -1 - static REAL fx_3_004 = (REAL) (-sqrt(6.0)/4.0); // -0.612372435695 - static REAL fx_3_005 = (REAL) (-sqrt(1.0)/4.0); // -0.25 - static REAL fx_3_006 = (REAL) (-sqrt(15.0)/4.0); // -0.968245836551 - static REAL fx_3_007 = (REAL) ( sqrt(1.0)/4.0); // 0.25 - static REAL fx_3_008 = (REAL) ( sqrt(10.0)/4.0); // 0.790569983984 - - static inline void sh_rotx90_3(REAL y[],REAL yr[]) - { - yr[ 0] = fx_3_001*y[ 3]+fx_3_002*y[ 5]; - yr[ 1] = fx_3_003*y[ 1]; - yr[ 2] = fx_3_004*y[ 3]+fx_3_001*y[ 5]; - yr[ 3] = fx_3_008*y[ 0]+fx_3_002*y[ 2]; - yr[ 4] = fx_3_005*y[ 4]+fx_3_006*y[ 6]; - yr[ 5] = fx_3_004*y[ 0]-fx_3_001*y[ 2]; - yr[ 6] = fx_3_006*y[ 4]+fx_3_007*y[ 6]; - }; - - static inline void sh_rotx90_inv_3(REAL y[],REAL yr[]) - { - yr[ 0] = fx_3_008*y[ 3]+fx_3_004*y[ 5]; - yr[ 1] = fx_3_003*y[ 1]; - yr[ 2] = fx_3_002*y[ 3]-fx_3_001*y[ 5]; - yr[ 3] = fx_3_001*y[ 0]+fx_3_004*y[ 2]; - yr[ 4] = fx_3_005*y[ 4]+fx_3_006*y[ 6]; - yr[ 5] = fx_3_002*y[ 0]+fx_3_001*y[ 2]; - yr[ 6] = fx_3_006*y[ 4]+fx_3_007*y[ 6]; - } - - static REAL fx_4_001 = (REAL) (-sqrt(56.0)/8.0); // -0.935414346694 - static REAL fx_4_002 = (REAL) ( sqrt(8.0)/8.0); // 0.353553390593 - static REAL fx_4_003 = (REAL) (-sqrt(36.0)/8.0); // -0.75 - static REAL fx_4_004 = (REAL) ( sqrt(28.0)/8.0); // 0.661437827766 - static REAL fx_4_005 = (REAL) (-sqrt(8.0)/8.0); // -0.353553390593 - static REAL fx_4_006 = (REAL) ( sqrt(36.0)/8.0); // 0.749999999999 - static REAL fx_4_007 = (REAL) ( sqrt(9.0)/8.0); // 0.37500034698 - static REAL fx_4_008 = (REAL) ( sqrt(20.0)/8.0); // 0.559017511622 - static REAL fx_4_009 = (REAL) ( sqrt(35.0)/8.0); // 0.739510657141 - static REAL fx_4_010 = (REAL) ( sqrt(16.0)/8.0); // 0.5 - static REAL fx_4_011 = (REAL) (-sqrt(28.0)/8.0); // -0.661437827766 - static REAL fx_4_012 = (REAL) ( sqrt(1.0)/8.0); // 0.125 - static REAL fx_4_013 = (REAL) ( sqrt(56.0)/8.0); // 0.935414346692 - - static inline void sh_rotx90_4(REAL y[],REAL yr[]) - { - yr[ 0] = fx_4_001*y[ 5]+fx_4_002*y[ 7]; - yr[ 1] = fx_4_003*y[ 1]+fx_4_004*y[ 3]; - yr[ 2] = fx_4_005*y[ 5]+fx_4_001*y[ 7]; - yr[ 3] = fx_4_004*y[ 1]+fx_4_006*y[ 3]; - yr[ 4] = fx_4_007*y[ 4]+fx_4_008*y[ 6]+fx_4_009*y[ 8]; - yr[ 5] = fx_4_013*y[ 0]+fx_4_002*y[ 2]; - yr[ 6] = fx_4_008*y[ 4]+fx_4_010*y[ 6]+fx_4_011*y[ 8]; - yr[ 7] = fx_4_005*y[ 0]-fx_4_001*y[ 2]; - yr[ 8] = fx_4_009*y[ 4]+fx_4_011*y[ 
6]+fx_4_012*y[ 8]; - }; - - static inline void sh_rotx90_inv_4(REAL y[],REAL yr[]) - { - yr[ 0] = fx_4_013*y[ 5]+fx_4_005*y[ 7]; - yr[ 1] = fx_4_003*y[ 1]+fx_4_004*y[ 3]; - yr[ 2] = fx_4_002*y[ 5]-fx_4_001*y[ 7]; - yr[ 3] = fx_4_004*y[ 1]+fx_4_006*y[ 3]; - yr[ 4] = fx_4_007*y[ 4]+fx_4_008*y[ 6]+fx_4_009*y[ 8]; - yr[ 5] = fx_4_001*y[ 0]+fx_4_005*y[ 2]; - yr[ 6] = fx_4_008*y[ 4]+fx_4_010*y[ 6]+fx_4_011*y[ 8]; - yr[ 7] = fx_4_002*y[ 0]+fx_4_001*y[ 2]; - yr[ 8] = fx_4_009*y[ 4]+fx_4_011*y[ 6]+fx_4_012*y[ 8]; - } - - static REAL fx_5_001 = (REAL) ( sqrt(126.0)/16.0); // 0.70156076002 - static REAL fx_5_002 = (REAL) (-sqrt(120.0)/16.0); // -0.684653196882 - static REAL fx_5_003 = (REAL) ( sqrt(10.0)/16.0); // 0.197642353761 - static REAL fx_5_004 = (REAL) (-sqrt(64.0)/16.0); // -0.5 - static REAL fx_5_005 = (REAL) ( sqrt(192.0)/16.0); // 0.866025403784 - static REAL fx_5_006 = (REAL) ( sqrt(70.0)/16.0); // 0.522912516584 - static REAL fx_5_007 = (REAL) ( sqrt(24.0)/16.0); // 0.306186217848 - static REAL fx_5_008 = (REAL) (-sqrt(162.0)/16.0); // -0.795495128835 - static REAL fx_5_009 = (REAL) ( sqrt(64.0)/16.0); // 0.5 - static REAL fx_5_010 = (REAL) ( sqrt(60.0)/16.0); // 0.484122918274 - static REAL fx_5_011 = (REAL) ( sqrt(112.0)/16.0); // 0.661437827763 - static REAL fx_5_012 = (REAL) ( sqrt(84.0)/16.0); // 0.572821961867 - static REAL fx_5_013 = (REAL) ( sqrt(4.0)/16.0); // 0.125 - static REAL fx_5_014 = (REAL) ( sqrt(42.0)/16.0); // 0.405046293649 - static REAL fx_5_015 = (REAL) ( sqrt(210.0)/16.0); // 0.905711046633 - static REAL fx_5_016 = (REAL) ( sqrt(169.0)/16.0); // 0.8125 - static REAL fx_5_017 = (REAL) (-sqrt(45.0)/16.0); // -0.419262745781 - static REAL fx_5_018 = (REAL) ( sqrt(1.0)/16.0); // 0.0625 - static REAL fx_5_019 = (REAL) (-sqrt(126.0)/16.0); // -0.701561553415 - static REAL fx_5_020 = (REAL) ( sqrt(120.0)/16.0); // 0.684653196881 - static REAL fx_5_021 = (REAL) (-sqrt(10.0)/16.0); // -0.197642353761 - static REAL fx_5_022 = (REAL) (-sqrt(70.0)/16.0); // -0.522913107945 - static REAL fx_5_023 = (REAL) (-sqrt(60.0)/16.0); // -0.48412346577 - - static inline void sh_rotx90_5(REAL y[],REAL yr[]) - { - yr[ 0] = fx_5_001*y[ 5]+fx_5_002*y[ 7]+fx_5_003*y[ 9]; - yr[ 1] = fx_5_004*y[ 1]+fx_5_005*y[ 3]; - yr[ 2] = fx_5_006*y[ 5]+fx_5_007*y[ 7]+fx_5_008*y[ 9]; - yr[ 3] = fx_5_005*y[ 1]+fx_5_009*y[ 3]; - yr[ 4] = fx_5_010*y[ 5]+fx_5_011*y[ 7]+fx_5_012*y[ 9]; - yr[ 5] = fx_5_019*y[ 0]+fx_5_022*y[ 2]+fx_5_023*y[ 4]; - yr[ 6] = fx_5_013*y[ 6]+fx_5_014*y[ 8]+fx_5_015*y[ 10]; - yr[ 7] = fx_5_020*y[ 0]-fx_5_007*y[ 2]-fx_5_011*y[ 4]; - yr[ 8] = fx_5_014*y[ 6]+fx_5_016*y[ 8]+fx_5_017*y[ 10]; - yr[ 9] = fx_5_021*y[ 0]-fx_5_008*y[ 2]-fx_5_012*y[ 4]; - yr[ 10] = fx_5_015*y[ 6]+fx_5_017*y[ 8]+fx_5_018*y[ 10]; - }; - - static inline void sh_rotx90_inv_5(REAL y[],REAL yr[]) - { - yr[ 0] = fx_5_019*y[ 5]+fx_5_020*y[ 7]+fx_5_021*y[ 9]; - yr[ 1] = fx_5_004*y[ 1]+fx_5_005*y[ 3]; - yr[ 2] = fx_5_022*y[ 5]-fx_5_007*y[ 7]-fx_5_008*y[ 9]; - yr[ 3] = fx_5_005*y[ 1]+fx_5_009*y[ 3]; - yr[ 4] = fx_5_023*y[ 5]-fx_5_011*y[ 7]-fx_5_012*y[ 9]; - yr[ 5] = fx_5_001*y[ 0]+fx_5_006*y[ 2]+fx_5_010*y[ 4]; - yr[ 6] = fx_5_013*y[ 6]+fx_5_014*y[ 8]+fx_5_015*y[ 10]; - yr[ 7] = fx_5_002*y[ 0]+fx_5_007*y[ 2]+fx_5_011*y[ 4]; - yr[ 8] = fx_5_014*y[ 6]+fx_5_016*y[ 8]+fx_5_017*y[ 10]; - yr[ 9] = fx_5_003*y[ 0]+fx_5_008*y[ 2]+fx_5_012*y[ 4]; - yr[ 10] = fx_5_015*y[ 6]+fx_5_017*y[ 8]+fx_5_018*y[ 10]; - } - - static inline void sh_rot_1(REAL m[3*3],REAL y[NL1],REAL yr[NL1]) - { - REAL yr0 = m[4]*y[0] - m[5]*y[1] + m[3]*y[2]; - REAL 
yr1 = m[8]*y[1] - m[7]*y[0] - m[6]*y[2]; - REAL yr2 = m[1]*y[0] - m[2]*y[1] + m[0]*y[2]; - - yr[0] = yr0; - yr[1] = yr1; - yr[2] = yr2; - } - - static inline void sh_roty_1(REAL ctm[1],REAL stm[1],REAL y[NL1],REAL yr[NL1]) - { - yr[0] = y[0]; - rot_inv(ctm[0],stm[0],y[1],y[2],yr[1],yr[2]); - } - - static inline void sh_roty_2(REAL ctm[2],REAL stm[2],REAL y[NL2],REAL yr[NL2]) - { - REAL ytmp[NL2]; - sh_rotx90_2(y,yr); - sh_rotz_2(ctm,stm,yr,ytmp); - sh_rotx90_inv_2(ytmp,yr); - } - - static inline void sh_roty_3(REAL ctm[3],REAL stm[3],REAL y[NL3],REAL yr[NL3]) - { - REAL ytmp[NL3]; - sh_rotx90_3(y,yr); - sh_rotz_3(ctm,stm,yr,ytmp); - sh_rotx90_inv_3(ytmp,yr); - } - - static inline void sh_roty_4(REAL ctm[4],REAL stm[4],REAL y[NL4],REAL yr[NL4]) - { - REAL ytmp[NL4]; - sh_rotx90_4(y,yr); - sh_rotz_4(ctm,stm,yr,ytmp); - sh_rotx90_inv_4(ytmp,yr); - } - - static inline void sh_roty_5(REAL ctm[5],REAL stm[5],REAL y[NL5],REAL yr[NL5]) - { - REAL ytmp[NL5]; - sh_rotx90_5(y,yr); - sh_rotz_5(ctm,stm,yr,ytmp); - sh_rotx90_inv_5(ytmp,yr); - } - - #define ROT_TOL CONSTANT(1e-4) - - /* - Finds cosine,sine pairs for zyz rotation (i.e. rotation R_z2 R_y R_z1 v). - The rotation is one which maps mx to (1,0,0) and mz to (0,0,1). - */ - static inline void zyz(REAL m[3*3],REAL &zc1,REAL &zs1,REAL &yc,REAL &ys,REAL &zc2,REAL &zs2) - { - REAL cz = m[8]; - - // rotate so that (cx,cy,0) aligns to (1,0,0) - REAL cxylen = (REAL) sqrtf(1.0f - cz*cz); - if (cxylen >= ROT_TOL) - { - // if above is a NaN, will do the correct thing - yc = cz; - ys = cxylen; - REAL len67inv = 1.0f/sqrtf(m[6]*m[6] + m[7]*m[7]); - zc1 = -m[6]*len67inv; - zs1 = m[7]*len67inv; - REAL len25inv = 1.0f/sqrtf(m[2]*m[2] + m[5]*m[5]); - zc2 = m[2]*len25inv; - zs2 = m[5]*len25inv; - } else { // m[6],m[7],m[8] already aligned to (0,0,1) - zc1 = 1.0; zs1 = 0.0; // identity - yc = cz; ys = 0.0; // identity - zc2 = m[0]*cz; zs2 = -m[1]; // align x axis (mx[0],mx[1],0) to (1,0,0) - } - } - - static inline void sh_rotzyz_2(REAL zc1m[2],REAL zs1m[2],REAL ycm[2],REAL ysm[2],REAL zc2m[2],REAL zs2m[2],REAL y[NL2],REAL yr[NL2]) - { - REAL ytmp[NL2]; - sh_rotz_2(zc1m,zs1m,y,yr); - sh_roty_2(ycm,ysm,yr,ytmp); - sh_rotz_2(zc2m,zs2m,ytmp,yr); - } - - static inline void sh_rotzyz_3(REAL zc1m[3],REAL zs1m[3],REAL ycm[3],REAL ysm[3],REAL zc2m[3],REAL zs2m[3],REAL y[NL3],REAL yr[NL3]) - { - REAL ytmp[NL3]; - sh_rotz_3(zc1m,zs1m,y,yr); - sh_roty_3(ycm,ysm,yr,ytmp); - sh_rotz_3(zc2m,zs2m,ytmp,yr); - } - - static inline void sh_rotzyz_4(REAL zc1m[4],REAL zs1m[4],REAL ycm[4],REAL ysm[4],REAL zc2m[4],REAL zs2m[4],REAL y[NL4],REAL yr[NL4]) - { - REAL ytmp[NL4]; - sh_rotz_4(zc1m,zs1m,y,yr); - sh_roty_4(ycm,ysm,yr,ytmp); - sh_rotz_4(zc2m,zs2m,ytmp,yr); - } - - static inline void sh_rotzyz_5(REAL zc1m[5],REAL zs1m[5],REAL ycm[5],REAL ysm[5],REAL zc2m[5],REAL zs2m[5],REAL y[NL5],REAL yr[NL5]) - { - REAL ytmp[NL5]; - sh_rotz_5(zc1m,zs1m,y,yr); - sh_roty_5(ycm,ysm,yr,ytmp); - sh_rotz_5(zc2m,zs2m,ytmp,yr); - } - - static inline void sh3_rot(REAL m[3*3],REAL zc1,REAL zs1,REAL yc,REAL ys,REAL zc2,REAL zs2,REAL y[NSH3],REAL yr[NSH3]) - { - REAL zc1m[3],zs1m[3]; - rot_3(zc1,zs1,zc1m,zs1m); - REAL ycm[3],ysm[3]; - rot_3(yc,ys,ycm,ysm); - REAL zc2m[3],zs2m[3]; - rot_3(zc2,zs2,zc2m,zs2m); - - yr[0] = y[0]; - sh_rot_1(m,y+NSH0,yr+NSH0); - sh_rotzyz_2(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH1,yr+NSH1); - sh_rotzyz_3(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH2,yr+NSH2); - } - - static inline void sh4_rot(REAL m[3*3],REAL zc1,REAL zs1,REAL yc,REAL ys,REAL zc2,REAL zs2,REAL y[NSH4],REAL yr[NSH4]) - { - REAL 
zc1m[4],zs1m[4]; - rot_4(zc1,zs1,zc1m,zs1m); - REAL ycm[4],ysm[4]; - rot_4(yc,ys,ycm,ysm); - REAL zc2m[4],zs2m[4]; - rot_4(zc2,zs2,zc2m,zs2m); - - yr[0] = y[0]; - sh_rot_1(m,y+NSH0,yr+NSH0); - sh_rotzyz_2(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH1,yr+NSH1); - sh_rotzyz_3(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH2,yr+NSH2); - sh_rotzyz_4(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH3,yr+NSH3); - } - - static inline void sh5_rot(REAL m[3*3],REAL zc1,REAL zs1,REAL yc,REAL ys,REAL zc2,REAL zs2,REAL y[NSH5],REAL yr[NSH5]) - { - REAL zc1m[5],zs1m[5]; - rot_5(zc1,zs1,zc1m,zs1m); - REAL ycm[5],ysm[5]; - rot_5(yc,ys,ycm,ysm); - REAL zc2m[5],zs2m[5]; - rot_5(zc2,zs2,zc2m,zs2m); - - yr[0] = y[0]; - sh_rot_1(m,y+NSH0,yr+NSH0); - sh_rotzyz_2(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH1,yr+NSH1); - sh_rotzyz_3(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH2,yr+NSH2); - sh_rotzyz_4(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH3,yr+NSH3); - sh_rotzyz_5(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH4,yr+NSH4); - } - - inline void sh1_rot(REAL m[3*3],REAL y[NSH1],REAL yr[NSH1]) - { - yr[0] = y[0]; - sh_rot_1(m,y+NSH0,yr+NSH0); - } - - inline void sh3_rot(REAL m[3*3],REAL y[NSH3],REAL yr[NSH3]) - { - REAL zc1,zs1,yc,ys,zc2,zs2; - zyz(m,zc1,zs1,yc,ys,zc2,zs2); - sh3_rot(m,zc1,zs1,yc,ys,zc2,zs2,y,yr); - } - - inline void sh4_rot(REAL m[3*3],REAL y[NSH4],REAL yr[NSH4]) - { - REAL zc1,zs1,yc,ys,zc2,zs2; - zyz(m,zc1,zs1,yc,ys,zc2,zs2); - sh4_rot(m,zc1,zs1,yc,ys,zc2,zs2,y,yr); - } - - inline void sh5_rot(REAL m[3*3],REAL y[NSH5],REAL yr[NSH5]) - { - REAL zc1,zs1,yc,ys,zc2,zs2; - zyz(m,zc1,zs1,yc,ys,zc2,zs2); - sh5_rot(m,zc1,zs1,yc,ys,zc2,zs2,y,yr); - } - - // simple matrix vector multiply for a square matrix (only used by ZRotation) - static inline void SimpMatMul(size_t dim, const float *matrix, const float *input, float *result) - { - for(size_t iR=0; iR < dim; ++iR) - { - result[iR + 0] = matrix[iR*dim + 0] * input[0]; - for(size_t iC=1; iC < dim; ++iC) - { - result[iR] += matrix[iR*dim+ iC] * input[iC]; - } - } - } - -}; // anonymous namespace - - -namespace DirectX -{ - -//------------------------------------------------------------------------------------- -// Evaluates the Spherical Harmonic basis functions -// -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205448.aspx -//------------------------------------------------------------------------------------- -float* XM_CALLCONV XMSHEvalDirection( _Out_writes_(order*order) float *result, - _In_ size_t order, - _In_ FXMVECTOR dir ) -{ - if ( !result ) - return nullptr; - - XMFLOAT4A dv; - XMStoreFloat4A( &dv, dir ); - - const float fX = dv.x; - const float fY = dv.y; - const float fZ = dv.z; - - switch( order ) - { - case 2: - sh_eval_basis_1(fX,fY,fZ,result); - break; - - case 3: - sh_eval_basis_2(fX,fY,fZ,result); - break; - - case 4: - sh_eval_basis_3(fX,fY,fZ,result); - break; - - case 5: - sh_eval_basis_4(fX,fY,fZ,result); - break; - - case 6: - sh_eval_basis_5(fX,fY,fZ,result); - break; - - default: - assert( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ); - return nullptr; - } - - return result; -} - - -//------------------------------------------------------------------------------------- -// Rotates SH vector by a rotation matrix -// -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204992.aspx -//------------------------------------------------------------------------------------- -float* XM_CALLCONV XMSHRotate( _Out_writes_(order*order) float *result, - _In_ size_t order, - _In_ FXMMATRIX rotMatrix, - _In_reads_(order*order) const float *input ) -{ - if ( !result || !input ) - return nullptr; - 
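- // Illustrative usage sketch (an assumption for this note, not part of the
- // original source; 'sh' and 'rotated' are hypothetical caller-side buffers):
- //
- //   float sh[9];                              // e.g. filled by XMSHEvalDirection
- //   float rotated[9];
- //   XMMATRIX rot = XMMatrixRotationY( XM_PIDIV2 );
- //   if ( !XMSHRotate( rotated, 3, rot, sh ) )
- //       { /* nullptr return: bad order or aliased/null pointers */ }
- //
- // Rotation cannot be done in place; each output coefficient mixes several
- // input coefficients, so the aliasing check below rejects result == input.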
- if( result == input ) - return nullptr; - - XMFLOAT3X3 mat; - XMStoreFloat3x3( &mat, rotMatrix ); - - float mRot[3*3]; - const float r00 = mRot[0*3 +0] = mat._11; - const float r10 = mRot[1*3 +0] = mat._12; - const float r20 = mRot[2*3 +0] = mat._13; - - const float r01 = mRot[0*3 +1] = mat._21; - const float r11 = mRot[1*3 +1] = mat._22; - const float r21 = mRot[2*3 +1] = mat._23; - - const float r02 = mRot[0*3 +2] = mat._31; - const float r12 = mRot[1*3 +2] = mat._32; - const float r22 = mRot[2*3 +2] = mat._33; - - result[0] = input[0]; // rotate the constant term - - switch (order) - { - case 2: - { - // do linear by hand... - - result[1] = r11*input[1] - r12*input[2] + r10*input[3]; - result[2] = -r21*input[1] + r22*input[2] - r20*input[3]; - result[3] = r01*input[1] - r02*input[2] + r00*input[3]; - } - break; - - case 3: - { - float R[25]; - // do linear by hand... - - result[1] = r11*input[1] - r12*input[2] + r10*input[3]; - result[2] = -r21*input[1] + r22*input[2] - r20*input[3]; - result[3] = r01*input[1] - r02*input[2] + r00*input[3]; - - // direct code for quadratics is faster than ZYZ recurrence relations - - const float t41 = r01 * r00; - const float t43 = r11 * r10; - const float t48 = r11 * r12; - const float t50 = r01 * r02; - const float t55 = r02 * r02; - const float t57 = r22 * r22; - const float t58 = r12 * r12; - const float t61 = r00 * r02; - const float t63 = r10 * r12; - const float t68 = r10 * r10; - const float t70 = r01 * r01; - const float t72 = r11 * r11; - const float t74 = r00 * r00; - const float t76 = r21 * r21; - const float t78 = r20 * r20; - - const float v173 = 0.1732050808e1f; - const float v577 = 0.5773502693e0f; - const float v115 = 0.1154700539e1f; - const float v288 = 0.2886751347e0f; - const float v866 = 0.8660254040e0f; - - R[0] = r11 * r00 + r01 * r10; - R[1] = - r01 * r12 - r11 * r02; - R[2] = v173 * r02 * r12; - R[3] = - r10 * r02 - r00 * r12; - R[4] = r00 * r10 - r01 * r11; - R[5] = - r11 * r20 - r21 * r10; - R[6] = r11 * r22 + r21 * r12; - R[7] = -v173 * r22 * r12; - R[8] = r20 * r12 + r10 * r22; - R[9] = - r10 * r20 + r11 * r21; - R[10] = - v577* (t41 + t43) + v115 * r21 * r20; - R[11] = v577* (t48 + t50) - v115 * r21 * r22; - R[12] = -0.5000000000e0f * (t55 + t58) + t57; - R[13] = v577 * (t61 + t63) - v115 * r20 * r22; - R[14] = v288 * (t70 - t68 + t72 - t74) - v577 * (t76 - t78); - R[15] = - r01 * r20 - r21 * r00; - R[16] = r01 * r22 + r21 * r02; - R[17] = -v173 * r22 * r02; - R[18] = r00 * r22 + r20 * r02; - R[19] = - r00 * r20 + r01 * r21; - R[20] = t41 - t43; - R[21] = - t50 + t48; - R[22] = v866 * (t55 - t58); - R[23] = t63 - t61; - R[24] = 0.5000000000e0f *( t74 - t68 - t70 + t72); - - // blow the matrix multiply out by hand; looping is inefficient on a P4...
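- // The loop below applies the dense 5x5 matrix R to the five quadratic (l=2)
- // coefficients, i.e. it is the hand-unrolled equivalent of the sketch
- //
- //   SimpMatMul( 5, R, input + 4, result + 4 );
- //
- // (SimpMatMul is the generic square matrix/vector multiply defined above),
- // with each row's dot product written out explicitly.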
- for(unsigned int iR=0; iR<5;iR++) - { - const unsigned int uBase = iR*5; - result[4 + iR] = R[uBase + 0]*input[4] + R[uBase + 1]*input[5] + R[uBase + 2]*input[6] + R[uBase + 3]*input[7] + R[uBase + 4]*input[8]; - } - } - break; - - case 4: - sh3_rot(mRot,const_cast<float*>(input),result); - break; - - case 5: - sh4_rot(mRot,const_cast<float*>(input),result); - break; - - case 6: - sh5_rot(mRot,const_cast<float*>(input),result); - break; - - default: - assert( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ); - return nullptr; - } - - return result; -} - - -//------------------------------------------------------------------------------------- -// Rotates the SH vector about the Z axis by an angle -// -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205461.aspx -//------------------------------------------------------------------------------------- -float* XMSHRotateZ( _Out_writes_(order*order) float *result, - _In_ size_t order, - _In_ float angle, - _In_reads_(order*order) const float *input ) -{ - if ( !result || !input ) - return nullptr; - - if( result == input ) - return nullptr; - - if ( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ) - return nullptr; - - float R[(2*(XM_SH_MAXORDER-1) + 1)*(2*(XM_SH_MAXORDER-1) + 1)]; // used to store rotation matrices... - - // these are actually very sparse matrices; most of the entries are zeros... - - const float ca = cosf(angle); - const float sa = sinf(angle); - - const float t1 = ca; - const float t2 = sa; - R[0] = t1; - R[1] = 0.0f; - R[2] = t2; - R[3] = 0.0f; - R[4] = 1.0f; - R[5] = 0.0f; - R[6] = -t2; - R[7] = 0.0f; - R[8] = t1; - - result[0] = input[0]; - SimpMatMul(3,R,input+1,result+1); - - if (order > 2) - { - for(int j=0;j<5*5;j++) R[j] = 0.0f; - const float t1 = sa; - const float t2 = t1*t1; - const float t3 = ca; - const float t4 = t3*t3; - const float t5 = -t2+t4; - const float t7 = 2.0f*t3*t1; - R[0] = t5; - R[4] = t7; - R[6] = t3; - R[8] = t1; - R[12] = 1.0f; - R[16] = -t1; - R[18] = t3; - R[20] = -t7; - R[24] = t5; - - SimpMatMul(5,R,input+4,result+4); // un-roll matrix/vector multiply - if (order > 3) - { - for(int j=0;j<7*7;j++) R[j] = 0.0f; - const float t1 = ca; - const float t2 = t1*t1; - const float t4 = sa; - const float t5 = t4*t4; - const float t8 = t2*t1-3.0f*t1*t5; - const float t12 = 3.0f*t4*t2-t5*t4; - const float t13 = -t5+t2; - const float t15 = 2.0f*t1*t4; - R[0] = t8; - R[6] = t12; - R[8] = t13; - R[12] = t15; - R[16] = t1; - R[18] = t4; - R[24] = 1.0f; - R[30] = -t4; - R[32] = t1; - R[36] = -t15; - R[40] = t13; - R[42] = -t12; - R[48] = t8; - SimpMatMul(7,R,input+9,result+9); - if (order > 4) - { - for(int j=0;j<9*9;j++) R[j] = 0.0f; - const float t1 = ca; - const float t2 = t1*t1; - const float t3 = t2*t2; - const float t4 = sa; - const float t5 = t4*t4; - const float t6 = t5*t5; - const float t9 = t3+t6-6.0f*t5*t2; - const float t10 = t5*t4; - const float t12 = t2*t1; - const float t14 = -t10*t1+t4*t12; - const float t17 = t12-3.0f*t1*t5; - const float t20 = 3.0f*t4*t2-t10; - const float t21 = -t5+t2; - const float t23 = 2.0f*t1*t4; - R[0] = t9; - R[8] = 4.0f*t14; - R[10] = t17; - R[16] = t20; - R[20] = t21; - R[24] = t23; - R[30] = t1; - R[32] = t4; - R[40] = 1.0f; - R[48] = -t4; - R[50] = t1; - R[56] = -t23; - R[60] = t21; - R[64] = -t20; - R[70] = t17; - R[72] = -4.0f*t14; - R[80] = t9; - - SimpMatMul(9,R,input+16,result+16); - if (order > 5) - { - for(int j=0;j<11*11;j++) R[j] = 0.0f; - const float t1 = ca; - const float t2 = sa; - const float t3 = t2*t2; - const float t4 = t3*t3; - const float t7 = t1*t1; -
const float t8 = t7*t1; - const float t11 = t7*t7; - const float t13 = 5.0f*t1*t4-10.0f*t3*t8+t11*t1; - const float t14 = t3*t2; - const float t20 = -10.0f*t14*t7+5.0f*t2*t11+t4*t2; - const float t23 = t11+t4-6.0f*t3*t7; - const float t26 = -t14*t1+t2*t8; - const float t29 = t8-3.0f*t1*t3; - const float t32 = 3.0f*t2*t7-t14; - const float t33 = -t3+t7; - const float t35 = 2.0f*t1*t2; - R[0] = t13; - R[10] = t20; - R[12] = t23; - R[20] = 4.0f*t26; - R[24] = t29; - R[30] = t32; - R[36] = t33; - R[40] = t35; - R[48] = t1; - R[50] = t2; - R[60] = 1.0f; - R[70] = -t2; - R[72] = t1; - R[80] = -t35; - R[84] = t33; - R[90] = -t32; - R[96] = t29; - R[100] = -4.0f*t26; - R[108] = t23; - R[110] = -t20; - R[120] = t13; - SimpMatMul(11,R,input+25,result+25); - } - } - } - } - - return result; -} - - -//------------------------------------------------------------------------------------- -// Adds two SH vectors, result[i] = inputA[i] + inputB[i]; -// -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205438.aspx -//------------------------------------------------------------------------------------- -float* XMSHAdd( _Out_writes_(order*order) float *result, - _In_ size_t order, - _In_reads_(order*order) const float *inputA, - _In_reads_(order*order) const float *inputB ) -{ - if ( !result || !inputA || !inputB ) - return nullptr; - - const size_t numcoeff = order*order; - - for( size_t i=0; i < numcoeff; ++i ) - { - result[i] = inputA[i] + inputB[i]; - } - - return result; -} - - -//------------------------------------------------------------------------------------- -// Scales an SH vector, result[i] = input[i] * scale; -// -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204994.aspx -//------------------------------------------------------------------------------------- -float* XMSHScale( _Out_writes_(order*order) float *result, - _In_ size_t order, - _In_reads_(order*order) const float *input, - _In_ float scale ) -{ - if ( !result || !input ) - return nullptr; - - const size_t numcoeff = order*order; - - for( size_t i=0; i < numcoeff; ++i ) - { - result[i] = scale * input[i]; - } - - return result; -} - - -//------------------------------------------------------------------------------------- -// Computes the dot product of two SH vectors -// -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205446.aspx -//------------------------------------------------------------------------------------- -float XMSHDot( _In_ size_t order, _In_reads_(order*order) const float *inputA, _In_reads_(order*order) const float *inputB ) -{ - if ( !inputA || !inputB ) - return 0.f; - - float result = inputA[0] * inputB[0]; - - const size_t numcoeff = order*order; - - for( size_t i=1; i < numcoeff; ++i ) - { - result += inputA[i] * inputB[i]; - } - - return result; -} - - -//------------------------------------------------------------------------------------- -// Computes the product of two functions represented using SH (f and g), where: -// result[i] = int(y_i(s) * f(s) * g(s)), where y_i(s) is the ith SH basis -// function, f(s) and g(s) are SH functions (sum_i(y_i(s)*c_i)). The order O -// determines the lengths of the arrays, where there should always be O^2 -// coefficients. In general the product of two SH functions of order O generates -// an SH function of order 2*O - 1, but we truncate the result. This means -// that the product commutes (f*g == g*f) but doesn't associate -// (f*(g*h) != (f*g)*h).
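-// For example (an illustrative count, not from the original comment): two
-// order-3 inputs of 9 coefficients each have an exact product of order
-// 2*3 - 1 = 5, which would need 25 coefficients; XMSHMultiply3 keeps only the
-// first 9, and that truncation is what breaks associativity.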
-//------------------------------------------------------------------------------------- -float* XMSHMultiply( _Out_writes_(order*order) float *result, - _In_ size_t order, - _In_reads_(order*order) const float *inputF, - _In_reads_(order*order) const float *inputG ) -{ - switch( order ) - { - case 2: - return XMSHMultiply2( result, inputF, inputG ); - - case 3: - return XMSHMultiply3( result, inputF, inputG ); - - case 4: - return XMSHMultiply4( result, inputF, inputG ); - - case 5: - return XMSHMultiply5( result, inputF, inputG ); - - case 6: - return XMSHMultiply6( result, inputF, inputG ); - - default: - assert( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ); - return nullptr; - } -} - - -//------------------------------------------------------------------------------------- -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205454.aspx -//------------------------------------------------------------------------------------- -float* XMSHMultiply2( _Out_writes_(4) float *y, - _In_reads_(4) const float *f, - _In_reads_(4) const float *g ) -{ - if ( !y || !f || !g ) - return nullptr; - - REAL tf,tg,t; - // [0,0]: 0, - y[0] = CONSTANT(0.282094792935999980)*f[0]*g[0]; - - // [1,1]: 0, - tf = CONSTANT(0.282094791773000010)*f[0]; - tg = CONSTANT(0.282094791773000010)*g[0]; - y[1] = tf*g[1]+tg*f[1]; - t = f[1]*g[1]; - y[0] += CONSTANT(0.282094791773000010)*t; - - // [2,2]: 0, - tf = CONSTANT(0.282094795249000000)*f[0]; - tg = CONSTANT(0.282094795249000000)*g[0]; - y[2] = tf*g[2]+tg*f[2]; - t = f[2]*g[2]; - y[0] += CONSTANT(0.282094795249000000)*t; - - // [3,3]: 0, - tf = CONSTANT(0.282094791773000010)*f[0]; - tg = CONSTANT(0.282094791773000010)*g[0]; - y[3] = tf*g[3]+tg*f[3]; - t = f[3]*g[3]; - y[0] += CONSTANT(0.282094791773000010)*t; - - // multiply count=20 - - return y; -} - - -//------------------------------------------------------------------------------------- -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232906.aspx -//------------------------------------------------------------------------------------- -float* XMSHMultiply3( _Out_writes_(9) float *y, - _In_reads_(9) const float *f, - _In_reads_(9) const float *g ) -{ - if ( !y || !f || !g ) - return nullptr; - - REAL tf,tg,t; - // [0,0]: 0, - y[0] = CONSTANT(0.282094792935999980)*f[0]*g[0]; - - // [1,1]: 0,6,8, - tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(-0.218509686119999990)*f[8]; - tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(-0.218509686119999990)*g[8]; - y[1] = tf*g[1]+tg*f[1]; - t = f[1]*g[1]; - y[0] += CONSTANT(0.282094791773000010)*t; - y[6] = CONSTANT(-0.126156626101000010)*t; - y[8] = CONSTANT(-0.218509686119999990)*t; - - // [1,2]: 5, - tf = CONSTANT(0.218509686118000010)*f[5]; - tg = CONSTANT(0.218509686118000010)*g[5]; - y[1] += tf*g[2]+tg*f[2]; - y[2] = tf*g[1]+tg*f[1]; - t = f[1]*g[2]+f[2]*g[1]; - y[5] = CONSTANT(0.218509686118000010)*t; - - // [1,3]: 4, - tf = CONSTANT(0.218509686114999990)*f[4]; - tg = CONSTANT(0.218509686114999990)*g[4]; - y[1] += tf*g[3]+tg*f[3]; - y[3] = tf*g[1]+tg*f[1]; - t = f[1]*g[3]+f[3]*g[1]; - y[4] = CONSTANT(0.218509686114999990)*t; - - // [2,2]: 0,6, - tf = CONSTANT(0.282094795249000000)*f[0]+CONSTANT(0.252313259986999990)*f[6]; - tg = CONSTANT(0.282094795249000000)*g[0]+CONSTANT(0.252313259986999990)*g[6]; - y[2] += tf*g[2]+tg*f[2]; - t = f[2]*g[2]; - y[0] += CONSTANT(0.282094795249000000)*t; - y[6] += CONSTANT(0.252313259986999990)*t; - - // [2,3]: 7, - tf = 
CONSTANT(0.218509686118000010)*f[7]; - tg = CONSTANT(0.218509686118000010)*g[7]; - y[2] += tf*g[3]+tg*f[3]; - y[3] += tf*g[2]+tg*f[2]; - t = f[2]*g[3]+f[3]*g[2]; - y[7] = CONSTANT(0.218509686118000010)*t; - - // [3,3]: 0,6,8, - tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(0.218509686119999990)*f[8]; - tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(0.218509686119999990)*g[8]; - y[3] += tf*g[3]+tg*f[3]; - t = f[3]*g[3]; - y[0] += CONSTANT(0.282094791773000010)*t; - y[6] += CONSTANT(-0.126156626101000010)*t; - y[8] += CONSTANT(0.218509686119999990)*t; - - // [4,4]: 0,6, - tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]; - tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]; - y[4] += tf*g[4]+tg*f[4]; - t = f[4]*g[4]; - y[0] += CONSTANT(0.282094791770000020)*t; - y[6] += CONSTANT(-0.180223751576000010)*t; - - // [4,5]: 7, - tf = CONSTANT(0.156078347226000000)*f[7]; - tg = CONSTANT(0.156078347226000000)*g[7]; - y[4] += tf*g[5]+tg*f[5]; - y[5] += tf*g[4]+tg*f[4]; - t = f[4]*g[5]+f[5]*g[4]; - y[7] += CONSTANT(0.156078347226000000)*t; - - // [5,5]: 0,6,8, - tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.090111875786499998)*f[6]+CONSTANT(-0.156078347227999990)*f[8]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.090111875786499998)*g[6]+CONSTANT(-0.156078347227999990)*g[8]; - y[5] += tf*g[5]+tg*f[5]; - t = f[5]*g[5]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[6] += CONSTANT(0.090111875786499998)*t; - y[8] += CONSTANT(-0.156078347227999990)*t; - - // [6,6]: 0,6, - tf = CONSTANT(0.282094797560000000)*f[0]; - tg = CONSTANT(0.282094797560000000)*g[0]; - y[6] += tf*g[6]+tg*f[6]; - t = f[6]*g[6]; - y[0] += CONSTANT(0.282094797560000000)*t; - y[6] += CONSTANT(0.180223764527000010)*t; - - // [7,7]: 0,6,8, - tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.090111875786499998)*f[6]+CONSTANT(0.156078347227999990)*f[8]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.090111875786499998)*g[6]+CONSTANT(0.156078347227999990)*g[8]; - y[7] += tf*g[7]+tg*f[7]; - t = f[7]*g[7]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[6] += CONSTANT(0.090111875786499998)*t; - y[8] += CONSTANT(0.156078347227999990)*t; - - // [8,8]: 0,6, - tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]; - tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]; - y[8] += tf*g[8]+tg*f[8]; - t = f[8]*g[8]; - y[0] += CONSTANT(0.282094791770000020)*t; - y[6] += CONSTANT(-0.180223751576000010)*t; - - // multiply count=120 - - return y; -} - - -//------------------------------------------------------------------------------------- -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232907.aspx -//------------------------------------------------------------------------------------- -float* XMSHMultiply4( _Out_writes_(16) float *y, - _In_reads_(16) const float *f, - _In_reads_(16) const float *g ) -{ - if ( !y || !f || !g ) - return nullptr; - - REAL tf,tg,t; - // [0,0]: 0, - y[0] = CONSTANT(0.282094792935999980)*f[0]*g[0]; - - // [1,1]: 0,6,8, - tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(-0.218509686119999990)*f[8]; - tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(-0.218509686119999990)*g[8]; - y[1] = tf*g[1]+tg*f[1]; - t = f[1]*g[1]; - y[0] += CONSTANT(0.282094791773000010)*t; - y[6] = CONSTANT(-0.126156626101000010)*t; - y[8] = 
CONSTANT(-0.218509686119999990)*t; - - // [1,4]: 3,13,15, - tf = CONSTANT(0.218509686114999990)*f[3]+CONSTANT(-0.058399170082300000)*f[13]+CONSTANT(-0.226179013157999990)*f[15]; - tg = CONSTANT(0.218509686114999990)*g[3]+CONSTANT(-0.058399170082300000)*g[13]+CONSTANT(-0.226179013157999990)*g[15]; - y[1] += tf*g[4]+tg*f[4]; - y[4] = tf*g[1]+tg*f[1]; - t = f[1]*g[4]+f[4]*g[1]; - y[3] = CONSTANT(0.218509686114999990)*t; - y[13] = CONSTANT(-0.058399170082300000)*t; - y[15] = CONSTANT(-0.226179013157999990)*t; - - // [1,5]: 2,12,14, - tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12]+CONSTANT(-0.184674390923000000)*f[14]; - tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12]+CONSTANT(-0.184674390923000000)*g[14]; - y[1] += tf*g[5]+tg*f[5]; - y[5] = tf*g[1]+tg*f[1]; - t = f[1]*g[5]+f[5]*g[1]; - y[2] = CONSTANT(0.218509686118000010)*t; - y[12] = CONSTANT(-0.143048168103000000)*t; - y[14] = CONSTANT(-0.184674390923000000)*t; - - // [1,6]: 11, - tf = CONSTANT(0.202300659402999990)*f[11]; - tg = CONSTANT(0.202300659402999990)*g[11]; - y[1] += tf*g[6]+tg*f[6]; - y[6] += tf*g[1]+tg*f[1]; - t = f[1]*g[6]+f[6]*g[1]; - y[11] = CONSTANT(0.202300659402999990)*t; - - // [1,8]: 9,11, - tf = CONSTANT(0.226179013155000000)*f[9]+CONSTANT(0.058399170081799998)*f[11]; - tg = CONSTANT(0.226179013155000000)*g[9]+CONSTANT(0.058399170081799998)*g[11]; - y[1] += tf*g[8]+tg*f[8]; - y[8] += tf*g[1]+tg*f[1]; - t = f[1]*g[8]+f[8]*g[1]; - y[9] = CONSTANT(0.226179013155000000)*t; - y[11] += CONSTANT(0.058399170081799998)*t; - - // [2,2]: 0,6, - tf = CONSTANT(0.282094795249000000)*f[0]+CONSTANT(0.252313259986999990)*f[6]; - tg = CONSTANT(0.282094795249000000)*g[0]+CONSTANT(0.252313259986999990)*g[6]; - y[2] += tf*g[2]+tg*f[2]; - t = f[2]*g[2]; - y[0] += CONSTANT(0.282094795249000000)*t; - y[6] += CONSTANT(0.252313259986999990)*t; - - // [2,6]: 12, - tf = CONSTANT(0.247766706973999990)*f[12]; - tg = CONSTANT(0.247766706973999990)*g[12]; - y[2] += tf*g[6]+tg*f[6]; - y[6] += tf*g[2]+tg*f[2]; - t = f[2]*g[6]+f[6]*g[2]; - y[12] += CONSTANT(0.247766706973999990)*t; - - // [3,3]: 0,6,8, - tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(0.218509686119999990)*f[8]; - tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(0.218509686119999990)*g[8]; - y[3] += tf*g[3]+tg*f[3]; - t = f[3]*g[3]; - y[0] += CONSTANT(0.282094791773000010)*t; - y[6] += CONSTANT(-0.126156626101000010)*t; - y[8] += CONSTANT(0.218509686119999990)*t; - - // [3,6]: 13, - tf = CONSTANT(0.202300659402999990)*f[13]; - tg = CONSTANT(0.202300659402999990)*g[13]; - y[3] += tf*g[6]+tg*f[6]; - y[6] += tf*g[3]+tg*f[3]; - t = f[3]*g[6]+f[6]*g[3]; - y[13] += CONSTANT(0.202300659402999990)*t; - - // [3,7]: 2,12,14, - tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12]+CONSTANT(0.184674390923000000)*f[14]; - tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12]+CONSTANT(0.184674390923000000)*g[14]; - y[3] += tf*g[7]+tg*f[7]; - y[7] = tf*g[3]+tg*f[3]; - t = f[3]*g[7]+f[7]*g[3]; - y[2] += CONSTANT(0.218509686118000010)*t; - y[12] += CONSTANT(-0.143048168103000000)*t; - y[14] += CONSTANT(0.184674390923000000)*t; - - // [3,8]: 13,15, - tf = CONSTANT(-0.058399170081799998)*f[13]+CONSTANT(0.226179013155000000)*f[15]; - tg = CONSTANT(-0.058399170081799998)*g[13]+CONSTANT(0.226179013155000000)*g[15]; - y[3] += tf*g[8]+tg*f[8]; - y[8] += tf*g[3]+tg*f[3]; - t = f[3]*g[8]+f[8]*g[3]; - y[13] += 
CONSTANT(-0.058399170081799998)*t; - y[15] += CONSTANT(0.226179013155000000)*t; - - // [4,4]: 0,6, - tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]; - tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]; - y[4] += tf*g[4]+tg*f[4]; - t = f[4]*g[4]; - y[0] += CONSTANT(0.282094791770000020)*t; - y[6] += CONSTANT(-0.180223751576000010)*t; - - // [4,5]: 7, - tf = CONSTANT(0.156078347226000000)*f[7]; - tg = CONSTANT(0.156078347226000000)*g[7]; - y[4] += tf*g[5]+tg*f[5]; - y[5] += tf*g[4]+tg*f[4]; - t = f[4]*g[5]+f[5]*g[4]; - y[7] += CONSTANT(0.156078347226000000)*t; - - // [4,9]: 3,13, - tf = CONSTANT(0.226179013157999990)*f[3]+CONSTANT(-0.094031597258400004)*f[13]; - tg = CONSTANT(0.226179013157999990)*g[3]+CONSTANT(-0.094031597258400004)*g[13]; - y[4] += tf*g[9]+tg*f[9]; - y[9] += tf*g[4]+tg*f[4]; - t = f[4]*g[9]+f[9]*g[4]; - y[3] += CONSTANT(0.226179013157999990)*t; - y[13] += CONSTANT(-0.094031597258400004)*t; - - // [4,10]: 2,12, - tf = CONSTANT(0.184674390919999990)*f[2]+CONSTANT(-0.188063194517999990)*f[12]; - tg = CONSTANT(0.184674390919999990)*g[2]+CONSTANT(-0.188063194517999990)*g[12]; - y[4] += tf*g[10]+tg*f[10]; - y[10] = tf*g[4]+tg*f[4]; - t = f[4]*g[10]+f[10]*g[4]; - y[2] += CONSTANT(0.184674390919999990)*t; - y[12] += CONSTANT(-0.188063194517999990)*t; - - // [4,11]: 3,13,15, - tf = CONSTANT(-0.058399170082300000)*f[3]+CONSTANT(0.145673124078000010)*f[13]+CONSTANT(0.094031597258400004)*f[15]; - tg = CONSTANT(-0.058399170082300000)*g[3]+CONSTANT(0.145673124078000010)*g[13]+CONSTANT(0.094031597258400004)*g[15]; - y[4] += tf*g[11]+tg*f[11]; - y[11] += tf*g[4]+tg*f[4]; - t = f[4]*g[11]+f[11]*g[4]; - y[3] += CONSTANT(-0.058399170082300000)*t; - y[13] += CONSTANT(0.145673124078000010)*t; - y[15] += CONSTANT(0.094031597258400004)*t; - - // [5,5]: 0,6,8, - tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.090111875786499998)*f[6]+CONSTANT(-0.156078347227999990)*f[8]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.090111875786499998)*g[6]+CONSTANT(-0.156078347227999990)*g[8]; - y[5] += tf*g[5]+tg*f[5]; - t = f[5]*g[5]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[6] += CONSTANT(0.090111875786499998)*t; - y[8] += CONSTANT(-0.156078347227999990)*t; - - // [5,9]: 14, - tf = CONSTANT(0.148677009677999990)*f[14]; - tg = CONSTANT(0.148677009677999990)*g[14]; - y[5] += tf*g[9]+tg*f[9]; - y[9] += tf*g[5]+tg*f[5]; - t = f[5]*g[9]+f[9]*g[5]; - y[14] += CONSTANT(0.148677009677999990)*t; - - // [5,10]: 3,13,15, - tf = CONSTANT(0.184674390919999990)*f[3]+CONSTANT(0.115164716490000000)*f[13]+CONSTANT(-0.148677009678999990)*f[15]; - tg = CONSTANT(0.184674390919999990)*g[3]+CONSTANT(0.115164716490000000)*g[13]+CONSTANT(-0.148677009678999990)*g[15]; - y[5] += tf*g[10]+tg*f[10]; - y[10] += tf*g[5]+tg*f[5]; - t = f[5]*g[10]+f[10]*g[5]; - y[3] += CONSTANT(0.184674390919999990)*t; - y[13] += CONSTANT(0.115164716490000000)*t; - y[15] += CONSTANT(-0.148677009678999990)*t; - - // [5,11]: 2,12,14, - tf = CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.059470803871800003)*f[12]+CONSTANT(-0.115164716491000000)*f[14]; - tg = CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.059470803871800003)*g[12]+CONSTANT(-0.115164716491000000)*g[14]; - y[5] += tf*g[11]+tg*f[11]; - y[11] += tf*g[5]+tg*f[5]; - t = f[5]*g[11]+f[11]*g[5]; - y[2] += CONSTANT(0.233596680327000010)*t; - y[12] += CONSTANT(0.059470803871800003)*t; - y[14] += CONSTANT(-0.115164716491000000)*t; - - // [6,6]: 0,6, - tf = CONSTANT(0.282094797560000000)*f[0]; - tg = 
CONSTANT(0.282094797560000000)*g[0]; - y[6] += tf*g[6]+tg*f[6]; - t = f[6]*g[6]; - y[0] += CONSTANT(0.282094797560000000)*t; - y[6] += CONSTANT(0.180223764527000010)*t; - - // [7,7]: 6,0,8, - tf = CONSTANT(0.090111875786499998)*f[6]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.156078347227999990)*f[8]; - tg = CONSTANT(0.090111875786499998)*g[6]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.156078347227999990)*g[8]; - y[7] += tf*g[7]+tg*f[7]; - t = f[7]*g[7]; - y[6] += CONSTANT(0.090111875786499998)*t; - y[0] += CONSTANT(0.282094791773999990)*t; - y[8] += CONSTANT(0.156078347227999990)*t; - - // [7,10]: 9,1,11, - tf = CONSTANT(0.148677009678999990)*f[9]+CONSTANT(0.184674390919999990)*f[1]+CONSTANT(0.115164716490000000)*f[11]; - tg = CONSTANT(0.148677009678999990)*g[9]+CONSTANT(0.184674390919999990)*g[1]+CONSTANT(0.115164716490000000)*g[11]; - y[7] += tf*g[10]+tg*f[10]; - y[10] += tf*g[7]+tg*f[7]; - t = f[7]*g[10]+f[10]*g[7]; - y[9] += CONSTANT(0.148677009678999990)*t; - y[1] += CONSTANT(0.184674390919999990)*t; - y[11] += CONSTANT(0.115164716490000000)*t; - - // [7,13]: 12,2,14, - tf = CONSTANT(0.059470803871800003)*f[12]+CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.115164716491000000)*f[14]; - tg = CONSTANT(0.059470803871800003)*g[12]+CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.115164716491000000)*g[14]; - y[7] += tf*g[13]+tg*f[13]; - y[13] += tf*g[7]+tg*f[7]; - t = f[7]*g[13]+f[13]*g[7]; - y[12] += CONSTANT(0.059470803871800003)*t; - y[2] += CONSTANT(0.233596680327000010)*t; - y[14] += CONSTANT(0.115164716491000000)*t; - - // [7,14]: 15, - tf = CONSTANT(0.148677009677999990)*f[15]; - tg = CONSTANT(0.148677009677999990)*g[15]; - y[7] += tf*g[14]+tg*f[14]; - y[14] += tf*g[7]+tg*f[7]; - t = f[7]*g[14]+f[14]*g[7]; - y[15] += CONSTANT(0.148677009677999990)*t; - - // [8,8]: 0,6, - tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]; - tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]; - y[8] += tf*g[8]+tg*f[8]; - t = f[8]*g[8]; - y[0] += CONSTANT(0.282094791770000020)*t; - y[6] += CONSTANT(-0.180223751576000010)*t; - - // [8,9]: 11, - tf = CONSTANT(-0.094031597259499999)*f[11]; - tg = CONSTANT(-0.094031597259499999)*g[11]; - y[8] += tf*g[9]+tg*f[9]; - y[9] += tf*g[8]+tg*f[8]; - t = f[8]*g[9]+f[9]*g[8]; - y[11] += CONSTANT(-0.094031597259499999)*t; - - // [8,13]: 15, - tf = CONSTANT(-0.094031597259499999)*f[15]; - tg = CONSTANT(-0.094031597259499999)*g[15]; - y[8] += tf*g[13]+tg*f[13]; - y[13] += tf*g[8]+tg*f[8]; - t = f[8]*g[13]+f[13]*g[8]; - y[15] += CONSTANT(-0.094031597259499999)*t; - - // [8,14]: 2,12, - tf = CONSTANT(0.184674390919999990)*f[2]+CONSTANT(-0.188063194517999990)*f[12]; - tg = CONSTANT(0.184674390919999990)*g[2]+CONSTANT(-0.188063194517999990)*g[12]; - y[8] += tf*g[14]+tg*f[14]; - y[14] += tf*g[8]+tg*f[8]; - t = f[8]*g[14]+f[14]*g[8]; - y[2] += CONSTANT(0.184674390919999990)*t; - y[12] += CONSTANT(-0.188063194517999990)*t; - - // [9,9]: 6,0, - tf = CONSTANT(-0.210261043508000010)*f[6]+CONSTANT(0.282094791766999970)*f[0]; - tg = CONSTANT(-0.210261043508000010)*g[6]+CONSTANT(0.282094791766999970)*g[0]; - y[9] += tf*g[9]+tg*f[9]; - t = f[9]*g[9]; - y[6] += CONSTANT(-0.210261043508000010)*t; - y[0] += CONSTANT(0.282094791766999970)*t; - - // [10,10]: 0, - tf = CONSTANT(0.282094791771999980)*f[0]; - tg = CONSTANT(0.282094791771999980)*g[0]; - y[10] += tf*g[10]+tg*f[10]; - t = f[10]*g[10]; - y[0] += CONSTANT(0.282094791771999980)*t; - - // [11,11]: 0,6,8, - tf = 
CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.126156626101000010)*f[6]+CONSTANT(-0.145673124078999990)*f[8]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.126156626101000010)*g[6]+CONSTANT(-0.145673124078999990)*g[8]; - y[11] += tf*g[11]+tg*f[11]; - t = f[11]*g[11]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[6] += CONSTANT(0.126156626101000010)*t; - y[8] += CONSTANT(-0.145673124078999990)*t; - - // [12,12]: 0,6, - tf = CONSTANT(0.282094799871999980)*f[0]+CONSTANT(0.168208852954000010)*f[6]; - tg = CONSTANT(0.282094799871999980)*g[0]+CONSTANT(0.168208852954000010)*g[6]; - y[12] += tf*g[12]+tg*f[12]; - t = f[12]*g[12]; - y[0] += CONSTANT(0.282094799871999980)*t; - y[6] += CONSTANT(0.168208852954000010)*t; - - // [13,13]: 0,8,6, - tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.145673124078999990)*f[8]+CONSTANT(0.126156626101000010)*f[6]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.145673124078999990)*g[8]+CONSTANT(0.126156626101000010)*g[6]; - y[13] += tf*g[13]+tg*f[13]; - t = f[13]*g[13]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[8] += CONSTANT(0.145673124078999990)*t; - y[6] += CONSTANT(0.126156626101000010)*t; - - // [14,14]: 0, - tf = CONSTANT(0.282094791771999980)*f[0]; - tg = CONSTANT(0.282094791771999980)*g[0]; - y[14] += tf*g[14]+tg*f[14]; - t = f[14]*g[14]; - y[0] += CONSTANT(0.282094791771999980)*t; - - // [15,15]: 0,6, - tf = CONSTANT(0.282094791766999970)*f[0]+CONSTANT(-0.210261043508000010)*f[6]; - tg = CONSTANT(0.282094791766999970)*g[0]+CONSTANT(-0.210261043508000010)*g[6]; - y[15] += tf*g[15]+tg*f[15]; - t = f[15]*g[15]; - y[0] += CONSTANT(0.282094791766999970)*t; - y[6] += CONSTANT(-0.210261043508000010)*t; - - // multiply count=399 - - return y; -} - - -//------------------------------------------------------------------------------------- -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232908.aspx -//------------------------------------------------------------------------------------- -float* XMSHMultiply5( _Out_writes_(25) float *y, - _In_reads_(25) const float *f, - _In_reads_(25) const float *g ) -{ - if ( !y || !f || !g ) - return nullptr; - - REAL tf,tg,t; - // [0,0]: 0, - y[0] = CONSTANT(0.282094792935999980)*f[0]*g[0]; - - // [1,1]: 0,6,8, - tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(-0.218509686119999990)*f[8]; - tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(-0.218509686119999990)*g[8]; - y[1] = tf*g[1]+tg*f[1]; - t = f[1]*g[1]; - y[0] += CONSTANT(0.282094791773000010)*t; - y[6] = CONSTANT(-0.126156626101000010)*t; - y[8] = CONSTANT(-0.218509686119999990)*t; - - // [1,4]: 3,13,15, - tf = CONSTANT(0.218509686114999990)*f[3]+CONSTANT(-0.058399170082300000)*f[13]+CONSTANT(-0.226179013157999990)*f[15]; - tg = CONSTANT(0.218509686114999990)*g[3]+CONSTANT(-0.058399170082300000)*g[13]+CONSTANT(-0.226179013157999990)*g[15]; - y[1] += tf*g[4]+tg*f[4]; - y[4] = tf*g[1]+tg*f[1]; - t = f[1]*g[4]+f[4]*g[1]; - y[3] = CONSTANT(0.218509686114999990)*t; - y[13] = CONSTANT(-0.058399170082300000)*t; - y[15] = CONSTANT(-0.226179013157999990)*t; - - // [1,5]: 2,12,14, - tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12]+CONSTANT(-0.184674390923000000)*f[14]; - tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12]+CONSTANT(-0.184674390923000000)*g[14]; - y[1] += tf*g[5]+tg*f[5]; - y[5] = tf*g[1]+tg*f[1]; - t = f[1]*g[5]+f[5]*g[1]; - y[2] = CONSTANT(0.218509686118000010)*t; - y[12] = 
CONSTANT(-0.143048168103000000)*t; - y[14] = CONSTANT(-0.184674390923000000)*t; - - // [1,9]: 8,22,24, - tf = CONSTANT(0.226179013155000000)*f[8]+CONSTANT(-0.043528171378199997)*f[22]+CONSTANT(-0.230329432978999990)*f[24]; - tg = CONSTANT(0.226179013155000000)*g[8]+CONSTANT(-0.043528171378199997)*g[22]+CONSTANT(-0.230329432978999990)*g[24]; - y[1] += tf*g[9]+tg*f[9]; - y[9] = tf*g[1]+tg*f[1]; - t = f[1]*g[9]+f[9]*g[1]; - y[8] += CONSTANT(0.226179013155000000)*t; - y[22] = CONSTANT(-0.043528171378199997)*t; - y[24] = CONSTANT(-0.230329432978999990)*t; - - // [1,10]: 7,21,23, - tf = CONSTANT(0.184674390919999990)*f[7]+CONSTANT(-0.075393004386799994)*f[21]+CONSTANT(-0.199471140200000010)*f[23]; - tg = CONSTANT(0.184674390919999990)*g[7]+CONSTANT(-0.075393004386799994)*g[21]+CONSTANT(-0.199471140200000010)*g[23]; - y[1] += tf*g[10]+tg*f[10]; - y[10] = tf*g[1]+tg*f[1]; - t = f[1]*g[10]+f[10]*g[1]; - y[7] = CONSTANT(0.184674390919999990)*t; - y[21] = CONSTANT(-0.075393004386799994)*t; - y[23] = CONSTANT(-0.199471140200000010)*t; - - // [1,11]: 6,8,20,22, - tf = CONSTANT(0.202300659402999990)*f[6]+CONSTANT(0.058399170081799998)*f[8]+CONSTANT(-0.150786008773000000)*f[20]+CONSTANT(-0.168583882836999990)*f[22]; - tg = CONSTANT(0.202300659402999990)*g[6]+CONSTANT(0.058399170081799998)*g[8]+CONSTANT(-0.150786008773000000)*g[20]+CONSTANT(-0.168583882836999990)*g[22]; - y[1] += tf*g[11]+tg*f[11]; - y[11] = tf*g[1]+tg*f[1]; - t = f[1]*g[11]+f[11]*g[1]; - y[6] += CONSTANT(0.202300659402999990)*t; - y[8] += CONSTANT(0.058399170081799998)*t; - y[20] = CONSTANT(-0.150786008773000000)*t; - y[22] += CONSTANT(-0.168583882836999990)*t; - - // [1,12]: 19, - tf = CONSTANT(0.194663900273000010)*f[19]; - tg = CONSTANT(0.194663900273000010)*g[19]; - y[1] += tf*g[12]+tg*f[12]; - y[12] += tf*g[1]+tg*f[1]; - t = f[1]*g[12]+f[12]*g[1]; - y[19] = CONSTANT(0.194663900273000010)*t; - - // [1,13]: 18, - tf = CONSTANT(0.168583882834000000)*f[18]; - tg = CONSTANT(0.168583882834000000)*g[18]; - y[1] += tf*g[13]+tg*f[13]; - y[13] += tf*g[1]+tg*f[1]; - t = f[1]*g[13]+f[13]*g[1]; - y[18] = CONSTANT(0.168583882834000000)*t; - - // [1,14]: 17,19, - tf = CONSTANT(0.199471140196999990)*f[17]+CONSTANT(0.075393004386399995)*f[19]; - tg = CONSTANT(0.199471140196999990)*g[17]+CONSTANT(0.075393004386399995)*g[19]; - y[1] += tf*g[14]+tg*f[14]; - y[14] += tf*g[1]+tg*f[1]; - t = f[1]*g[14]+f[14]*g[1]; - y[17] = CONSTANT(0.199471140196999990)*t; - y[19] += CONSTANT(0.075393004386399995)*t; - - // [1,15]: 16,18, - tf = CONSTANT(0.230329432973999990)*f[16]+CONSTANT(0.043528171377799997)*f[18]; - tg = CONSTANT(0.230329432973999990)*g[16]+CONSTANT(0.043528171377799997)*g[18]; - y[1] += tf*g[15]+tg*f[15]; - y[15] += tf*g[1]+tg*f[1]; - t = f[1]*g[15]+f[15]*g[1]; - y[16] = CONSTANT(0.230329432973999990)*t; - y[18] += CONSTANT(0.043528171377799997)*t; - - // [2,2]: 0,6, - tf = CONSTANT(0.282094795249000000)*f[0]+CONSTANT(0.252313259986999990)*f[6]; - tg = CONSTANT(0.282094795249000000)*g[0]+CONSTANT(0.252313259986999990)*g[6]; - y[2] += tf*g[2]+tg*f[2]; - t = f[2]*g[2]; - y[0] += CONSTANT(0.282094795249000000)*t; - y[6] += CONSTANT(0.252313259986999990)*t; - - // [2,10]: 4,18, - tf = CONSTANT(0.184674390919999990)*f[4]+CONSTANT(0.213243618621000000)*f[18]; - tg = CONSTANT(0.184674390919999990)*g[4]+CONSTANT(0.213243618621000000)*g[18]; - y[2] += tf*g[10]+tg*f[10]; - y[10] += tf*g[2]+tg*f[2]; - t = f[2]*g[10]+f[10]*g[2]; - y[4] += CONSTANT(0.184674390919999990)*t; - y[18] += CONSTANT(0.213243618621000000)*t; - - // [2,12]: 6,20, - tf = 
CONSTANT(0.247766706973999990)*f[6]+CONSTANT(0.246232537174000010)*f[20]; - tg = CONSTANT(0.247766706973999990)*g[6]+CONSTANT(0.246232537174000010)*g[20]; - y[2] += tf*g[12]+tg*f[12]; - y[12] += tf*g[2]+tg*f[2]; - t = f[2]*g[12]+f[12]*g[2]; - y[6] += CONSTANT(0.247766706973999990)*t; - y[20] += CONSTANT(0.246232537174000010)*t; - - // [2,14]: 8,22, - tf = CONSTANT(0.184674390919999990)*f[8]+CONSTANT(0.213243618621000000)*f[22]; - tg = CONSTANT(0.184674390919999990)*g[8]+CONSTANT(0.213243618621000000)*g[22]; - y[2] += tf*g[14]+tg*f[14]; - y[14] += tf*g[2]+tg*f[2]; - t = f[2]*g[14]+f[14]*g[2]; - y[8] += CONSTANT(0.184674390919999990)*t; - y[22] += CONSTANT(0.213243618621000000)*t; - - // [3,3]: 0,6,8, - tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(0.218509686119999990)*f[8]; - tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(0.218509686119999990)*g[8]; - y[3] += tf*g[3]+tg*f[3]; - t = f[3]*g[3]; - y[0] += CONSTANT(0.282094791773000010)*t; - y[6] += CONSTANT(-0.126156626101000010)*t; - y[8] += CONSTANT(0.218509686119999990)*t; - - // [3,7]: 2,12,14, - tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12]+CONSTANT(0.184674390923000000)*f[14]; - tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12]+CONSTANT(0.184674390923000000)*g[14]; - y[3] += tf*g[7]+tg*f[7]; - y[7] += tf*g[3]+tg*f[3]; - t = f[3]*g[7]+f[7]*g[3]; - y[2] += CONSTANT(0.218509686118000010)*t; - y[12] += CONSTANT(-0.143048168103000000)*t; - y[14] += CONSTANT(0.184674390923000000)*t; - - // [3,9]: 4,16,18, - tf = CONSTANT(0.226179013157999990)*f[4]+CONSTANT(0.230329432973999990)*f[16]+CONSTANT(-0.043528171377799997)*f[18]; - tg = CONSTANT(0.226179013157999990)*g[4]+CONSTANT(0.230329432973999990)*g[16]+CONSTANT(-0.043528171377799997)*g[18]; - y[3] += tf*g[9]+tg*f[9]; - y[9] += tf*g[3]+tg*f[3]; - t = f[3]*g[9]+f[9]*g[3]; - y[4] += CONSTANT(0.226179013157999990)*t; - y[16] += CONSTANT(0.230329432973999990)*t; - y[18] += CONSTANT(-0.043528171377799997)*t; - - // [3,10]: 5,17,19, - tf = CONSTANT(0.184674390919999990)*f[5]+CONSTANT(0.199471140200000010)*f[17]+CONSTANT(-0.075393004386799994)*f[19]; - tg = CONSTANT(0.184674390919999990)*g[5]+CONSTANT(0.199471140200000010)*g[17]+CONSTANT(-0.075393004386799994)*g[19]; - y[3] += tf*g[10]+tg*f[10]; - y[10] += tf*g[3]+tg*f[3]; - t = f[3]*g[10]+f[10]*g[3]; - y[5] += CONSTANT(0.184674390919999990)*t; - y[17] += CONSTANT(0.199471140200000010)*t; - y[19] += CONSTANT(-0.075393004386799994)*t; - - // [3,12]: 21, - tf = CONSTANT(0.194663900273000010)*f[21]; - tg = CONSTANT(0.194663900273000010)*g[21]; - y[3] += tf*g[12]+tg*f[12]; - y[12] += tf*g[3]+tg*f[3]; - t = f[3]*g[12]+f[12]*g[3]; - y[21] += CONSTANT(0.194663900273000010)*t; - - // [3,13]: 8,6,20,22, - tf = CONSTANT(-0.058399170081799998)*f[8]+CONSTANT(0.202300659402999990)*f[6]+CONSTANT(-0.150786008773000000)*f[20]+CONSTANT(0.168583882836999990)*f[22]; - tg = CONSTANT(-0.058399170081799998)*g[8]+CONSTANT(0.202300659402999990)*g[6]+CONSTANT(-0.150786008773000000)*g[20]+CONSTANT(0.168583882836999990)*g[22]; - y[3] += tf*g[13]+tg*f[13]; - y[13] += tf*g[3]+tg*f[3]; - t = f[3]*g[13]+f[13]*g[3]; - y[8] += CONSTANT(-0.058399170081799998)*t; - y[6] += CONSTANT(0.202300659402999990)*t; - y[20] += CONSTANT(-0.150786008773000000)*t; - y[22] += CONSTANT(0.168583882836999990)*t; - - // [3,14]: 21,23, - tf = CONSTANT(-0.075393004386399995)*f[21]+CONSTANT(0.199471140196999990)*f[23]; - tg = 
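
Each [i,j] block in these routines is an unrolled instance of one pattern: a table of nonzero triple-product coefficients says which output bands k a pair of input bands (i,j) feeds, and tf/tg cache the weighted sums so the same products serve both the y[i] and y[j] updates. A table-driven reference loop makes the pattern explicit; this is a minimal sketch, with SHProductTerm and XMSHMultiplyRef as hypothetical names that are not part of this file:

#include <cstddef>

// One entry per nonzero coupling: bands i and j feed band k with weight c
// (the values baked into the CONSTANT(...) literals above).
struct SHProductTerm { int i, j, k; float c; };

float* XMSHMultiplyRef(float* y, const float* f, const float* g,
                       const SHProductTerm* terms, size_t termCount,
                       size_t coeffCount)
{
    if (!y || !f || !g)
        return nullptr;

    for (size_t n = 0; n < coeffCount; ++n)
        y[n] = 0.0f;

    for (size_t n = 0; n < termCount; ++n)
    {
        const SHProductTerm& p = terms[n];
        if (p.i == p.j)
            y[p.k] += p.c * f[p.i] * g[p.i];                     // diagonal pair: applied once
        else
            y[p.k] += p.c * (f[p.i] * g[p.j] + f[p.j] * g[p.i]); // symmetric pair
    }
    return y;
}

The generated routines trade this loop for straight-line code; the trailing "multiply count=..." comments record the resulting scalar-multiply totals.
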
CONSTANT(-0.075393004386399995)*g[21]+CONSTANT(0.199471140196999990)*g[23]; - y[3] += tf*g[14]+tg*f[14]; - y[14] += tf*g[3]+tg*f[3]; - t = f[3]*g[14]+f[14]*g[3]; - y[21] += CONSTANT(-0.075393004386399995)*t; - y[23] += CONSTANT(0.199471140196999990)*t; - - // [3,15]: 8,22,24, - tf = CONSTANT(0.226179013155000000)*f[8]+CONSTANT(-0.043528171378199997)*f[22]+CONSTANT(0.230329432978999990)*f[24]; - tg = CONSTANT(0.226179013155000000)*g[8]+CONSTANT(-0.043528171378199997)*g[22]+CONSTANT(0.230329432978999990)*g[24]; - y[3] += tf*g[15]+tg*f[15]; - y[15] += tf*g[3]+tg*f[3]; - t = f[3]*g[15]+f[15]*g[3]; - y[8] += CONSTANT(0.226179013155000000)*t; - y[22] += CONSTANT(-0.043528171378199997)*t; - y[24] += CONSTANT(0.230329432978999990)*t; - - // [4,4]: 0,6,20,24, - tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]+CONSTANT(0.040299255967500003)*f[20]+CONSTANT(-0.238413613505999990)*f[24]; - tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]+CONSTANT(0.040299255967500003)*g[20]+CONSTANT(-0.238413613505999990)*g[24]; - y[4] += tf*g[4]+tg*f[4]; - t = f[4]*g[4]; - y[0] += CONSTANT(0.282094791770000020)*t; - y[6] += CONSTANT(-0.180223751576000010)*t; - y[20] += CONSTANT(0.040299255967500003)*t; - y[24] += CONSTANT(-0.238413613505999990)*t; - - // [4,5]: 7,21,23, - tf = CONSTANT(0.156078347226000000)*f[7]+CONSTANT(-0.063718718434399996)*f[21]+CONSTANT(-0.168583882835000000)*f[23]; - tg = CONSTANT(0.156078347226000000)*g[7]+CONSTANT(-0.063718718434399996)*g[21]+CONSTANT(-0.168583882835000000)*g[23]; - y[4] += tf*g[5]+tg*f[5]; - y[5] += tf*g[4]+tg*f[4]; - t = f[4]*g[5]+f[5]*g[4]; - y[7] += CONSTANT(0.156078347226000000)*t; - y[21] += CONSTANT(-0.063718718434399996)*t; - y[23] += CONSTANT(-0.168583882835000000)*t; - - // [4,11]: 3,13,15, - tf = CONSTANT(-0.058399170082300000)*f[3]+CONSTANT(0.145673124078000010)*f[13]+CONSTANT(0.094031597258400004)*f[15]; - tg = CONSTANT(-0.058399170082300000)*g[3]+CONSTANT(0.145673124078000010)*g[13]+CONSTANT(0.094031597258400004)*g[15]; - y[4] += tf*g[11]+tg*f[11]; - y[11] += tf*g[4]+tg*f[4]; - t = f[4]*g[11]+f[11]*g[4]; - y[3] += CONSTANT(-0.058399170082300000)*t; - y[13] += CONSTANT(0.145673124078000010)*t; - y[15] += CONSTANT(0.094031597258400004)*t; - - // [4,16]: 8,22, - tf = CONSTANT(0.238413613494000000)*f[8]+CONSTANT(-0.075080816693699995)*f[22]; - tg = CONSTANT(0.238413613494000000)*g[8]+CONSTANT(-0.075080816693699995)*g[22]; - y[4] += tf*g[16]+tg*f[16]; - y[16] += tf*g[4]+tg*f[4]; - t = f[4]*g[16]+f[16]*g[4]; - y[8] += CONSTANT(0.238413613494000000)*t; - y[22] += CONSTANT(-0.075080816693699995)*t; - - // [4,18]: 6,20,24, - tf = CONSTANT(0.156078347226000000)*f[6]+CONSTANT(-0.190364615029000010)*f[20]+CONSTANT(0.075080816691500005)*f[24]; - tg = CONSTANT(0.156078347226000000)*g[6]+CONSTANT(-0.190364615029000010)*g[20]+CONSTANT(0.075080816691500005)*g[24]; - y[4] += tf*g[18]+tg*f[18]; - y[18] += tf*g[4]+tg*f[4]; - t = f[4]*g[18]+f[18]*g[4]; - y[6] += CONSTANT(0.156078347226000000)*t; - y[20] += CONSTANT(-0.190364615029000010)*t; - y[24] += CONSTANT(0.075080816691500005)*t; - - // [4,19]: 7,21,23, - tf = CONSTANT(-0.063718718434399996)*f[7]+CONSTANT(0.141889406569999990)*f[21]+CONSTANT(0.112621225039000000)*f[23]; - tg = CONSTANT(-0.063718718434399996)*g[7]+CONSTANT(0.141889406569999990)*g[21]+CONSTANT(0.112621225039000000)*g[23]; - y[4] += tf*g[19]+tg*f[19]; - y[19] += tf*g[4]+tg*f[4]; - t = f[4]*g[19]+f[19]*g[4]; - y[7] += CONSTANT(-0.063718718434399996)*t; - y[21] += CONSTANT(0.141889406569999990)*t; - y[23] += 
CONSTANT(0.112621225039000000)*t; - - // [5,5]: 0,6,8,20,22, - tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.090111875786499998)*f[6]+CONSTANT(-0.156078347227999990)*f[8]+CONSTANT(-0.161197023870999990)*f[20]+CONSTANT(-0.180223751574000000)*f[22]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.090111875786499998)*g[6]+CONSTANT(-0.156078347227999990)*g[8]+CONSTANT(-0.161197023870999990)*g[20]+CONSTANT(-0.180223751574000000)*g[22]; - y[5] += tf*g[5]+tg*f[5]; - t = f[5]*g[5]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[6] += CONSTANT(0.090111875786499998)*t; - y[8] += CONSTANT(-0.156078347227999990)*t; - y[20] += CONSTANT(-0.161197023870999990)*t; - y[22] += CONSTANT(-0.180223751574000000)*t; - - // [5,11]: 2,12,14, - tf = CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.059470803871800003)*f[12]+CONSTANT(-0.115164716491000000)*f[14]; - tg = CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.059470803871800003)*g[12]+CONSTANT(-0.115164716491000000)*g[14]; - y[5] += tf*g[11]+tg*f[11]; - y[11] += tf*g[5]+tg*f[5]; - t = f[5]*g[11]+f[11]*g[5]; - y[2] += CONSTANT(0.233596680327000010)*t; - y[12] += CONSTANT(0.059470803871800003)*t; - y[14] += CONSTANT(-0.115164716491000000)*t; - - // [5,17]: 8,22,24, - tf = CONSTANT(0.168583882832999990)*f[8]+CONSTANT(0.132725386548000010)*f[22]+CONSTANT(-0.140463346189000000)*f[24]; - tg = CONSTANT(0.168583882832999990)*g[8]+CONSTANT(0.132725386548000010)*g[22]+CONSTANT(-0.140463346189000000)*g[24]; - y[5] += tf*g[17]+tg*f[17]; - y[17] += tf*g[5]+tg*f[5]; - t = f[5]*g[17]+f[17]*g[5]; - y[8] += CONSTANT(0.168583882832999990)*t; - y[22] += CONSTANT(0.132725386548000010)*t; - y[24] += CONSTANT(-0.140463346189000000)*t; - - // [5,18]: 7,21,23, - tf = CONSTANT(0.180223751571000010)*f[7]+CONSTANT(0.090297865407399994)*f[21]+CONSTANT(-0.132725386549000010)*f[23]; - tg = CONSTANT(0.180223751571000010)*g[7]+CONSTANT(0.090297865407399994)*g[21]+CONSTANT(-0.132725386549000010)*g[23]; - y[5] += tf*g[18]+tg*f[18]; - y[18] += tf*g[5]+tg*f[5]; - t = f[5]*g[18]+f[18]*g[5]; - y[7] += CONSTANT(0.180223751571000010)*t; - y[21] += CONSTANT(0.090297865407399994)*t; - y[23] += CONSTANT(-0.132725386549000010)*t; - - // [5,19]: 6,8,20,22, - tf = CONSTANT(0.220728115440999990)*f[6]+CONSTANT(0.063718718433900007)*f[8]+CONSTANT(0.044869370061299998)*f[20]+CONSTANT(-0.090297865408399999)*f[22]; - tg = CONSTANT(0.220728115440999990)*g[6]+CONSTANT(0.063718718433900007)*g[8]+CONSTANT(0.044869370061299998)*g[20]+CONSTANT(-0.090297865408399999)*g[22]; - y[5] += tf*g[19]+tg*f[19]; - y[19] += tf*g[5]+tg*f[5]; - t = f[5]*g[19]+f[19]*g[5]; - y[6] += CONSTANT(0.220728115440999990)*t; - y[8] += CONSTANT(0.063718718433900007)*t; - y[20] += CONSTANT(0.044869370061299998)*t; - y[22] += CONSTANT(-0.090297865408399999)*t; - - // [6,6]: 0,6,20, - tf = CONSTANT(0.282094797560000000)*f[0]+CONSTANT(0.241795553185999990)*f[20]; - tg = CONSTANT(0.282094797560000000)*g[0]+CONSTANT(0.241795553185999990)*g[20]; - y[6] += tf*g[6]+tg*f[6]; - t = f[6]*g[6]; - y[0] += CONSTANT(0.282094797560000000)*t; - y[6] += CONSTANT(0.180223764527000010)*t; - y[20] += CONSTANT(0.241795553185999990)*t; - - // [7,7]: 6,0,8,20,22, - tf = CONSTANT(0.090111875786499998)*f[6]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.156078347227999990)*f[8]+CONSTANT(-0.161197023870999990)*f[20]+CONSTANT(0.180223751574000000)*f[22]; - tg = CONSTANT(0.090111875786499998)*g[6]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.156078347227999990)*g[8]+CONSTANT(-0.161197023870999990)*g[20]+CONSTANT(0.180223751574000000)*g[22]; - y[7] += 
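
The diagonal blocks keep the self-coupling out of tf and tg deliberately. In [6,6] above, y[6] += tf*g[6]+tg*f[6] covers the (0,6) and (20,6) pairs; if the (6,6) weight were folded into tf as well, that update would count the f[6]*g[6] product twice:

// If CONSTANT(0.180223764527000010) were part of tf, then
//     y[6] += tf*g[6]+tg*f[6];
// would accumulate 2*0.180223764527*f[6]*g[6]. The generator therefore
// emits the self-term separately, exactly once:
//     t = f[6]*g[6];
//     y[6] += CONSTANT(0.180223764527000010)*t;
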
tf*g[7]+tg*f[7]; - t = f[7]*g[7]; - y[6] += CONSTANT(0.090111875786499998)*t; - y[0] += CONSTANT(0.282094791773999990)*t; - y[8] += CONSTANT(0.156078347227999990)*t; - y[20] += CONSTANT(-0.161197023870999990)*t; - y[22] += CONSTANT(0.180223751574000000)*t; - - // [7,13]: 12,2,14, - tf = CONSTANT(0.059470803871800003)*f[12]+CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.115164716491000000)*f[14]; - tg = CONSTANT(0.059470803871800003)*g[12]+CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.115164716491000000)*g[14]; - y[7] += tf*g[13]+tg*f[13]; - y[13] += tf*g[7]+tg*f[7]; - t = f[7]*g[13]+f[13]*g[7]; - y[12] += CONSTANT(0.059470803871800003)*t; - y[2] += CONSTANT(0.233596680327000010)*t; - y[14] += CONSTANT(0.115164716491000000)*t; - - // [7,17]: 16,4,18, - tf = CONSTANT(0.140463346187999990)*f[16]+CONSTANT(0.168583882835000000)*f[4]+CONSTANT(0.132725386549000010)*f[18]; - tg = CONSTANT(0.140463346187999990)*g[16]+CONSTANT(0.168583882835000000)*g[4]+CONSTANT(0.132725386549000010)*g[18]; - y[7] += tf*g[17]+tg*f[17]; - y[17] += tf*g[7]+tg*f[7]; - t = f[7]*g[17]+f[17]*g[7]; - y[16] += CONSTANT(0.140463346187999990)*t; - y[4] += CONSTANT(0.168583882835000000)*t; - y[18] += CONSTANT(0.132725386549000010)*t; - - // [7,21]: 8,20,6,22, - tf = CONSTANT(-0.063718718433900007)*f[8]+CONSTANT(0.044869370061299998)*f[20]+CONSTANT(0.220728115440999990)*f[6]+CONSTANT(0.090297865408399999)*f[22]; - tg = CONSTANT(-0.063718718433900007)*g[8]+CONSTANT(0.044869370061299998)*g[20]+CONSTANT(0.220728115440999990)*g[6]+CONSTANT(0.090297865408399999)*g[22]; - y[7] += tf*g[21]+tg*f[21]; - y[21] += tf*g[7]+tg*f[7]; - t = f[7]*g[21]+f[21]*g[7]; - y[8] += CONSTANT(-0.063718718433900007)*t; - y[20] += CONSTANT(0.044869370061299998)*t; - y[6] += CONSTANT(0.220728115440999990)*t; - y[22] += CONSTANT(0.090297865408399999)*t; - - // [7,23]: 8,22,24, - tf = CONSTANT(0.168583882832999990)*f[8]+CONSTANT(0.132725386548000010)*f[22]+CONSTANT(0.140463346189000000)*f[24]; - tg = CONSTANT(0.168583882832999990)*g[8]+CONSTANT(0.132725386548000010)*g[22]+CONSTANT(0.140463346189000000)*g[24]; - y[7] += tf*g[23]+tg*f[23]; - y[23] += tf*g[7]+tg*f[7]; - t = f[7]*g[23]+f[23]*g[7]; - y[8] += CONSTANT(0.168583882832999990)*t; - y[22] += CONSTANT(0.132725386548000010)*t; - y[24] += CONSTANT(0.140463346189000000)*t; - - // [8,8]: 0,6,20,24, - tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]+CONSTANT(0.040299255967500003)*f[20]+CONSTANT(0.238413613505999990)*f[24]; - tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]+CONSTANT(0.040299255967500003)*g[20]+CONSTANT(0.238413613505999990)*g[24]; - y[8] += tf*g[8]+tg*f[8]; - t = f[8]*g[8]; - y[0] += CONSTANT(0.282094791770000020)*t; - y[6] += CONSTANT(-0.180223751576000010)*t; - y[20] += CONSTANT(0.040299255967500003)*t; - y[24] += CONSTANT(0.238413613505999990)*t; - - // [8,22]: 6,20,24, - tf = CONSTANT(0.156078347226000000)*f[6]+CONSTANT(-0.190364615029000010)*f[20]+CONSTANT(-0.075080816691500005)*f[24]; - tg = CONSTANT(0.156078347226000000)*g[6]+CONSTANT(-0.190364615029000010)*g[20]+CONSTANT(-0.075080816691500005)*g[24]; - y[8] += tf*g[22]+tg*f[22]; - y[22] += tf*g[8]+tg*f[8]; - t = f[8]*g[22]+f[22]*g[8]; - y[6] += CONSTANT(0.156078347226000000)*t; - y[20] += CONSTANT(-0.190364615029000010)*t; - y[24] += CONSTANT(-0.075080816691500005)*t; - - // [9,9]: 6,0,20, - tf = CONSTANT(-0.210261043508000010)*f[6]+CONSTANT(0.282094791766999970)*f[0]+CONSTANT(0.076934943209800002)*f[20]; - tg = 
CONSTANT(-0.210261043508000010)*g[6]+CONSTANT(0.282094791766999970)*g[0]+CONSTANT(0.076934943209800002)*g[20]; - y[9] += tf*g[9]+tg*f[9]; - t = f[9]*g[9]; - y[6] += CONSTANT(-0.210261043508000010)*t; - y[0] += CONSTANT(0.282094791766999970)*t; - y[20] += CONSTANT(0.076934943209800002)*t; - - // [9,10]: 7,21, - tf = CONSTANT(0.148677009678999990)*f[7]+CONSTANT(-0.099322584599600000)*f[21]; - tg = CONSTANT(0.148677009678999990)*g[7]+CONSTANT(-0.099322584599600000)*g[21]; - y[9] += tf*g[10]+tg*f[10]; - y[10] += tf*g[9]+tg*f[9]; - t = f[9]*g[10]+f[10]*g[9]; - y[7] += CONSTANT(0.148677009678999990)*t; - y[21] += CONSTANT(-0.099322584599600000)*t; - - // [9,11]: 8,22,24, - tf = CONSTANT(-0.094031597259499999)*f[8]+CONSTANT(0.133255230518000010)*f[22]+CONSTANT(0.117520066950999990)*f[24]; - tg = CONSTANT(-0.094031597259499999)*g[8]+CONSTANT(0.133255230518000010)*g[22]+CONSTANT(0.117520066950999990)*g[24]; - y[9] += tf*g[11]+tg*f[11]; - y[11] += tf*g[9]+tg*f[9]; - t = f[9]*g[11]+f[11]*g[9]; - y[8] += CONSTANT(-0.094031597259499999)*t; - y[22] += CONSTANT(0.133255230518000010)*t; - y[24] += CONSTANT(0.117520066950999990)*t; - - // [9,13]: 4,16,18, - tf = CONSTANT(-0.094031597258400004)*f[4]+CONSTANT(-0.117520066953000000)*f[16]+CONSTANT(0.133255230519000010)*f[18]; - tg = CONSTANT(-0.094031597258400004)*g[4]+CONSTANT(-0.117520066953000000)*g[16]+CONSTANT(0.133255230519000010)*g[18]; - y[9] += tf*g[13]+tg*f[13]; - y[13] += tf*g[9]+tg*f[9]; - t = f[9]*g[13]+f[13]*g[9]; - y[4] += CONSTANT(-0.094031597258400004)*t; - y[16] += CONSTANT(-0.117520066953000000)*t; - y[18] += CONSTANT(0.133255230519000010)*t; - - // [9,14]: 5,19, - tf = CONSTANT(0.148677009677999990)*f[5]+CONSTANT(-0.099322584600699995)*f[19]; - tg = CONSTANT(0.148677009677999990)*g[5]+CONSTANT(-0.099322584600699995)*g[19]; - y[9] += tf*g[14]+tg*f[14]; - y[14] += tf*g[9]+tg*f[9]; - t = f[9]*g[14]+f[14]*g[9]; - y[5] += CONSTANT(0.148677009677999990)*t; - y[19] += CONSTANT(-0.099322584600699995)*t; - - // [9,17]: 2,12, - tf = CONSTANT(0.162867503964999990)*f[2]+CONSTANT(-0.203550726872999990)*f[12]; - tg = CONSTANT(0.162867503964999990)*g[2]+CONSTANT(-0.203550726872999990)*g[12]; - y[9] += tf*g[17]+tg*f[17]; - y[17] += tf*g[9]+tg*f[9]; - t = f[9]*g[17]+f[17]*g[9]; - y[2] += CONSTANT(0.162867503964999990)*t; - y[12] += CONSTANT(-0.203550726872999990)*t; - - // [10,10]: 0,20,24, - tf = CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.179514867494000000)*f[20]+CONSTANT(-0.151717754049000010)*f[24]; - tg = CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.179514867494000000)*g[20]+CONSTANT(-0.151717754049000010)*g[24]; - y[10] += tf*g[10]+tg*f[10]; - t = f[10]*g[10]; - y[0] += CONSTANT(0.282094791771999980)*t; - y[20] += CONSTANT(-0.179514867494000000)*t; - y[24] += CONSTANT(-0.151717754049000010)*t; - - // [10,11]: 7,21,23, - tf = CONSTANT(0.115164716490000000)*f[7]+CONSTANT(0.102579924281000000)*f[21]+CONSTANT(-0.067850242288900006)*f[23]; - tg = CONSTANT(0.115164716490000000)*g[7]+CONSTANT(0.102579924281000000)*g[21]+CONSTANT(-0.067850242288900006)*g[23]; - y[10] += tf*g[11]+tg*f[11]; - y[11] += tf*g[10]+tg*f[10]; - t = f[10]*g[11]+f[11]*g[10]; - y[7] += CONSTANT(0.115164716490000000)*t; - y[21] += CONSTANT(0.102579924281000000)*t; - y[23] += CONSTANT(-0.067850242288900006)*t; - - // [10,12]: 4,18, - tf = CONSTANT(-0.188063194517999990)*f[4]+CONSTANT(-0.044418410173299998)*f[18]; - tg = CONSTANT(-0.188063194517999990)*g[4]+CONSTANT(-0.044418410173299998)*g[18]; - y[10] += tf*g[12]+tg*f[12]; - y[12] += tf*g[10]+tg*f[10]; - t = 
f[10]*g[12]+f[12]*g[10]; - y[4] += CONSTANT(-0.188063194517999990)*t; - y[18] += CONSTANT(-0.044418410173299998)*t; - - // [10,13]: 5,17,19, - tf = CONSTANT(0.115164716490000000)*f[5]+CONSTANT(0.067850242288900006)*f[17]+CONSTANT(0.102579924281000000)*f[19]; - tg = CONSTANT(0.115164716490000000)*g[5]+CONSTANT(0.067850242288900006)*g[17]+CONSTANT(0.102579924281000000)*g[19]; - y[10] += tf*g[13]+tg*f[13]; - y[13] += tf*g[10]+tg*f[10]; - t = f[10]*g[13]+f[13]*g[10]; - y[5] += CONSTANT(0.115164716490000000)*t; - y[17] += CONSTANT(0.067850242288900006)*t; - y[19] += CONSTANT(0.102579924281000000)*t; - - // [10,14]: 16, - tf = CONSTANT(0.151717754044999990)*f[16]; - tg = CONSTANT(0.151717754044999990)*g[16]; - y[10] += tf*g[14]+tg*f[14]; - y[14] += tf*g[10]+tg*f[10]; - t = f[10]*g[14]+f[14]*g[10]; - y[16] += CONSTANT(0.151717754044999990)*t; - - // [10,15]: 5,19, - tf = CONSTANT(-0.148677009678999990)*f[5]+CONSTANT(0.099322584599600000)*f[19]; - tg = CONSTANT(-0.148677009678999990)*g[5]+CONSTANT(0.099322584599600000)*g[19]; - y[10] += tf*g[15]+tg*f[15]; - y[15] += tf*g[10]+tg*f[10]; - t = f[10]*g[15]+f[15]*g[10]; - y[5] += CONSTANT(-0.148677009678999990)*t; - y[19] += CONSTANT(0.099322584599600000)*t; - - // [11,11]: 0,6,8,20,22, - tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.126156626101000010)*f[6]+CONSTANT(-0.145673124078999990)*f[8]+CONSTANT(0.025644981070299999)*f[20]+CONSTANT(-0.114687841910000000)*f[22]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.126156626101000010)*g[6]+CONSTANT(-0.145673124078999990)*g[8]+CONSTANT(0.025644981070299999)*g[20]+CONSTANT(-0.114687841910000000)*g[22]; - y[11] += tf*g[11]+tg*f[11]; - t = f[11]*g[11]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[6] += CONSTANT(0.126156626101000010)*t; - y[8] += CONSTANT(-0.145673124078999990)*t; - y[20] += CONSTANT(0.025644981070299999)*t; - y[22] += CONSTANT(-0.114687841910000000)*t; - - // [11,14]: 17, - tf = CONSTANT(0.067850242288500007)*f[17]; - tg = CONSTANT(0.067850242288500007)*g[17]; - y[11] += tf*g[14]+tg*f[14]; - y[14] += tf*g[11]+tg*f[11]; - t = f[11]*g[14]+f[14]*g[11]; - y[17] += CONSTANT(0.067850242288500007)*t; - - // [11,15]: 16, - tf = CONSTANT(-0.117520066953000000)*f[16]; - tg = CONSTANT(-0.117520066953000000)*g[16]; - y[11] += tf*g[15]+tg*f[15]; - y[15] += tf*g[11]+tg*f[11]; - t = f[11]*g[15]+f[15]*g[11]; - y[16] += CONSTANT(-0.117520066953000000)*t; - - // [11,18]: 3,13,15, - tf = CONSTANT(0.168583882834000000)*f[3]+CONSTANT(0.114687841909000000)*f[13]+CONSTANT(-0.133255230519000010)*f[15]; - tg = CONSTANT(0.168583882834000000)*g[3]+CONSTANT(0.114687841909000000)*g[13]+CONSTANT(-0.133255230519000010)*g[15]; - y[11] += tf*g[18]+tg*f[18]; - y[18] += tf*g[11]+tg*f[11]; - t = f[11]*g[18]+f[18]*g[11]; - y[3] += CONSTANT(0.168583882834000000)*t; - y[13] += CONSTANT(0.114687841909000000)*t; - y[15] += CONSTANT(-0.133255230519000010)*t; - - // [11,19]: 2,14,12, - tf = CONSTANT(0.238413613504000000)*f[2]+CONSTANT(-0.102579924282000000)*f[14]+CONSTANT(0.099322584599300004)*f[12]; - tg = CONSTANT(0.238413613504000000)*g[2]+CONSTANT(-0.102579924282000000)*g[14]+CONSTANT(0.099322584599300004)*g[12]; - y[11] += tf*g[19]+tg*f[19]; - y[19] += tf*g[11]+tg*f[11]; - t = f[11]*g[19]+f[19]*g[11]; - y[2] += CONSTANT(0.238413613504000000)*t; - y[14] += CONSTANT(-0.102579924282000000)*t; - y[12] += CONSTANT(0.099322584599300004)*t; - - // [12,12]: 0,6,20, - tf = CONSTANT(0.282094799871999980)*f[0]+CONSTANT(0.168208852954000010)*f[6]+CONSTANT(0.153869910786000010)*f[20]; - tg = 
CONSTANT(0.282094799871999980)*g[0]+CONSTANT(0.168208852954000010)*g[6]+CONSTANT(0.153869910786000010)*g[20]; - y[12] += tf*g[12]+tg*f[12]; - t = f[12]*g[12]; - y[0] += CONSTANT(0.282094799871999980)*t; - y[6] += CONSTANT(0.168208852954000010)*t; - y[20] += CONSTANT(0.153869910786000010)*t; - - // [12,14]: 8,22, - tf = CONSTANT(-0.188063194517999990)*f[8]+CONSTANT(-0.044418410173299998)*f[22]; - tg = CONSTANT(-0.188063194517999990)*g[8]+CONSTANT(-0.044418410173299998)*g[22]; - y[12] += tf*g[14]+tg*f[14]; - y[14] += tf*g[12]+tg*f[12]; - t = f[12]*g[14]+f[14]*g[12]; - y[8] += CONSTANT(-0.188063194517999990)*t; - y[22] += CONSTANT(-0.044418410173299998)*t; - - // [13,13]: 0,8,6,20,22, - tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.145673124078999990)*f[8]+CONSTANT(0.126156626101000010)*f[6]+CONSTANT(0.025644981070299999)*f[20]+CONSTANT(0.114687841910000000)*f[22]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.145673124078999990)*g[8]+CONSTANT(0.126156626101000010)*g[6]+CONSTANT(0.025644981070299999)*g[20]+CONSTANT(0.114687841910000000)*g[22]; - y[13] += tf*g[13]+tg*f[13]; - t = f[13]*g[13]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[8] += CONSTANT(0.145673124078999990)*t; - y[6] += CONSTANT(0.126156626101000010)*t; - y[20] += CONSTANT(0.025644981070299999)*t; - y[22] += CONSTANT(0.114687841910000000)*t; - - // [13,14]: 23, - tf = CONSTANT(0.067850242288500007)*f[23]; - tg = CONSTANT(0.067850242288500007)*g[23]; - y[13] += tf*g[14]+tg*f[14]; - y[14] += tf*g[13]+tg*f[13]; - t = f[13]*g[14]+f[14]*g[13]; - y[23] += CONSTANT(0.067850242288500007)*t; - - // [13,15]: 8,22,24, - tf = CONSTANT(-0.094031597259499999)*f[8]+CONSTANT(0.133255230518000010)*f[22]+CONSTANT(-0.117520066950999990)*f[24]; - tg = CONSTANT(-0.094031597259499999)*g[8]+CONSTANT(0.133255230518000010)*g[22]+CONSTANT(-0.117520066950999990)*g[24]; - y[13] += tf*g[15]+tg*f[15]; - y[15] += tf*g[13]+tg*f[13]; - t = f[13]*g[15]+f[15]*g[13]; - y[8] += CONSTANT(-0.094031597259499999)*t; - y[22] += CONSTANT(0.133255230518000010)*t; - y[24] += CONSTANT(-0.117520066950999990)*t; - - // [13,21]: 2,12,14, - tf = CONSTANT(0.238413613504000000)*f[2]+CONSTANT(0.099322584599300004)*f[12]+CONSTANT(0.102579924282000000)*f[14]; - tg = CONSTANT(0.238413613504000000)*g[2]+CONSTANT(0.099322584599300004)*g[12]+CONSTANT(0.102579924282000000)*g[14]; - y[13] += tf*g[21]+tg*f[21]; - y[21] += tf*g[13]+tg*f[13]; - t = f[13]*g[21]+f[21]*g[13]; - y[2] += CONSTANT(0.238413613504000000)*t; - y[12] += CONSTANT(0.099322584599300004)*t; - y[14] += CONSTANT(0.102579924282000000)*t; - - // [14,14]: 0,20,24, - tf = CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.179514867494000000)*f[20]+CONSTANT(0.151717754049000010)*f[24]; - tg = CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.179514867494000000)*g[20]+CONSTANT(0.151717754049000010)*g[24]; - y[14] += tf*g[14]+tg*f[14]; - t = f[14]*g[14]; - y[0] += CONSTANT(0.282094791771999980)*t; - y[20] += CONSTANT(-0.179514867494000000)*t; - y[24] += CONSTANT(0.151717754049000010)*t; - - // [14,15]: 7,21, - tf = CONSTANT(0.148677009677999990)*f[7]+CONSTANT(-0.099322584600699995)*f[21]; - tg = CONSTANT(0.148677009677999990)*g[7]+CONSTANT(-0.099322584600699995)*g[21]; - y[14] += tf*g[15]+tg*f[15]; - y[15] += tf*g[14]+tg*f[14]; - t = f[14]*g[15]+f[15]*g[14]; - y[7] += CONSTANT(0.148677009677999990)*t; - y[21] += CONSTANT(-0.099322584600699995)*t; - - // [15,15]: 0,6,20, - tf = CONSTANT(0.282094791766999970)*f[0]+CONSTANT(-0.210261043508000010)*f[6]+CONSTANT(0.076934943209800002)*f[20]; - tg = 
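
All of these literals are values of one quantity: the integral of three real SH basis functions over the sphere. With f(s) = \sum_i f_i Y_i(s) and g(s) = \sum_j g_j Y_j(s), each routine computes the SH projection of the pointwise product f*g, truncated to the order of its inputs:

    y_k = \sum_{i,j} C_{ijk}\, f_i\, g_j, \qquad C_{ijk} = \int_{S^2} Y_i(s)\, Y_j(s)\, Y_k(s)\, d\Omega

C_{ijk} is fully symmetric in its indices, which is why one tf/tg pair per block can serve both the y[i] and y[j] updates.
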
CONSTANT(0.282094791766999970)*g[0]+CONSTANT(-0.210261043508000010)*g[6]+CONSTANT(0.076934943209800002)*g[20]; - y[15] += tf*g[15]+tg*f[15]; - t = f[15]*g[15]; - y[0] += CONSTANT(0.282094791766999970)*t; - y[6] += CONSTANT(-0.210261043508000010)*t; - y[20] += CONSTANT(0.076934943209800002)*t; - - // [15,23]: 12,2, - tf = CONSTANT(-0.203550726872999990)*f[12]+CONSTANT(0.162867503964999990)*f[2]; - tg = CONSTANT(-0.203550726872999990)*g[12]+CONSTANT(0.162867503964999990)*g[2]; - y[15] += tf*g[23]+tg*f[23]; - y[23] += tf*g[15]+tg*f[15]; - t = f[15]*g[23]+f[23]*g[15]; - y[12] += CONSTANT(-0.203550726872999990)*t; - y[2] += CONSTANT(0.162867503964999990)*t; - - // [16,16]: 0,6,20, - tf = CONSTANT(0.282094791763999990)*f[0]+CONSTANT(-0.229375683829000000)*f[6]+CONSTANT(0.106525305981000000)*f[20]; - tg = CONSTANT(0.282094791763999990)*g[0]+CONSTANT(-0.229375683829000000)*g[6]+CONSTANT(0.106525305981000000)*g[20]; - y[16] += tf*g[16]+tg*f[16]; - t = f[16]*g[16]; - y[0] += CONSTANT(0.282094791763999990)*t; - y[6] += CONSTANT(-0.229375683829000000)*t; - y[20] += CONSTANT(0.106525305981000000)*t; - - // [16,18]: 8,22, - tf = CONSTANT(-0.075080816693699995)*f[8]+CONSTANT(0.135045473380000000)*f[22]; - tg = CONSTANT(-0.075080816693699995)*g[8]+CONSTANT(0.135045473380000000)*g[22]; - y[16] += tf*g[18]+tg*f[18]; - y[18] += tf*g[16]+tg*f[16]; - t = f[16]*g[18]+f[18]*g[16]; - y[8] += CONSTANT(-0.075080816693699995)*t; - y[22] += CONSTANT(0.135045473380000000)*t; - - // [16,23]: 19,5, - tf = CONSTANT(-0.119098912754999990)*f[19]+CONSTANT(0.140463346187999990)*f[5]; - tg = CONSTANT(-0.119098912754999990)*g[19]+CONSTANT(0.140463346187999990)*g[5]; - y[16] += tf*g[23]+tg*f[23]; - y[23] += tf*g[16]+tg*f[16]; - t = f[16]*g[23]+f[23]*g[16]; - y[19] += CONSTANT(-0.119098912754999990)*t; - y[5] += CONSTANT(0.140463346187999990)*t; - - // [17,17]: 0,6,20, - tf = CONSTANT(0.282094791768999990)*f[0]+CONSTANT(-0.057343920955899998)*f[6]+CONSTANT(-0.159787958979000000)*f[20]; - tg = CONSTANT(0.282094791768999990)*g[0]+CONSTANT(-0.057343920955899998)*g[6]+CONSTANT(-0.159787958979000000)*g[20]; - y[17] += tf*g[17]+tg*f[17]; - t = f[17]*g[17]; - y[0] += CONSTANT(0.282094791768999990)*t; - y[6] += CONSTANT(-0.057343920955899998)*t; - y[20] += CONSTANT(-0.159787958979000000)*t; - - // [17,19]: 8,22,24, - tf = CONSTANT(-0.112621225039000000)*f[8]+CONSTANT(0.045015157794100001)*f[22]+CONSTANT(0.119098912753000000)*f[24]; - tg = CONSTANT(-0.112621225039000000)*g[8]+CONSTANT(0.045015157794100001)*g[22]+CONSTANT(0.119098912753000000)*g[24]; - y[17] += tf*g[19]+tg*f[19]; - y[19] += tf*g[17]+tg*f[17]; - t = f[17]*g[19]+f[19]*g[17]; - y[8] += CONSTANT(-0.112621225039000000)*t; - y[22] += CONSTANT(0.045015157794100001)*t; - y[24] += CONSTANT(0.119098912753000000)*t; - - // [17,21]: 16,4,18, - tf = CONSTANT(-0.119098912754999990)*f[16]+CONSTANT(-0.112621225039000000)*f[4]+CONSTANT(0.045015157794399997)*f[18]; - tg = CONSTANT(-0.119098912754999990)*g[16]+CONSTANT(-0.112621225039000000)*g[4]+CONSTANT(0.045015157794399997)*g[18]; - y[17] += tf*g[21]+tg*f[21]; - y[21] += tf*g[17]+tg*f[17]; - t = f[17]*g[21]+f[21]*g[17]; - y[16] += CONSTANT(-0.119098912754999990)*t; - y[4] += CONSTANT(-0.112621225039000000)*t; - y[18] += CONSTANT(0.045015157794399997)*t; - - // [18,18]: 6,0,20,24, - tf = CONSTANT(0.065535909662600006)*f[6]+CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.083698454702400005)*f[20]+CONSTANT(-0.135045473384000000)*f[24]; - tg = 
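
A useful spot check of the tables follows from C_{0jk} = \delta_{jk}/(2\sqrt{\pi}): multiplying by the SH projection of the constant function 1 (coefficient 2\sqrt{\pi} in band 0, zero elsewhere) must return the other operand unchanged, up to the precision of the rounded constants. A sketch, assuming the routines are declared in namespace DirectX and using arbitrary test data:

#include <cmath>
#include <cstdio>

void TestSHProductIdentity()
{
    float one[25] = {};                                // SH projection of 1
    one[0] = 2.0f * std::sqrt(3.14159265358979f);

    float g[25], y[25];
    for (int i = 0; i < 25; ++i)
        g[i] = 0.1f * float(i + 1);                    // arbitrary coefficients

    DirectX::XMSHMultiply5(y, one, g);                 // expect y ~= g

    for (int i = 0; i < 25; ++i)
        std::printf("%2d: %f (expected %f)\n", i, y[i], g[i]);
}
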
CONSTANT(0.065535909662600006)*g[6]+CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.083698454702400005)*g[20]+CONSTANT(-0.135045473384000000)*g[24]; - y[18] += tf*g[18]+tg*f[18]; - t = f[18]*g[18]; - y[6] += CONSTANT(0.065535909662600006)*t; - y[0] += CONSTANT(0.282094791771999980)*t; - y[20] += CONSTANT(-0.083698454702400005)*t; - y[24] += CONSTANT(-0.135045473384000000)*t; - - // [18,19]: 7,21,23, - tf = CONSTANT(0.090297865407399994)*f[7]+CONSTANT(0.102084782359000000)*f[21]+CONSTANT(-0.045015157794399997)*f[23]; - tg = CONSTANT(0.090297865407399994)*g[7]+CONSTANT(0.102084782359000000)*g[21]+CONSTANT(-0.045015157794399997)*g[23]; - y[18] += tf*g[19]+tg*f[19]; - y[19] += tf*g[18]+tg*f[18]; - t = f[18]*g[19]+f[19]*g[18]; - y[7] += CONSTANT(0.090297865407399994)*t; - y[21] += CONSTANT(0.102084782359000000)*t; - y[23] += CONSTANT(-0.045015157794399997)*t; - - // [19,19]: 6,8,0,20,22, - tf = CONSTANT(0.139263808033999990)*f[6]+CONSTANT(-0.141889406570999990)*f[8]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.068480553847200004)*f[20]+CONSTANT(-0.102084782360000000)*f[22]; - tg = CONSTANT(0.139263808033999990)*g[6]+CONSTANT(-0.141889406570999990)*g[8]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.068480553847200004)*g[20]+CONSTANT(-0.102084782360000000)*g[22]; - y[19] += tf*g[19]+tg*f[19]; - t = f[19]*g[19]; - y[6] += CONSTANT(0.139263808033999990)*t; - y[8] += CONSTANT(-0.141889406570999990)*t; - y[0] += CONSTANT(0.282094791773999990)*t; - y[20] += CONSTANT(0.068480553847200004)*t; - y[22] += CONSTANT(-0.102084782360000000)*t; - - // [20,20]: 6,0,20, - tf = CONSTANT(0.163839797503000010)*f[6]+CONSTANT(0.282094802232000010)*f[0]; - tg = CONSTANT(0.163839797503000010)*g[6]+CONSTANT(0.282094802232000010)*g[0]; - y[20] += tf*g[20]+tg*f[20]; - t = f[20]*g[20]; - y[6] += CONSTANT(0.163839797503000010)*t; - y[0] += CONSTANT(0.282094802232000010)*t; - y[20] += CONSTANT(0.136961139005999990)*t; - - // [21,21]: 6,20,0,8,22, - tf = CONSTANT(0.139263808033999990)*f[6]+CONSTANT(0.068480553847200004)*f[20]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.141889406570999990)*f[8]+CONSTANT(0.102084782360000000)*f[22]; - tg = CONSTANT(0.139263808033999990)*g[6]+CONSTANT(0.068480553847200004)*g[20]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.141889406570999990)*g[8]+CONSTANT(0.102084782360000000)*g[22]; - y[21] += tf*g[21]+tg*f[21]; - t = f[21]*g[21]; - y[6] += CONSTANT(0.139263808033999990)*t; - y[20] += CONSTANT(0.068480553847200004)*t; - y[0] += CONSTANT(0.282094791773999990)*t; - y[8] += CONSTANT(0.141889406570999990)*t; - y[22] += CONSTANT(0.102084782360000000)*t; - - // [21,23]: 8,22,24, - tf = CONSTANT(-0.112621225039000000)*f[8]+CONSTANT(0.045015157794100001)*f[22]+CONSTANT(-0.119098912753000000)*f[24]; - tg = CONSTANT(-0.112621225039000000)*g[8]+CONSTANT(0.045015157794100001)*g[22]+CONSTANT(-0.119098912753000000)*g[24]; - y[21] += tf*g[23]+tg*f[23]; - y[23] += tf*g[21]+tg*f[21]; - t = f[21]*g[23]+f[23]*g[21]; - y[8] += CONSTANT(-0.112621225039000000)*t; - y[22] += CONSTANT(0.045015157794100001)*t; - y[24] += CONSTANT(-0.119098912753000000)*t; - - // [22,22]: 6,20,0,24, - tf = CONSTANT(0.065535909662600006)*f[6]+CONSTANT(-0.083698454702400005)*f[20]+CONSTANT(0.282094791771999980)*f[0]+CONSTANT(0.135045473384000000)*f[24]; - tg = CONSTANT(0.065535909662600006)*g[6]+CONSTANT(-0.083698454702400005)*g[20]+CONSTANT(0.282094791771999980)*g[0]+CONSTANT(0.135045473384000000)*g[24]; - y[22] += tf*g[22]+tg*f[22]; - t = f[22]*g[22]; - y[6] += CONSTANT(0.065535909662600006)*t; - y[20] += 
CONSTANT(-0.083698454702400005)*t;
-    y[0] += CONSTANT(0.282094791771999980)*t;
-    y[24] += CONSTANT(0.135045473384000000)*t;
-
-    // [23,23]: 6,20,0,
-    tf = CONSTANT(-0.057343920955899998)*f[6]+CONSTANT(-0.159787958979000000)*f[20]+CONSTANT(0.282094791768999990)*f[0];
-    tg = CONSTANT(-0.057343920955899998)*g[6]+CONSTANT(-0.159787958979000000)*g[20]+CONSTANT(0.282094791768999990)*g[0];
-    y[23] += tf*g[23]+tg*f[23];
-    t = f[23]*g[23];
-    y[6] += CONSTANT(-0.057343920955899998)*t;
-    y[20] += CONSTANT(-0.159787958979000000)*t;
-    y[0] += CONSTANT(0.282094791768999990)*t;
-
-    // [24,24]: 6,0,20,
-    tf = CONSTANT(-0.229375683829000000)*f[6]+CONSTANT(0.282094791763999990)*f[0]+CONSTANT(0.106525305981000000)*f[20];
-    tg = CONSTANT(-0.229375683829000000)*g[6]+CONSTANT(0.282094791763999990)*g[0]+CONSTANT(0.106525305981000000)*g[20];
-    y[24] += tf*g[24]+tg*f[24];
-    t = f[24]*g[24];
-    y[6] += CONSTANT(-0.229375683829000000)*t;
-    y[0] += CONSTANT(0.282094791763999990)*t;
-    y[20] += CONSTANT(0.106525305981000000)*t;
-
-    // multiply count=1135
-
-    return y;
-}
-
-
-//-------------------------------------------------------------------------------------
-// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232909.aspx
-//-------------------------------------------------------------------------------------
-float* XMSHMultiply6( _Out_writes_(36) float *y,
-                      _In_reads_(36) const float *f,
-                      _In_reads_(36) const float *g )
-{
-    if ( !y || !f || !g )
-        return nullptr;
-
-    REAL tf,tg,t;
-    // [0,0]: 0,
-    y[0] = CONSTANT(0.282094792935999980)*f[0]*g[0];
-
-    // [1,1]: 0,6,8,
-    tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(-0.218509686119999990)*f[8];
-    tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(-0.218509686119999990)*g[8];
-    y[1] = tf*g[1]+tg*f[1];
-    t = f[1]*g[1];
-    y[0] += CONSTANT(0.282094791773000010)*t;
-    y[6] = CONSTANT(-0.126156626101000010)*t;
-    y[8] = CONSTANT(-0.218509686119999990)*t;
-
-    // [1,4]: 3,13,15,
-    tf = CONSTANT(0.218509686114999990)*f[3]+CONSTANT(-0.058399170082300000)*f[13]+CONSTANT(-0.226179013157999990)*f[15];
-    tg = CONSTANT(0.218509686114999990)*g[3]+CONSTANT(-0.058399170082300000)*g[13]+CONSTANT(-0.226179013157999990)*g[15];
-    y[1] += tf*g[4]+tg*f[4];
-    y[4] = tf*g[1]+tg*f[1];
-    t = f[1]*g[4]+f[4]*g[1];
-    y[3] = CONSTANT(0.218509686114999990)*t;
-    y[13] = CONSTANT(-0.058399170082300000)*t;
-    y[15] = CONSTANT(-0.226179013157999990)*t;
-
-    // [1,5]: 2,12,
-    tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12];
-    tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12];
-    y[1] += tf*g[5]+tg*f[5];
-    y[5] = tf*g[1]+tg*f[1];
-    t = f[1]*g[5]+f[5]*g[1];
-    y[2] = CONSTANT(0.218509686118000010)*t;
-    y[12] = CONSTANT(-0.143048168103000000)*t;
-
-    // [1,11]: 6,8,20,22,
-    tf = CONSTANT(0.202300659402999990)*f[6]+CONSTANT(0.058399170081799998)*f[8]+CONSTANT(-0.150786008773000000)*f[20]+CONSTANT(-0.168583882836999990)*f[22];
-    tg = CONSTANT(0.202300659402999990)*g[6]+CONSTANT(0.058399170081799998)*g[8]+CONSTANT(-0.150786008773000000)*g[20]+CONSTANT(-0.168583882836999990)*g[22];
-    y[1] += tf*g[11]+tg*f[11];
-    y[11] = tf*g[1]+tg*f[1];
-    t = f[1]*g[11]+f[11]*g[1];
-    y[6] += CONSTANT(0.202300659402999990)*t;
-    y[8] += CONSTANT(0.058399170081799998)*t;
-    y[20] = CONSTANT(-0.150786008773000000)*t;
-    y[22] = CONSTANT(-0.168583882836999990)*t;
-
-    // [1,16]: 15,33,35,
-    tf =
CONSTANT(0.230329432973999990)*f[15]+CONSTANT(-0.034723468517399998)*f[33]+CONSTANT(-0.232932108051999990)*f[35]; - tg = CONSTANT(0.230329432973999990)*g[15]+CONSTANT(-0.034723468517399998)*g[33]+CONSTANT(-0.232932108051999990)*g[35]; - y[1] += tf*g[16]+tg*f[16]; - y[16] = tf*g[1]+tg*f[1]; - t = f[1]*g[16]+f[16]*g[1]; - y[15] += CONSTANT(0.230329432973999990)*t; - y[33] = CONSTANT(-0.034723468517399998)*t; - y[35] = CONSTANT(-0.232932108051999990)*t; - - // [1,18]: 15,13,31,33, - tf = CONSTANT(0.043528171377799997)*f[15]+CONSTANT(0.168583882834000000)*f[13]+CONSTANT(-0.085054779966799998)*f[31]+CONSTANT(-0.183739324705999990)*f[33]; - tg = CONSTANT(0.043528171377799997)*g[15]+CONSTANT(0.168583882834000000)*g[13]+CONSTANT(-0.085054779966799998)*g[31]+CONSTANT(-0.183739324705999990)*g[33]; - y[1] += tf*g[18]+tg*f[18]; - y[18] = tf*g[1]+tg*f[1]; - t = f[1]*g[18]+f[18]*g[1]; - y[15] += CONSTANT(0.043528171377799997)*t; - y[13] += CONSTANT(0.168583882834000000)*t; - y[31] = CONSTANT(-0.085054779966799998)*t; - y[33] += CONSTANT(-0.183739324705999990)*t; - - // [1,19]: 14,12,30,32, - tf = CONSTANT(0.075393004386399995)*f[14]+CONSTANT(0.194663900273000010)*f[12]+CONSTANT(-0.155288072037000010)*f[30]+CONSTANT(-0.159122922869999990)*f[32]; - tg = CONSTANT(0.075393004386399995)*g[14]+CONSTANT(0.194663900273000010)*g[12]+CONSTANT(-0.155288072037000010)*g[30]+CONSTANT(-0.159122922869999990)*g[32]; - y[1] += tf*g[19]+tg*f[19]; - y[19] = tf*g[1]+tg*f[1]; - t = f[1]*g[19]+f[19]*g[1]; - y[14] = CONSTANT(0.075393004386399995)*t; - y[12] += CONSTANT(0.194663900273000010)*t; - y[30] = CONSTANT(-0.155288072037000010)*t; - y[32] = CONSTANT(-0.159122922869999990)*t; - - // [1,24]: 9,25,27, - tf = CONSTANT(-0.230329432978999990)*f[9]+CONSTANT(0.232932108049000000)*f[25]+CONSTANT(0.034723468517100002)*f[27]; - tg = CONSTANT(-0.230329432978999990)*g[9]+CONSTANT(0.232932108049000000)*g[25]+CONSTANT(0.034723468517100002)*g[27]; - y[1] += tf*g[24]+tg*f[24]; - y[24] = tf*g[1]+tg*f[1]; - t = f[1]*g[24]+f[24]*g[1]; - y[9] = CONSTANT(-0.230329432978999990)*t; - y[25] = CONSTANT(0.232932108049000000)*t; - y[27] = CONSTANT(0.034723468517100002)*t; - - // [1,29]: 22,20, - tf = CONSTANT(0.085054779965999999)*f[22]+CONSTANT(0.190188269815000010)*f[20]; - tg = CONSTANT(0.085054779965999999)*g[22]+CONSTANT(0.190188269815000010)*g[20]; - y[1] += tf*g[29]+tg*f[29]; - y[29] = tf*g[1]+tg*f[1]; - t = f[1]*g[29]+f[29]*g[1]; - y[22] += CONSTANT(0.085054779965999999)*t; - y[20] += CONSTANT(0.190188269815000010)*t; - - // [2,2]: 0,6, - tf = CONSTANT(0.282094795249000000)*f[0]+CONSTANT(0.252313259986999990)*f[6]; - tg = CONSTANT(0.282094795249000000)*g[0]+CONSTANT(0.252313259986999990)*g[6]; - y[2] += tf*g[2]+tg*f[2]; - t = f[2]*g[2]; - y[0] += CONSTANT(0.282094795249000000)*t; - y[6] += CONSTANT(0.252313259986999990)*t; - - // [2,12]: 6,20, - tf = CONSTANT(0.247766706973999990)*f[6]+CONSTANT(0.246232537174000010)*f[20]; - tg = CONSTANT(0.247766706973999990)*g[6]+CONSTANT(0.246232537174000010)*g[20]; - y[2] += tf*g[12]+tg*f[12]; - y[12] += tf*g[2]+tg*f[2]; - t = f[2]*g[12]+f[12]*g[2]; - y[6] += CONSTANT(0.247766706973999990)*t; - y[20] += CONSTANT(0.246232537174000010)*t; - - // [2,20]: 30, - tf = CONSTANT(0.245532020560000010)*f[30]; - tg = CONSTANT(0.245532020560000010)*g[30]; - y[2] += tf*g[20]+tg*f[20]; - y[20] += tf*g[2]+tg*f[2]; - t = f[2]*g[20]+f[20]*g[2]; - y[30] += CONSTANT(0.245532020560000010)*t; - - // [3,3]: 0,6,8, - tf = 
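
For XMSHMultiply6 the same calling convention applies with 36-coefficient (order 6) arrays, per the SAL annotations: the pointer arguments are validated and the output pointer is returned on success. A usage sketch with illustrative names:

float f6[36] = { /* 36 SH coefficients */ };
float g6[36] = { /* 36 SH coefficients */ };
float y6[36];

// Returns y6, or nullptr if any pointer argument is null.
if (!DirectX::XMSHMultiply6(y6, f6, g6))
{
    // handle the error
}
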
CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(0.218509686119999990)*f[8]; - tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(0.218509686119999990)*g[8]; - y[3] += tf*g[3]+tg*f[3]; - t = f[3]*g[3]; - y[0] += CONSTANT(0.282094791773000010)*t; - y[6] += CONSTANT(-0.126156626101000010)*t; - y[8] += CONSTANT(0.218509686119999990)*t; - - // [3,7]: 2,12, - tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12]; - tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12]; - y[3] += tf*g[7]+tg*f[7]; - y[7] = tf*g[3]+tg*f[3]; - t = f[3]*g[7]+f[7]*g[3]; - y[2] += CONSTANT(0.218509686118000010)*t; - y[12] += CONSTANT(-0.143048168103000000)*t; - - // [3,13]: 8,6,20,22, - tf = CONSTANT(-0.058399170081799998)*f[8]+CONSTANT(0.202300659402999990)*f[6]+CONSTANT(-0.150786008773000000)*f[20]+CONSTANT(0.168583882836999990)*f[22]; - tg = CONSTANT(-0.058399170081799998)*g[8]+CONSTANT(0.202300659402999990)*g[6]+CONSTANT(-0.150786008773000000)*g[20]+CONSTANT(0.168583882836999990)*g[22]; - y[3] += tf*g[13]+tg*f[13]; - y[13] += tf*g[3]+tg*f[3]; - t = f[3]*g[13]+f[13]*g[3]; - y[8] += CONSTANT(-0.058399170081799998)*t; - y[6] += CONSTANT(0.202300659402999990)*t; - y[20] += CONSTANT(-0.150786008773000000)*t; - y[22] += CONSTANT(0.168583882836999990)*t; - - // [3,16]: 9,25,27, - tf = CONSTANT(0.230329432973999990)*f[9]+CONSTANT(0.232932108051999990)*f[25]+CONSTANT(-0.034723468517399998)*f[27]; - tg = CONSTANT(0.230329432973999990)*g[9]+CONSTANT(0.232932108051999990)*g[25]+CONSTANT(-0.034723468517399998)*g[27]; - y[3] += tf*g[16]+tg*f[16]; - y[16] += tf*g[3]+tg*f[3]; - t = f[3]*g[16]+f[16]*g[3]; - y[9] += CONSTANT(0.230329432973999990)*t; - y[25] += CONSTANT(0.232932108051999990)*t; - y[27] += CONSTANT(-0.034723468517399998)*t; - - // [3,21]: 12,14,30,32, - tf = CONSTANT(0.194663900273000010)*f[12]+CONSTANT(-0.075393004386399995)*f[14]+CONSTANT(-0.155288072037000010)*f[30]+CONSTANT(0.159122922869999990)*f[32]; - tg = CONSTANT(0.194663900273000010)*g[12]+CONSTANT(-0.075393004386399995)*g[14]+CONSTANT(-0.155288072037000010)*g[30]+CONSTANT(0.159122922869999990)*g[32]; - y[3] += tf*g[21]+tg*f[21]; - y[21] = tf*g[3]+tg*f[3]; - t = f[3]*g[21]+f[21]*g[3]; - y[12] += CONSTANT(0.194663900273000010)*t; - y[14] += CONSTANT(-0.075393004386399995)*t; - y[30] += CONSTANT(-0.155288072037000010)*t; - y[32] += CONSTANT(0.159122922869999990)*t; - - // [3,24]: 15,33,35, - tf = CONSTANT(0.230329432978999990)*f[15]+CONSTANT(-0.034723468517100002)*f[33]+CONSTANT(0.232932108049000000)*f[35]; - tg = CONSTANT(0.230329432978999990)*g[15]+CONSTANT(-0.034723468517100002)*g[33]+CONSTANT(0.232932108049000000)*g[35]; - y[3] += tf*g[24]+tg*f[24]; - y[24] += tf*g[3]+tg*f[3]; - t = f[3]*g[24]+f[24]*g[3]; - y[15] += CONSTANT(0.230329432978999990)*t; - y[33] += CONSTANT(-0.034723468517100002)*t; - y[35] += CONSTANT(0.232932108049000000)*t; - - // [3,31]: 20,22, - tf = CONSTANT(0.190188269815000010)*f[20]+CONSTANT(-0.085054779965999999)*f[22]; - tg = CONSTANT(0.190188269815000010)*g[20]+CONSTANT(-0.085054779965999999)*g[22]; - y[3] += tf*g[31]+tg*f[31]; - y[31] += tf*g[3]+tg*f[3]; - t = f[3]*g[31]+f[31]*g[3]; - y[20] += CONSTANT(0.190188269815000010)*t; - y[22] += CONSTANT(-0.085054779965999999)*t; - - // [4,4]: 0,6,20,24, - tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]+CONSTANT(0.040299255967500003)*f[20]+CONSTANT(-0.238413613505999990)*f[24]; - tg = 
CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]+CONSTANT(0.040299255967500003)*g[20]+CONSTANT(-0.238413613505999990)*g[24]; - y[4] += tf*g[4]+tg*f[4]; - t = f[4]*g[4]; - y[0] += CONSTANT(0.282094791770000020)*t; - y[6] += CONSTANT(-0.180223751576000010)*t; - y[20] += CONSTANT(0.040299255967500003)*t; - y[24] += CONSTANT(-0.238413613505999990)*t; - - // [4,5]: 7,21,23, - tf = CONSTANT(0.156078347226000000)*f[7]+CONSTANT(-0.063718718434399996)*f[21]+CONSTANT(-0.168583882835000000)*f[23]; - tg = CONSTANT(0.156078347226000000)*g[7]+CONSTANT(-0.063718718434399996)*g[21]+CONSTANT(-0.168583882835000000)*g[23]; - y[4] += tf*g[5]+tg*f[5]; - y[5] += tf*g[4]+tg*f[4]; - t = f[4]*g[5]+f[5]*g[4]; - y[7] += CONSTANT(0.156078347226000000)*t; - y[21] += CONSTANT(-0.063718718434399996)*t; - y[23] = CONSTANT(-0.168583882835000000)*t; - - // [4,9]: 3,13,31,35, - tf = CONSTANT(0.226179013157999990)*f[3]+CONSTANT(-0.094031597258400004)*f[13]+CONSTANT(0.016943317729299998)*f[31]+CONSTANT(-0.245532000542000000)*f[35]; - tg = CONSTANT(0.226179013157999990)*g[3]+CONSTANT(-0.094031597258400004)*g[13]+CONSTANT(0.016943317729299998)*g[31]+CONSTANT(-0.245532000542000000)*g[35]; - y[4] += tf*g[9]+tg*f[9]; - y[9] += tf*g[4]+tg*f[4]; - t = f[4]*g[9]+f[9]*g[4]; - y[3] += CONSTANT(0.226179013157999990)*t; - y[13] += CONSTANT(-0.094031597258400004)*t; - y[31] += CONSTANT(0.016943317729299998)*t; - y[35] += CONSTANT(-0.245532000542000000)*t; - - // [4,10]: 2,12,30,34, - tf = CONSTANT(0.184674390919999990)*f[2]+CONSTANT(-0.188063194517999990)*f[12]+CONSTANT(0.053579475144400000)*f[30]+CONSTANT(-0.190188269816000010)*f[34]; - tg = CONSTANT(0.184674390919999990)*g[2]+CONSTANT(-0.188063194517999990)*g[12]+CONSTANT(0.053579475144400000)*g[30]+CONSTANT(-0.190188269816000010)*g[34]; - y[4] += tf*g[10]+tg*f[10]; - y[10] = tf*g[4]+tg*f[4]; - t = f[4]*g[10]+f[10]*g[4]; - y[2] += CONSTANT(0.184674390919999990)*t; - y[12] += CONSTANT(-0.188063194517999990)*t; - y[30] += CONSTANT(0.053579475144400000)*t; - y[34] = CONSTANT(-0.190188269816000010)*t; - - // [4,11]: 3,13,15,31,33, - tf = CONSTANT(-0.058399170082300000)*f[3]+CONSTANT(0.145673124078000010)*f[13]+CONSTANT(0.094031597258400004)*f[15]+CONSTANT(-0.065621187395699998)*f[31]+CONSTANT(-0.141757966610000010)*f[33]; - tg = CONSTANT(-0.058399170082300000)*g[3]+CONSTANT(0.145673124078000010)*g[13]+CONSTANT(0.094031597258400004)*g[15]+CONSTANT(-0.065621187395699998)*g[31]+CONSTANT(-0.141757966610000010)*g[33]; - y[4] += tf*g[11]+tg*f[11]; - y[11] += tf*g[4]+tg*f[4]; - t = f[4]*g[11]+f[11]*g[4]; - y[3] += CONSTANT(-0.058399170082300000)*t; - y[13] += CONSTANT(0.145673124078000010)*t; - y[15] += CONSTANT(0.094031597258400004)*t; - y[31] += CONSTANT(-0.065621187395699998)*t; - y[33] += CONSTANT(-0.141757966610000010)*t; - - // [4,16]: 8,22, - tf = CONSTANT(0.238413613494000000)*f[8]+CONSTANT(-0.075080816693699995)*f[22]; - tg = CONSTANT(0.238413613494000000)*g[8]+CONSTANT(-0.075080816693699995)*g[22]; - y[4] += tf*g[16]+tg*f[16]; - y[16] += tf*g[4]+tg*f[4]; - t = f[4]*g[16]+f[16]*g[4]; - y[8] += CONSTANT(0.238413613494000000)*t; - y[22] += CONSTANT(-0.075080816693699995)*t; - - // [4,18]: 6,20,24, - tf = CONSTANT(0.156078347226000000)*f[6]+CONSTANT(-0.190364615029000010)*f[20]+CONSTANT(0.075080816691500005)*f[24]; - tg = CONSTANT(0.156078347226000000)*g[6]+CONSTANT(-0.190364615029000010)*g[20]+CONSTANT(0.075080816691500005)*g[24]; - y[4] += tf*g[18]+tg*f[18]; - y[18] += tf*g[4]+tg*f[4]; - t = f[4]*g[18]+f[18]*g[4]; - y[6] += CONSTANT(0.156078347226000000)*t; - y[20] 
+= CONSTANT(-0.190364615029000010)*t; - y[24] += CONSTANT(0.075080816691500005)*t; - - // [4,19]: 7,21,23, - tf = CONSTANT(-0.063718718434399996)*f[7]+CONSTANT(0.141889406569999990)*f[21]+CONSTANT(0.112621225039000000)*f[23]; - tg = CONSTANT(-0.063718718434399996)*g[7]+CONSTANT(0.141889406569999990)*g[21]+CONSTANT(0.112621225039000000)*g[23]; - y[4] += tf*g[19]+tg*f[19]; - y[19] += tf*g[4]+tg*f[4]; - t = f[4]*g[19]+f[19]*g[4]; - y[7] += CONSTANT(-0.063718718434399996)*t; - y[21] += CONSTANT(0.141889406569999990)*t; - y[23] += CONSTANT(0.112621225039000000)*t; - - // [4,25]: 15,33, - tf = CONSTANT(0.245532000542000000)*f[15]+CONSTANT(-0.062641347680800000)*f[33]; - tg = CONSTANT(0.245532000542000000)*g[15]+CONSTANT(-0.062641347680800000)*g[33]; - y[4] += tf*g[25]+tg*f[25]; - y[25] += tf*g[4]+tg*f[4]; - t = f[4]*g[25]+f[25]*g[4]; - y[15] += CONSTANT(0.245532000542000000)*t; - y[33] += CONSTANT(-0.062641347680800000)*t; - - // [4,26]: 14,32, - tf = CONSTANT(0.190188269806999990)*f[14]+CONSTANT(-0.097043558542400002)*f[32]; - tg = CONSTANT(0.190188269806999990)*g[14]+CONSTANT(-0.097043558542400002)*g[32]; - y[4] += tf*g[26]+tg*f[26]; - y[26] = tf*g[4]+tg*f[4]; - t = f[4]*g[26]+f[26]*g[4]; - y[14] += CONSTANT(0.190188269806999990)*t; - y[32] += CONSTANT(-0.097043558542400002)*t; - - // [4,27]: 13,31,35, - tf = CONSTANT(0.141757966610000010)*f[13]+CONSTANT(-0.121034582549000000)*f[31]+CONSTANT(0.062641347680800000)*f[35]; - tg = CONSTANT(0.141757966610000010)*g[13]+CONSTANT(-0.121034582549000000)*g[31]+CONSTANT(0.062641347680800000)*g[35]; - y[4] += tf*g[27]+tg*f[27]; - y[27] += tf*g[4]+tg*f[4]; - t = f[4]*g[27]+f[27]*g[4]; - y[13] += CONSTANT(0.141757966610000010)*t; - y[31] += CONSTANT(-0.121034582549000000)*t; - y[35] += CONSTANT(0.062641347680800000)*t; - - // [4,28]: 12,30,34, - tf = CONSTANT(0.141757966609000000)*f[12]+CONSTANT(-0.191372478254000000)*f[30]+CONSTANT(0.097043558538899996)*f[34]; - tg = CONSTANT(0.141757966609000000)*g[12]+CONSTANT(-0.191372478254000000)*g[30]+CONSTANT(0.097043558538899996)*g[34]; - y[4] += tf*g[28]+tg*f[28]; - y[28] = tf*g[4]+tg*f[4]; - t = f[4]*g[28]+f[28]*g[4]; - y[12] += CONSTANT(0.141757966609000000)*t; - y[30] += CONSTANT(-0.191372478254000000)*t; - y[34] += CONSTANT(0.097043558538899996)*t; - - // [4,29]: 13,15,31,33, - tf = CONSTANT(-0.065621187395699998)*f[13]+CONSTANT(-0.016943317729299998)*f[15]+CONSTANT(0.140070311613999990)*f[31]+CONSTANT(0.121034582549000000)*f[33]; - tg = CONSTANT(-0.065621187395699998)*g[13]+CONSTANT(-0.016943317729299998)*g[15]+CONSTANT(0.140070311613999990)*g[31]+CONSTANT(0.121034582549000000)*g[33]; - y[4] += tf*g[29]+tg*f[29]; - y[29] += tf*g[4]+tg*f[4]; - t = f[4]*g[29]+f[29]*g[4]; - y[13] += CONSTANT(-0.065621187395699998)*t; - y[15] += CONSTANT(-0.016943317729299998)*t; - y[31] += CONSTANT(0.140070311613999990)*t; - y[33] += CONSTANT(0.121034582549000000)*t; - - // [5,5]: 0,6,8,20,22, - tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.090111875786499998)*f[6]+CONSTANT(-0.156078347227999990)*f[8]+CONSTANT(-0.161197023870999990)*f[20]+CONSTANT(-0.180223751574000000)*f[22]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.090111875786499998)*g[6]+CONSTANT(-0.156078347227999990)*g[8]+CONSTANT(-0.161197023870999990)*g[20]+CONSTANT(-0.180223751574000000)*g[22]; - y[5] += tf*g[5]+tg*f[5]; - t = f[5]*g[5]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[6] += CONSTANT(0.090111875786499998)*t; - y[8] += CONSTANT(-0.156078347227999990)*t; - y[20] += CONSTANT(-0.161197023870999990)*t; - y[22] += 
CONSTANT(-0.180223751574000000)*t; - - // [5,10]: 3,13,15,31,33, - tf = CONSTANT(0.184674390919999990)*f[3]+CONSTANT(0.115164716490000000)*f[13]+CONSTANT(-0.148677009678999990)*f[15]+CONSTANT(-0.083004965974099995)*f[31]+CONSTANT(-0.179311220383999990)*f[33]; - tg = CONSTANT(0.184674390919999990)*g[3]+CONSTANT(0.115164716490000000)*g[13]+CONSTANT(-0.148677009678999990)*g[15]+CONSTANT(-0.083004965974099995)*g[31]+CONSTANT(-0.179311220383999990)*g[33]; - y[5] += tf*g[10]+tg*f[10]; - y[10] += tf*g[5]+tg*f[5]; - t = f[5]*g[10]+f[10]*g[5]; - y[3] += CONSTANT(0.184674390919999990)*t; - y[13] += CONSTANT(0.115164716490000000)*t; - y[15] += CONSTANT(-0.148677009678999990)*t; - y[31] += CONSTANT(-0.083004965974099995)*t; - y[33] += CONSTANT(-0.179311220383999990)*t; - - // [5,11]: 2,12,14,30,32, - tf = CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.059470803871800003)*f[12]+CONSTANT(-0.115164716491000000)*f[14]+CONSTANT(-0.169433177294000010)*f[30]+CONSTANT(-0.173617342585000000)*f[32]; - tg = CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.059470803871800003)*g[12]+CONSTANT(-0.115164716491000000)*g[14]+CONSTANT(-0.169433177294000010)*g[30]+CONSTANT(-0.173617342585000000)*g[32]; - y[5] += tf*g[11]+tg*f[11]; - y[11] += tf*g[5]+tg*f[5]; - t = f[5]*g[11]+f[11]*g[5]; - y[2] += CONSTANT(0.233596680327000010)*t; - y[12] += CONSTANT(0.059470803871800003)*t; - y[14] += CONSTANT(-0.115164716491000000)*t; - y[30] += CONSTANT(-0.169433177294000010)*t; - y[32] += CONSTANT(-0.173617342585000000)*t; - - // [5,14]: 9,1,27,29, - tf = CONSTANT(0.148677009677999990)*f[9]+CONSTANT(-0.184674390923000000)*f[1]+CONSTANT(0.179311220382000010)*f[27]+CONSTANT(0.083004965973399999)*f[29]; - tg = CONSTANT(0.148677009677999990)*g[9]+CONSTANT(-0.184674390923000000)*g[1]+CONSTANT(0.179311220382000010)*g[27]+CONSTANT(0.083004965973399999)*g[29]; - y[5] += tf*g[14]+tg*f[14]; - y[14] += tf*g[5]+tg*f[5]; - t = f[5]*g[14]+f[14]*g[5]; - y[9] += CONSTANT(0.148677009677999990)*t; - y[1] += CONSTANT(-0.184674390923000000)*t; - y[27] += CONSTANT(0.179311220382000010)*t; - y[29] += CONSTANT(0.083004965973399999)*t; - - // [5,17]: 8,22,24, - tf = CONSTANT(0.168583882832999990)*f[8]+CONSTANT(0.132725386548000010)*f[22]+CONSTANT(-0.140463346189000000)*f[24]; - tg = CONSTANT(0.168583882832999990)*g[8]+CONSTANT(0.132725386548000010)*g[22]+CONSTANT(-0.140463346189000000)*g[24]; - y[5] += tf*g[17]+tg*f[17]; - y[17] = tf*g[5]+tg*f[5]; - t = f[5]*g[17]+f[17]*g[5]; - y[8] += CONSTANT(0.168583882832999990)*t; - y[22] += CONSTANT(0.132725386548000010)*t; - y[24] += CONSTANT(-0.140463346189000000)*t; - - // [5,18]: 7,21,23, - tf = CONSTANT(0.180223751571000010)*f[7]+CONSTANT(0.090297865407399994)*f[21]+CONSTANT(-0.132725386549000010)*f[23]; - tg = CONSTANT(0.180223751571000010)*g[7]+CONSTANT(0.090297865407399994)*g[21]+CONSTANT(-0.132725386549000010)*g[23]; - y[5] += tf*g[18]+tg*f[18]; - y[18] += tf*g[5]+tg*f[5]; - t = f[5]*g[18]+f[18]*g[5]; - y[7] += CONSTANT(0.180223751571000010)*t; - y[21] += CONSTANT(0.090297865407399994)*t; - y[23] += CONSTANT(-0.132725386549000010)*t; - - // [5,19]: 6,8,20,22, - tf = CONSTANT(0.220728115440999990)*f[6]+CONSTANT(0.063718718433900007)*f[8]+CONSTANT(0.044869370061299998)*f[20]+CONSTANT(-0.090297865408399999)*f[22]; - tg = CONSTANT(0.220728115440999990)*g[6]+CONSTANT(0.063718718433900007)*g[8]+CONSTANT(0.044869370061299998)*g[20]+CONSTANT(-0.090297865408399999)*g[22]; - y[5] += tf*g[19]+tg*f[19]; - y[19] += tf*g[5]+tg*f[5]; - t = f[5]*g[19]+f[19]*g[5]; - y[6] += CONSTANT(0.220728115440999990)*t; - y[8] += 
CONSTANT(0.063718718433900007)*t; - y[20] += CONSTANT(0.044869370061299998)*t; - y[22] += CONSTANT(-0.090297865408399999)*t; - - // [5,26]: 15,33,35, - tf = CONSTANT(0.155288072035000000)*f[15]+CONSTANT(0.138662534056999990)*f[33]+CONSTANT(-0.132882365179999990)*f[35]; - tg = CONSTANT(0.155288072035000000)*g[15]+CONSTANT(0.138662534056999990)*g[33]+CONSTANT(-0.132882365179999990)*g[35]; - y[5] += tf*g[26]+tg*f[26]; - y[26] += tf*g[5]+tg*f[5]; - t = f[5]*g[26]+f[26]*g[5]; - y[15] += CONSTANT(0.155288072035000000)*t; - y[33] += CONSTANT(0.138662534056999990)*t; - y[35] += CONSTANT(-0.132882365179999990)*t; - - // [5,28]: 15,13,31,33, - tf = CONSTANT(0.044827805096399997)*f[15]+CONSTANT(0.173617342584000000)*f[13]+CONSTANT(0.074118242118699995)*f[31]+CONSTANT(-0.114366930522000000)*f[33]; - tg = CONSTANT(0.044827805096399997)*g[15]+CONSTANT(0.173617342584000000)*g[13]+CONSTANT(0.074118242118699995)*g[31]+CONSTANT(-0.114366930522000000)*g[33]; - y[5] += tf*g[28]+tg*f[28]; - y[28] += tf*g[5]+tg*f[5]; - t = f[5]*g[28]+f[28]*g[5]; - y[15] += CONSTANT(0.044827805096399997)*t; - y[13] += CONSTANT(0.173617342584000000)*t; - y[31] += CONSTANT(0.074118242118699995)*t; - y[33] += CONSTANT(-0.114366930522000000)*t; - - // [5,29]: 12,30,32, - tf = CONSTANT(0.214317900578999990)*f[12]+CONSTANT(0.036165998945399999)*f[30]+CONSTANT(-0.074118242119099995)*f[32]; - tg = CONSTANT(0.214317900578999990)*g[12]+CONSTANT(0.036165998945399999)*g[30]+CONSTANT(-0.074118242119099995)*g[32]; - y[5] += tf*g[29]+tg*f[29]; - y[29] += tf*g[5]+tg*f[5]; - t = f[5]*g[29]+f[29]*g[5]; - y[12] += CONSTANT(0.214317900578999990)*t; - y[30] += CONSTANT(0.036165998945399999)*t; - y[32] += CONSTANT(-0.074118242119099995)*t; - - // [5,32]: 9,27, - tf = CONSTANT(-0.044827805096799997)*f[9]+CONSTANT(0.114366930522000000)*f[27]; - tg = CONSTANT(-0.044827805096799997)*g[9]+CONSTANT(0.114366930522000000)*g[27]; - y[5] += tf*g[32]+tg*f[32]; - y[32] += tf*g[5]+tg*f[5]; - t = f[5]*g[32]+f[32]*g[5]; - y[9] += CONSTANT(-0.044827805096799997)*t; - y[27] += CONSTANT(0.114366930522000000)*t; - - // [5,34]: 9,27,25, - tf = CONSTANT(-0.155288072036000010)*f[9]+CONSTANT(-0.138662534059000000)*f[27]+CONSTANT(0.132882365179000010)*f[25]; - tg = CONSTANT(-0.155288072036000010)*g[9]+CONSTANT(-0.138662534059000000)*g[27]+CONSTANT(0.132882365179000010)*g[25]; - y[5] += tf*g[34]+tg*f[34]; - y[34] += tf*g[5]+tg*f[5]; - t = f[5]*g[34]+f[34]*g[5]; - y[9] += CONSTANT(-0.155288072036000010)*t; - y[27] += CONSTANT(-0.138662534059000000)*t; - y[25] += CONSTANT(0.132882365179000010)*t; - - // [6,6]: 0,6,20, - tf = CONSTANT(0.282094797560000000)*f[0]+CONSTANT(0.241795553185999990)*f[20]; - tg = CONSTANT(0.282094797560000000)*g[0]+CONSTANT(0.241795553185999990)*g[20]; - y[6] += tf*g[6]+tg*f[6]; - t = f[6]*g[6]; - y[0] += CONSTANT(0.282094797560000000)*t; - y[6] += CONSTANT(0.180223764527000010)*t; - y[20] += CONSTANT(0.241795553185999990)*t; - - // [7,7]: 6,0,8,20,22, - tf = CONSTANT(0.090111875786499998)*f[6]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.156078347227999990)*f[8]+CONSTANT(-0.161197023870999990)*f[20]+CONSTANT(0.180223751574000000)*f[22]; - tg = CONSTANT(0.090111875786499998)*g[6]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.156078347227999990)*g[8]+CONSTANT(-0.161197023870999990)*g[20]+CONSTANT(0.180223751574000000)*g[22]; - y[7] += tf*g[7]+tg*f[7]; - t = f[7]*g[7]; - y[6] += CONSTANT(0.090111875786499998)*t; - y[0] += CONSTANT(0.282094791773999990)*t; - y[8] += CONSTANT(0.156078347227999990)*t; - y[20] += CONSTANT(-0.161197023870999990)*t; - y[22] 
+= CONSTANT(0.180223751574000000)*t; - - // [7,10]: 9,1,11,27,29, - tf = CONSTANT(0.148677009678999990)*f[9]+CONSTANT(0.184674390919999990)*f[1]+CONSTANT(0.115164716490000000)*f[11]+CONSTANT(0.179311220383999990)*f[27]+CONSTANT(-0.083004965974099995)*f[29]; - tg = CONSTANT(0.148677009678999990)*g[9]+CONSTANT(0.184674390919999990)*g[1]+CONSTANT(0.115164716490000000)*g[11]+CONSTANT(0.179311220383999990)*g[27]+CONSTANT(-0.083004965974099995)*g[29]; - y[7] += tf*g[10]+tg*f[10]; - y[10] += tf*g[7]+tg*f[7]; - t = f[7]*g[10]+f[10]*g[7]; - y[9] += CONSTANT(0.148677009678999990)*t; - y[1] += CONSTANT(0.184674390919999990)*t; - y[11] += CONSTANT(0.115164716490000000)*t; - y[27] += CONSTANT(0.179311220383999990)*t; - y[29] += CONSTANT(-0.083004965974099995)*t; - - // [7,13]: 12,2,14,30,32, - tf = CONSTANT(0.059470803871800003)*f[12]+CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.115164716491000000)*f[14]+CONSTANT(-0.169433177294000010)*f[30]+CONSTANT(0.173617342585000000)*f[32]; - tg = CONSTANT(0.059470803871800003)*g[12]+CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.115164716491000000)*g[14]+CONSTANT(-0.169433177294000010)*g[30]+CONSTANT(0.173617342585000000)*g[32]; - y[7] += tf*g[13]+tg*f[13]; - y[13] += tf*g[7]+tg*f[7]; - t = f[7]*g[13]+f[13]*g[7]; - y[12] += CONSTANT(0.059470803871800003)*t; - y[2] += CONSTANT(0.233596680327000010)*t; - y[14] += CONSTANT(0.115164716491000000)*t; - y[30] += CONSTANT(-0.169433177294000010)*t; - y[32] += CONSTANT(0.173617342585000000)*t; - - // [7,14]: 3,15,31,33, - tf = CONSTANT(0.184674390923000000)*f[3]+CONSTANT(0.148677009677999990)*f[15]+CONSTANT(-0.083004965973399999)*f[31]+CONSTANT(0.179311220382000010)*f[33]; - tg = CONSTANT(0.184674390923000000)*g[3]+CONSTANT(0.148677009677999990)*g[15]+CONSTANT(-0.083004965973399999)*g[31]+CONSTANT(0.179311220382000010)*g[33]; - y[7] += tf*g[14]+tg*f[14]; - y[14] += tf*g[7]+tg*f[7]; - t = f[7]*g[14]+f[14]*g[7]; - y[3] += CONSTANT(0.184674390923000000)*t; - y[15] += CONSTANT(0.148677009677999990)*t; - y[31] += CONSTANT(-0.083004965973399999)*t; - y[33] += CONSTANT(0.179311220382000010)*t; - - // [7,17]: 16,4,18, - tf = CONSTANT(0.140463346187999990)*f[16]+CONSTANT(0.168583882835000000)*f[4]+CONSTANT(0.132725386549000010)*f[18]; - tg = CONSTANT(0.140463346187999990)*g[16]+CONSTANT(0.168583882835000000)*g[4]+CONSTANT(0.132725386549000010)*g[18]; - y[7] += tf*g[17]+tg*f[17]; - y[17] += tf*g[7]+tg*f[7]; - t = f[7]*g[17]+f[17]*g[7]; - y[16] += CONSTANT(0.140463346187999990)*t; - y[4] += CONSTANT(0.168583882835000000)*t; - y[18] += CONSTANT(0.132725386549000010)*t; - - // [7,21]: 8,20,6,22, - tf = CONSTANT(-0.063718718433900007)*f[8]+CONSTANT(0.044869370061299998)*f[20]+CONSTANT(0.220728115440999990)*f[6]+CONSTANT(0.090297865408399999)*f[22]; - tg = CONSTANT(-0.063718718433900007)*g[8]+CONSTANT(0.044869370061299998)*g[20]+CONSTANT(0.220728115440999990)*g[6]+CONSTANT(0.090297865408399999)*g[22]; - y[7] += tf*g[21]+tg*f[21]; - y[21] += tf*g[7]+tg*f[7]; - t = f[7]*g[21]+f[21]*g[7]; - y[8] += CONSTANT(-0.063718718433900007)*t; - y[20] += CONSTANT(0.044869370061299998)*t; - y[6] += CONSTANT(0.220728115440999990)*t; - y[22] += CONSTANT(0.090297865408399999)*t; - - // [7,23]: 8,22,24, - tf = CONSTANT(0.168583882832999990)*f[8]+CONSTANT(0.132725386548000010)*f[22]+CONSTANT(0.140463346189000000)*f[24]; - tg = CONSTANT(0.168583882832999990)*g[8]+CONSTANT(0.132725386548000010)*g[22]+CONSTANT(0.140463346189000000)*g[24]; - y[7] += tf*g[23]+tg*f[23]; - y[23] += tf*g[7]+tg*f[7]; - t = f[7]*g[23]+f[23]*g[7]; - y[8] += 
CONSTANT(0.168583882832999990)*t; - y[22] += CONSTANT(0.132725386548000010)*t; - y[24] += CONSTANT(0.140463346189000000)*t; - - // [7,26]: 9,25,27, - tf = CONSTANT(0.155288072035000000)*f[9]+CONSTANT(0.132882365179999990)*f[25]+CONSTANT(0.138662534056999990)*f[27]; - tg = CONSTANT(0.155288072035000000)*g[9]+CONSTANT(0.132882365179999990)*g[25]+CONSTANT(0.138662534056999990)*g[27]; - y[7] += tf*g[26]+tg*f[26]; - y[26] += tf*g[7]+tg*f[7]; - t = f[7]*g[26]+f[26]*g[7]; - y[9] += CONSTANT(0.155288072035000000)*t; - y[25] += CONSTANT(0.132882365179999990)*t; - y[27] += CONSTANT(0.138662534056999990)*t; - - // [7,28]: 27,11,9,29, - tf = CONSTANT(0.114366930522000000)*f[27]+CONSTANT(0.173617342584000000)*f[11]+CONSTANT(-0.044827805096399997)*f[9]+CONSTANT(0.074118242118699995)*f[29]; - tg = CONSTANT(0.114366930522000000)*g[27]+CONSTANT(0.173617342584000000)*g[11]+CONSTANT(-0.044827805096399997)*g[9]+CONSTANT(0.074118242118699995)*g[29]; - y[7] += tf*g[28]+tg*f[28]; - y[28] += tf*g[7]+tg*f[7]; - t = f[7]*g[28]+f[28]*g[7]; - y[27] += CONSTANT(0.114366930522000000)*t; - y[11] += CONSTANT(0.173617342584000000)*t; - y[9] += CONSTANT(-0.044827805096399997)*t; - y[29] += CONSTANT(0.074118242118699995)*t; - - // [7,31]: 30,12,32, - tf = CONSTANT(0.036165998945399999)*f[30]+CONSTANT(0.214317900578999990)*f[12]+CONSTANT(0.074118242119099995)*f[32]; - tg = CONSTANT(0.036165998945399999)*g[30]+CONSTANT(0.214317900578999990)*g[12]+CONSTANT(0.074118242119099995)*g[32]; - y[7] += tf*g[31]+tg*f[31]; - y[31] += tf*g[7]+tg*f[7]; - t = f[7]*g[31]+f[31]*g[7]; - y[30] += CONSTANT(0.036165998945399999)*t; - y[12] += CONSTANT(0.214317900578999990)*t; - y[32] += CONSTANT(0.074118242119099995)*t; - - // [7,32]: 15,33, - tf = CONSTANT(-0.044827805096799997)*f[15]+CONSTANT(0.114366930522000000)*f[33]; - tg = CONSTANT(-0.044827805096799997)*g[15]+CONSTANT(0.114366930522000000)*g[33]; - y[7] += tf*g[32]+tg*f[32]; - y[32] += tf*g[7]+tg*f[7]; - t = f[7]*g[32]+f[32]*g[7]; - y[15] += CONSTANT(-0.044827805096799997)*t; - y[33] += CONSTANT(0.114366930522000000)*t; - - // [7,34]: 15,33,35, - tf = CONSTANT(0.155288072036000010)*f[15]+CONSTANT(0.138662534059000000)*f[33]+CONSTANT(0.132882365179000010)*f[35]; - tg = CONSTANT(0.155288072036000010)*g[15]+CONSTANT(0.138662534059000000)*g[33]+CONSTANT(0.132882365179000010)*g[35]; - y[7] += tf*g[34]+tg*f[34]; - y[34] += tf*g[7]+tg*f[7]; - t = f[7]*g[34]+f[34]*g[7]; - y[15] += CONSTANT(0.155288072036000010)*t; - y[33] += CONSTANT(0.138662534059000000)*t; - y[35] += CONSTANT(0.132882365179000010)*t; - - // [8,8]: 0,6,20,24, - tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]+CONSTANT(0.040299255967500003)*f[20]+CONSTANT(0.238413613505999990)*f[24]; - tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]+CONSTANT(0.040299255967500003)*g[20]+CONSTANT(0.238413613505999990)*g[24]; - y[8] += tf*g[8]+tg*f[8]; - t = f[8]*g[8]; - y[0] += CONSTANT(0.282094791770000020)*t; - y[6] += CONSTANT(-0.180223751576000010)*t; - y[20] += CONSTANT(0.040299255967500003)*t; - y[24] += CONSTANT(0.238413613505999990)*t; - - // [8,9]: 1,11,25,29, - tf = CONSTANT(0.226179013155000000)*f[1]+CONSTANT(-0.094031597259499999)*f[11]+CONSTANT(0.245532000541000000)*f[25]+CONSTANT(0.016943317729199998)*f[29]; - tg = CONSTANT(0.226179013155000000)*g[1]+CONSTANT(-0.094031597259499999)*g[11]+CONSTANT(0.245532000541000000)*g[25]+CONSTANT(0.016943317729199998)*g[29]; - y[8] += tf*g[9]+tg*f[9]; - y[9] += tf*g[8]+tg*f[8]; - t = f[8]*g[9]+f[9]*g[8]; - y[1] += 
CONSTANT(0.226179013155000000)*t; - y[11] += CONSTANT(-0.094031597259499999)*t; - y[25] += CONSTANT(0.245532000541000000)*t; - y[29] += CONSTANT(0.016943317729199998)*t; - - // [8,14]: 2,12,30,34, - tf = CONSTANT(0.184674390919999990)*f[2]+CONSTANT(-0.188063194517999990)*f[12]+CONSTANT(0.053579475144400000)*f[30]+CONSTANT(0.190188269816000010)*f[34]; - tg = CONSTANT(0.184674390919999990)*g[2]+CONSTANT(-0.188063194517999990)*g[12]+CONSTANT(0.053579475144400000)*g[30]+CONSTANT(0.190188269816000010)*g[34]; - y[8] += tf*g[14]+tg*f[14]; - y[14] += tf*g[8]+tg*f[8]; - t = f[8]*g[14]+f[14]*g[8]; - y[2] += CONSTANT(0.184674390919999990)*t; - y[12] += CONSTANT(-0.188063194517999990)*t; - y[30] += CONSTANT(0.053579475144400000)*t; - y[34] += CONSTANT(0.190188269816000010)*t; - - // [8,15]: 13,3,31,35, - tf = CONSTANT(-0.094031597259499999)*f[13]+CONSTANT(0.226179013155000000)*f[3]+CONSTANT(0.016943317729199998)*f[31]+CONSTANT(0.245532000541000000)*f[35]; - tg = CONSTANT(-0.094031597259499999)*g[13]+CONSTANT(0.226179013155000000)*g[3]+CONSTANT(0.016943317729199998)*g[31]+CONSTANT(0.245532000541000000)*g[35]; - y[8] += tf*g[15]+tg*f[15]; - y[15] += tf*g[8]+tg*f[8]; - t = f[8]*g[15]+f[15]*g[8]; - y[13] += CONSTANT(-0.094031597259499999)*t; - y[3] += CONSTANT(0.226179013155000000)*t; - y[31] += CONSTANT(0.016943317729199998)*t; - y[35] += CONSTANT(0.245532000541000000)*t; - - // [8,22]: 6,20,24, - tf = CONSTANT(0.156078347226000000)*f[6]+CONSTANT(-0.190364615029000010)*f[20]+CONSTANT(-0.075080816691500005)*f[24]; - tg = CONSTANT(0.156078347226000000)*g[6]+CONSTANT(-0.190364615029000010)*g[20]+CONSTANT(-0.075080816691500005)*g[24]; - y[8] += tf*g[22]+tg*f[22]; - y[22] += tf*g[8]+tg*f[8]; - t = f[8]*g[22]+f[22]*g[8]; - y[6] += CONSTANT(0.156078347226000000)*t; - y[20] += CONSTANT(-0.190364615029000010)*t; - y[24] += CONSTANT(-0.075080816691500005)*t; - - // [8,26]: 10,28, - tf = CONSTANT(0.190188269806999990)*f[10]+CONSTANT(-0.097043558542400002)*f[28]; - tg = CONSTANT(0.190188269806999990)*g[10]+CONSTANT(-0.097043558542400002)*g[28]; - y[8] += tf*g[26]+tg*f[26]; - y[26] += tf*g[8]+tg*f[8]; - t = f[8]*g[26]+f[26]*g[8]; - y[10] += CONSTANT(0.190188269806999990)*t; - y[28] += CONSTANT(-0.097043558542400002)*t; - - // [8,27]: 25,11,29, - tf = CONSTANT(-0.062641347680800000)*f[25]+CONSTANT(0.141757966609000000)*f[11]+CONSTANT(-0.121034582550000010)*f[29]; - tg = CONSTANT(-0.062641347680800000)*g[25]+CONSTANT(0.141757966609000000)*g[11]+CONSTANT(-0.121034582550000010)*g[29]; - y[8] += tf*g[27]+tg*f[27]; - y[27] += tf*g[8]+tg*f[8]; - t = f[8]*g[27]+f[27]*g[8]; - y[25] += CONSTANT(-0.062641347680800000)*t; - y[11] += CONSTANT(0.141757966609000000)*t; - y[29] += CONSTANT(-0.121034582550000010)*t; - - // [8,32]: 30,12,34, - tf = CONSTANT(-0.191372478254000000)*f[30]+CONSTANT(0.141757966609000000)*f[12]+CONSTANT(-0.097043558538899996)*f[34]; - tg = CONSTANT(-0.191372478254000000)*g[30]+CONSTANT(0.141757966609000000)*g[12]+CONSTANT(-0.097043558538899996)*g[34]; - y[8] += tf*g[32]+tg*f[32]; - y[32] += tf*g[8]+tg*f[8]; - t = f[8]*g[32]+f[32]*g[8]; - y[30] += CONSTANT(-0.191372478254000000)*t; - y[12] += CONSTANT(0.141757966609000000)*t; - y[34] += CONSTANT(-0.097043558538899996)*t; - - // [8,33]: 13,31,35, - tf = CONSTANT(0.141757966609000000)*f[13]+CONSTANT(-0.121034582550000010)*f[31]+CONSTANT(-0.062641347680800000)*f[35]; - tg = CONSTANT(0.141757966609000000)*g[13]+CONSTANT(-0.121034582550000010)*g[31]+CONSTANT(-0.062641347680800000)*g[35]; - y[8] += tf*g[33]+tg*f[33]; - y[33] += tf*g[8]+tg*f[8]; - t = 
f[8]*g[33]+f[33]*g[8]; - y[13] += CONSTANT(0.141757966609000000)*t; - y[31] += CONSTANT(-0.121034582550000010)*t; - y[35] += CONSTANT(-0.062641347680800000)*t; - - // [9,9]: 6,0,20, - tf = CONSTANT(-0.210261043508000010)*f[6]+CONSTANT(0.282094791766999970)*f[0]+CONSTANT(0.076934943209800002)*f[20]; - tg = CONSTANT(-0.210261043508000010)*g[6]+CONSTANT(0.282094791766999970)*g[0]+CONSTANT(0.076934943209800002)*g[20]; - y[9] += tf*g[9]+tg*f[9]; - t = f[9]*g[9]; - y[6] += CONSTANT(-0.210261043508000010)*t; - y[0] += CONSTANT(0.282094791766999970)*t; - y[20] += CONSTANT(0.076934943209800002)*t; - - // [9,17]: 2,12,30, - tf = CONSTANT(0.162867503964999990)*f[2]+CONSTANT(-0.203550726872999990)*f[12]+CONSTANT(0.098140130728100003)*f[30]; - tg = CONSTANT(0.162867503964999990)*g[2]+CONSTANT(-0.203550726872999990)*g[12]+CONSTANT(0.098140130728100003)*g[30]; - y[9] += tf*g[17]+tg*f[17]; - y[17] += tf*g[9]+tg*f[9]; - t = f[9]*g[17]+f[17]*g[9]; - y[2] += CONSTANT(0.162867503964999990)*t; - y[12] += CONSTANT(-0.203550726872999990)*t; - y[30] += CONSTANT(0.098140130728100003)*t; - - // [9,18]: 3,13,31,35, - tf = CONSTANT(-0.043528171377799997)*f[3]+CONSTANT(0.133255230519000010)*f[13]+CONSTANT(-0.101584686310000010)*f[31]+CONSTANT(0.098140130731999994)*f[35]; - tg = CONSTANT(-0.043528171377799997)*g[3]+CONSTANT(0.133255230519000010)*g[13]+CONSTANT(-0.101584686310000010)*g[31]+CONSTANT(0.098140130731999994)*g[35]; - y[9] += tf*g[18]+tg*f[18]; - y[18] += tf*g[9]+tg*f[9]; - t = f[9]*g[18]+f[18]*g[9]; - y[3] += CONSTANT(-0.043528171377799997)*t; - y[13] += CONSTANT(0.133255230519000010)*t; - y[31] += CONSTANT(-0.101584686310000010)*t; - y[35] += CONSTANT(0.098140130731999994)*t; - - // [9,19]: 14,32,34, - tf = CONSTANT(-0.099322584600699995)*f[14]+CONSTANT(0.126698363970000010)*f[32]+CONSTANT(0.131668802180999990)*f[34]; - tg = CONSTANT(-0.099322584600699995)*g[14]+CONSTANT(0.126698363970000010)*g[32]+CONSTANT(0.131668802180999990)*g[34]; - y[9] += tf*g[19]+tg*f[19]; - y[19] += tf*g[9]+tg*f[9]; - t = f[9]*g[19]+f[19]*g[9]; - y[14] += CONSTANT(-0.099322584600699995)*t; - y[32] += CONSTANT(0.126698363970000010)*t; - y[34] += CONSTANT(0.131668802180999990)*t; - - // [9,22]: 1,11,25,29, - tf = CONSTANT(-0.043528171378199997)*f[1]+CONSTANT(0.133255230518000010)*f[11]+CONSTANT(-0.098140130732499997)*f[25]+CONSTANT(-0.101584686311000000)*f[29]; - tg = CONSTANT(-0.043528171378199997)*g[1]+CONSTANT(0.133255230518000010)*g[11]+CONSTANT(-0.098140130732499997)*g[25]+CONSTANT(-0.101584686311000000)*g[29]; - y[9] += tf*g[22]+tg*f[22]; - y[22] += tf*g[9]+tg*f[9]; - t = f[9]*g[22]+f[22]*g[9]; - y[1] += CONSTANT(-0.043528171378199997)*t; - y[11] += CONSTANT(0.133255230518000010)*t; - y[25] += CONSTANT(-0.098140130732499997)*t; - y[29] += CONSTANT(-0.101584686311000000)*t; - - // [9,27]: 6,20, - tf = CONSTANT(0.126792179874999990)*f[6]+CONSTANT(-0.196280261464999990)*f[20]; - tg = CONSTANT(0.126792179874999990)*g[6]+CONSTANT(-0.196280261464999990)*g[20]; - y[9] += tf*g[27]+tg*f[27]; - y[27] += tf*g[9]+tg*f[9]; - t = f[9]*g[27]+f[27]*g[9]; - y[6] += CONSTANT(0.126792179874999990)*t; - y[20] += CONSTANT(-0.196280261464999990)*t; - - // [10,10]: 0,20,24, - tf = CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.179514867494000000)*f[20]+CONSTANT(-0.151717754049000010)*f[24]; - tg = CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.179514867494000000)*g[20]+CONSTANT(-0.151717754049000010)*g[24]; - y[10] += tf*g[10]+tg*f[10]; - t = f[10]*g[10]; - y[0] += CONSTANT(0.282094791771999980)*t; - y[20] += CONSTANT(-0.179514867494000000)*t; - 
y[24] += CONSTANT(-0.151717754049000010)*t; - - // [10,16]: 14,32, - tf = CONSTANT(0.151717754044999990)*f[14]+CONSTANT(-0.077413979111300005)*f[32]; - tg = CONSTANT(0.151717754044999990)*g[14]+CONSTANT(-0.077413979111300005)*g[32]; - y[10] += tf*g[16]+tg*f[16]; - y[16] += tf*g[10]+tg*f[10]; - t = f[10]*g[16]+f[16]*g[10]; - y[14] += CONSTANT(0.151717754044999990)*t; - y[32] += CONSTANT(-0.077413979111300005)*t; - - // [10,17]: 13,3,31,35, - tf = CONSTANT(0.067850242288900006)*f[13]+CONSTANT(0.199471140200000010)*f[3]+CONSTANT(-0.113793659091000000)*f[31]+CONSTANT(-0.149911525925999990)*f[35]; - tg = CONSTANT(0.067850242288900006)*g[13]+CONSTANT(0.199471140200000010)*g[3]+CONSTANT(-0.113793659091000000)*g[31]+CONSTANT(-0.149911525925999990)*g[35]; - y[10] += tf*g[17]+tg*f[17]; - y[17] += tf*g[10]+tg*f[10]; - t = f[10]*g[17]+f[17]*g[10]; - y[13] += CONSTANT(0.067850242288900006)*t; - y[3] += CONSTANT(0.199471140200000010)*t; - y[31] += CONSTANT(-0.113793659091000000)*t; - y[35] += CONSTANT(-0.149911525925999990)*t; - - // [10,18]: 12,2,30,34, - tf = CONSTANT(-0.044418410173299998)*f[12]+CONSTANT(0.213243618621000000)*f[2]+CONSTANT(-0.171327458205000000)*f[30]+CONSTANT(-0.101358691177000000)*f[34]; - tg = CONSTANT(-0.044418410173299998)*g[12]+CONSTANT(0.213243618621000000)*g[2]+CONSTANT(-0.171327458205000000)*g[30]+CONSTANT(-0.101358691177000000)*g[34]; - y[10] += tf*g[18]+tg*f[18]; - y[18] += tf*g[10]+tg*f[10]; - t = f[10]*g[18]+f[18]*g[10]; - y[12] += CONSTANT(-0.044418410173299998)*t; - y[2] += CONSTANT(0.213243618621000000)*t; - y[30] += CONSTANT(-0.171327458205000000)*t; - y[34] += CONSTANT(-0.101358691177000000)*t; - - // [10,19]: 3,15,13,31,33, - tf = CONSTANT(-0.075393004386799994)*f[3]+CONSTANT(0.099322584599600000)*f[15]+CONSTANT(0.102579924281000000)*f[13]+CONSTANT(0.097749909976500002)*f[31]+CONSTANT(-0.025339672794100002)*f[33]; - tg = CONSTANT(-0.075393004386799994)*g[3]+CONSTANT(0.099322584599600000)*g[15]+CONSTANT(0.102579924281000000)*g[13]+CONSTANT(0.097749909976500002)*g[31]+CONSTANT(-0.025339672794100002)*g[33]; - y[10] += tf*g[19]+tg*f[19]; - y[19] += tf*g[10]+tg*f[10]; - t = f[10]*g[19]+f[19]*g[10]; - y[3] += CONSTANT(-0.075393004386799994)*t; - y[15] += CONSTANT(0.099322584599600000)*t; - y[13] += CONSTANT(0.102579924281000000)*t; - y[31] += CONSTANT(0.097749909976500002)*t; - y[33] += CONSTANT(-0.025339672794100002)*t; - - // [10,21]: 11,1,9,27,29, - tf = CONSTANT(0.102579924281000000)*f[11]+CONSTANT(-0.075393004386799994)*f[1]+CONSTANT(-0.099322584599600000)*f[9]+CONSTANT(0.025339672794100002)*f[27]+CONSTANT(0.097749909976500002)*f[29]; - tg = CONSTANT(0.102579924281000000)*g[11]+CONSTANT(-0.075393004386799994)*g[1]+CONSTANT(-0.099322584599600000)*g[9]+CONSTANT(0.025339672794100002)*g[27]+CONSTANT(0.097749909976500002)*g[29]; - y[10] += tf*g[21]+tg*f[21]; - y[21] += tf*g[10]+tg*f[10]; - t = f[10]*g[21]+f[21]*g[10]; - y[11] += CONSTANT(0.102579924281000000)*t; - y[1] += CONSTANT(-0.075393004386799994)*t; - y[9] += CONSTANT(-0.099322584599600000)*t; - y[27] += CONSTANT(0.025339672794100002)*t; - y[29] += CONSTANT(0.097749909976500002)*t; - - // [10,23]: 11,1,25,29, - tf = CONSTANT(-0.067850242288900006)*f[11]+CONSTANT(-0.199471140200000010)*f[1]+CONSTANT(0.149911525925999990)*f[25]+CONSTANT(0.113793659091000000)*f[29]; - tg = CONSTANT(-0.067850242288900006)*g[11]+CONSTANT(-0.199471140200000010)*g[1]+CONSTANT(0.149911525925999990)*g[25]+CONSTANT(0.113793659091000000)*g[29]; - y[10] += tf*g[23]+tg*f[23]; - y[23] += tf*g[10]+tg*f[10]; - t = f[10]*g[23]+f[23]*g[10]; - y[11] 
+= CONSTANT(-0.067850242288900006)*t; - y[1] += CONSTANT(-0.199471140200000010)*t; - y[25] += CONSTANT(0.149911525925999990)*t; - y[29] += CONSTANT(0.113793659091000000)*t; - - // [10,28]: 6,20,24, - tf = CONSTANT(0.190188269814000000)*f[6]+CONSTANT(-0.065426753820500005)*f[20]+CONSTANT(0.077413979109600004)*f[24]; - tg = CONSTANT(0.190188269814000000)*g[6]+CONSTANT(-0.065426753820500005)*g[20]+CONSTANT(0.077413979109600004)*g[24]; - y[10] += tf*g[28]+tg*f[28]; - y[28] += tf*g[10]+tg*f[10]; - t = f[10]*g[28]+f[28]*g[10]; - y[6] += CONSTANT(0.190188269814000000)*t; - y[20] += CONSTANT(-0.065426753820500005)*t; - y[24] += CONSTANT(0.077413979109600004)*t; - - // [11,11]: 0,6,8,20,22, - tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.126156626101000010)*f[6]+CONSTANT(-0.145673124078999990)*f[8]+CONSTANT(0.025644981070299999)*f[20]+CONSTANT(-0.114687841910000000)*f[22]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.126156626101000010)*g[6]+CONSTANT(-0.145673124078999990)*g[8]+CONSTANT(0.025644981070299999)*g[20]+CONSTANT(-0.114687841910000000)*g[22]; - y[11] += tf*g[11]+tg*f[11]; - t = f[11]*g[11]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[6] += CONSTANT(0.126156626101000010)*t; - y[8] += CONSTANT(-0.145673124078999990)*t; - y[20] += CONSTANT(0.025644981070299999)*t; - y[22] += CONSTANT(-0.114687841910000000)*t; - - // [11,16]: 15,33,35, - tf = CONSTANT(-0.117520066953000000)*f[15]+CONSTANT(0.119929220739999990)*f[33]+CONSTANT(0.134084945035999990)*f[35]; - tg = CONSTANT(-0.117520066953000000)*g[15]+CONSTANT(0.119929220739999990)*g[33]+CONSTANT(0.134084945035999990)*g[35]; - y[11] += tf*g[16]+tg*f[16]; - y[16] += tf*g[11]+tg*f[11]; - t = f[11]*g[16]+f[16]*g[11]; - y[15] += CONSTANT(-0.117520066953000000)*t; - y[33] += CONSTANT(0.119929220739999990)*t; - y[35] += CONSTANT(0.134084945035999990)*t; - - // [11,18]: 3,13,15,31,33, - tf = CONSTANT(0.168583882834000000)*f[3]+CONSTANT(0.114687841909000000)*f[13]+CONSTANT(-0.133255230519000010)*f[15]+CONSTANT(0.075189952564900006)*f[31]+CONSTANT(-0.101990215611000000)*f[33]; - tg = CONSTANT(0.168583882834000000)*g[3]+CONSTANT(0.114687841909000000)*g[13]+CONSTANT(-0.133255230519000010)*g[15]+CONSTANT(0.075189952564900006)*g[31]+CONSTANT(-0.101990215611000000)*g[33]; - y[11] += tf*g[18]+tg*f[18]; - y[18] += tf*g[11]+tg*f[11]; - t = f[11]*g[18]+f[18]*g[11]; - y[3] += CONSTANT(0.168583882834000000)*t; - y[13] += CONSTANT(0.114687841909000000)*t; - y[15] += CONSTANT(-0.133255230519000010)*t; - y[31] += CONSTANT(0.075189952564900006)*t; - y[33] += CONSTANT(-0.101990215611000000)*t; - - // [11,19]: 2,14,12,30,32, - tf = CONSTANT(0.238413613504000000)*f[2]+CONSTANT(-0.102579924282000000)*f[14]+CONSTANT(0.099322584599300004)*f[12]+CONSTANT(0.009577496073830001)*f[30]+CONSTANT(-0.104682806112000000)*f[32]; - tg = CONSTANT(0.238413613504000000)*g[2]+CONSTANT(-0.102579924282000000)*g[14]+CONSTANT(0.099322584599300004)*g[12]+CONSTANT(0.009577496073830001)*g[30]+CONSTANT(-0.104682806112000000)*g[32]; - y[11] += tf*g[19]+tg*f[19]; - y[19] += tf*g[11]+tg*f[11]; - t = f[11]*g[19]+f[19]*g[11]; - y[2] += CONSTANT(0.238413613504000000)*t; - y[14] += CONSTANT(-0.102579924282000000)*t; - y[12] += CONSTANT(0.099322584599300004)*t; - y[30] += CONSTANT(0.009577496073830001)*t; - y[32] += CONSTANT(-0.104682806112000000)*t; - - // [11,24]: 9,25,27, - tf = CONSTANT(0.117520066950999990)*f[9]+CONSTANT(-0.134084945037000000)*f[25]+CONSTANT(-0.119929220742000010)*f[27]; - tg = 
CONSTANT(0.117520066950999990)*g[9]+CONSTANT(-0.134084945037000000)*g[25]+CONSTANT(-0.119929220742000010)*g[27]; - y[11] += tf*g[24]+tg*f[24]; - y[24] += tf*g[11]+tg*f[11]; - t = f[11]*g[24]+f[24]*g[11]; - y[9] += CONSTANT(0.117520066950999990)*t; - y[25] += CONSTANT(-0.134084945037000000)*t; - y[27] += CONSTANT(-0.119929220742000010)*t; - - // [11,29]: 6,20,22,8, - tf = CONSTANT(0.227318461243000010)*f[6]+CONSTANT(0.086019920779800002)*f[20]+CONSTANT(-0.075189952565200002)*f[22]+CONSTANT(0.065621187395299999)*f[8]; - tg = CONSTANT(0.227318461243000010)*g[6]+CONSTANT(0.086019920779800002)*g[20]+CONSTANT(-0.075189952565200002)*g[22]+CONSTANT(0.065621187395299999)*g[8]; - y[11] += tf*g[29]+tg*f[29]; - y[29] += tf*g[11]+tg*f[11]; - t = f[11]*g[29]+f[29]*g[11]; - y[6] += CONSTANT(0.227318461243000010)*t; - y[20] += CONSTANT(0.086019920779800002)*t; - y[22] += CONSTANT(-0.075189952565200002)*t; - y[8] += CONSTANT(0.065621187395299999)*t; - - // [12,12]: 0,6,20, - tf = CONSTANT(0.282094799871999980)*f[0]+CONSTANT(0.168208852954000010)*f[6]+CONSTANT(0.153869910786000010)*f[20]; - tg = CONSTANT(0.282094799871999980)*g[0]+CONSTANT(0.168208852954000010)*g[6]+CONSTANT(0.153869910786000010)*g[20]; - y[12] += tf*g[12]+tg*f[12]; - t = f[12]*g[12]; - y[0] += CONSTANT(0.282094799871999980)*t; - y[6] += CONSTANT(0.168208852954000010)*t; - y[20] += CONSTANT(0.153869910786000010)*t; - - // [12,30]: 20,6, - tf = CONSTANT(0.148373961712999990)*f[20]+CONSTANT(0.239614719999000000)*f[6]; - tg = CONSTANT(0.148373961712999990)*g[20]+CONSTANT(0.239614719999000000)*g[6]; - y[12] += tf*g[30]+tg*f[30]; - y[30] += tf*g[12]+tg*f[12]; - t = f[12]*g[30]+f[30]*g[12]; - y[20] += CONSTANT(0.148373961712999990)*t; - y[6] += CONSTANT(0.239614719999000000)*t; - - // [13,13]: 0,8,6,20,22, - tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.145673124078999990)*f[8]+CONSTANT(0.126156626101000010)*f[6]+CONSTANT(0.025644981070299999)*f[20]+CONSTANT(0.114687841910000000)*f[22]; - tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.145673124078999990)*g[8]+CONSTANT(0.126156626101000010)*g[6]+CONSTANT(0.025644981070299999)*g[20]+CONSTANT(0.114687841910000000)*g[22]; - y[13] += tf*g[13]+tg*f[13]; - t = f[13]*g[13]; - y[0] += CONSTANT(0.282094791773999990)*t; - y[8] += CONSTANT(0.145673124078999990)*t; - y[6] += CONSTANT(0.126156626101000010)*t; - y[20] += CONSTANT(0.025644981070299999)*t; - y[22] += CONSTANT(0.114687841910000000)*t; - - // [13,16]: 9,25,27, - tf = CONSTANT(-0.117520066953000000)*f[9]+CONSTANT(-0.134084945035999990)*f[25]+CONSTANT(0.119929220739999990)*f[27]; - tg = CONSTANT(-0.117520066953000000)*g[9]+CONSTANT(-0.134084945035999990)*g[25]+CONSTANT(0.119929220739999990)*g[27]; - y[13] += tf*g[16]+tg*f[16]; - y[16] += tf*g[13]+tg*f[13]; - t = f[13]*g[16]+f[16]*g[13]; - y[9] += CONSTANT(-0.117520066953000000)*t; - y[25] += CONSTANT(-0.134084945035999990)*t; - y[27] += CONSTANT(0.119929220739999990)*t; - - // [13,21]: 2,12,14,30,32, - tf = CONSTANT(0.238413613504000000)*f[2]+CONSTANT(0.099322584599300004)*f[12]+CONSTANT(0.102579924282000000)*f[14]+CONSTANT(0.009577496073830001)*f[30]+CONSTANT(0.104682806112000000)*f[32]; - tg = CONSTANT(0.238413613504000000)*g[2]+CONSTANT(0.099322584599300004)*g[12]+CONSTANT(0.102579924282000000)*g[14]+CONSTANT(0.009577496073830001)*g[30]+CONSTANT(0.104682806112000000)*g[32]; - y[13] += tf*g[21]+tg*f[21]; - y[21] += tf*g[13]+tg*f[13]; - t = f[13]*g[21]+f[21]*g[13]; - y[2] += CONSTANT(0.238413613504000000)*t; - y[12] += CONSTANT(0.099322584599300004)*t; - y[14] += 
CONSTANT(0.102579924282000000)*t; - y[30] += CONSTANT(0.009577496073830001)*t; - y[32] += CONSTANT(0.104682806112000000)*t; - - // [13,24]: 15,33,35, - tf = CONSTANT(-0.117520066950999990)*f[15]+CONSTANT(0.119929220742000010)*f[33]+CONSTANT(-0.134084945037000000)*f[35]; - tg = CONSTANT(-0.117520066950999990)*g[15]+CONSTANT(0.119929220742000010)*g[33]+CONSTANT(-0.134084945037000000)*g[35]; - y[13] += tf*g[24]+tg*f[24]; - y[24] += tf*g[13]+tg*f[13]; - t = f[13]*g[24]+f[24]*g[13]; - y[15] += CONSTANT(-0.117520066950999990)*t; - y[33] += CONSTANT(0.119929220742000010)*t; - y[35] += CONSTANT(-0.134084945037000000)*t; - - // [13,31]: 6,22,20,8, - tf = CONSTANT(0.227318461243000010)*f[6]+CONSTANT(0.075189952565200002)*f[22]+CONSTANT(0.086019920779800002)*f[20]+CONSTANT(-0.065621187395299999)*f[8]; - tg = CONSTANT(0.227318461243000010)*g[6]+CONSTANT(0.075189952565200002)*g[22]+CONSTANT(0.086019920779800002)*g[20]+CONSTANT(-0.065621187395299999)*g[8]; - y[13] += tf*g[31]+tg*f[31]; - y[31] += tf*g[13]+tg*f[13]; - t = f[13]*g[31]+f[31]*g[13]; - y[6] += CONSTANT(0.227318461243000010)*t; - y[22] += CONSTANT(0.075189952565200002)*t; - y[20] += CONSTANT(0.086019920779800002)*t; - y[8] += CONSTANT(-0.065621187395299999)*t; - - // [14,14]: 0,20,24, - tf = CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.179514867494000000)*f[20]+CONSTANT(0.151717754049000010)*f[24]; - tg = CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.179514867494000000)*g[20]+CONSTANT(0.151717754049000010)*g[24]; - y[14] += tf*g[14]+tg*f[14]; - t = f[14]*g[14]; - y[0] += CONSTANT(0.282094791771999980)*t; - y[20] += CONSTANT(-0.179514867494000000)*t; - y[24] += CONSTANT(0.151717754049000010)*t; - - // [14,17]: 11,1,25,29, - tf = CONSTANT(0.067850242288500007)*f[11]+CONSTANT(0.199471140196999990)*f[1]+CONSTANT(0.149911525925999990)*f[25]+CONSTANT(-0.113793659092000000)*f[29]; - tg = CONSTANT(0.067850242288500007)*g[11]+CONSTANT(0.199471140196999990)*g[1]+CONSTANT(0.149911525925999990)*g[25]+CONSTANT(-0.113793659092000000)*g[29]; - y[14] += tf*g[17]+tg*f[17]; - y[17] += tf*g[14]+tg*f[14]; - t = f[14]*g[17]+f[17]*g[14]; - y[11] += CONSTANT(0.067850242288500007)*t; - y[1] += CONSTANT(0.199471140196999990)*t; - y[25] += CONSTANT(0.149911525925999990)*t; - y[29] += CONSTANT(-0.113793659092000000)*t; - - // [14,22]: 12,2,30,34, - tf = CONSTANT(-0.044418410173299998)*f[12]+CONSTANT(0.213243618621000000)*f[2]+CONSTANT(-0.171327458205000000)*f[30]+CONSTANT(0.101358691177000000)*f[34]; - tg = CONSTANT(-0.044418410173299998)*g[12]+CONSTANT(0.213243618621000000)*g[2]+CONSTANT(-0.171327458205000000)*g[30]+CONSTANT(0.101358691177000000)*g[34]; - y[14] += tf*g[22]+tg*f[22]; - y[22] += tf*g[14]+tg*f[14]; - t = f[14]*g[22]+f[22]*g[14]; - y[12] += CONSTANT(-0.044418410173299998)*t; - y[2] += CONSTANT(0.213243618621000000)*t; - y[30] += CONSTANT(-0.171327458205000000)*t; - y[34] += CONSTANT(0.101358691177000000)*t; - - // [14,23]: 13,3,31,35, - tf = CONSTANT(0.067850242288500007)*f[13]+CONSTANT(0.199471140196999990)*f[3]+CONSTANT(-0.113793659092000000)*f[31]+CONSTANT(0.149911525925999990)*f[35]; - tg = CONSTANT(0.067850242288500007)*g[13]+CONSTANT(0.199471140196999990)*g[3]+CONSTANT(-0.113793659092000000)*g[31]+CONSTANT(0.149911525925999990)*g[35]; - y[14] += tf*g[23]+tg*f[23]; - y[23] += tf*g[14]+tg*f[14]; - t = f[14]*g[23]+f[23]*g[14]; - y[13] += CONSTANT(0.067850242288500007)*t; - y[3] += CONSTANT(0.199471140196999990)*t; - y[31] += CONSTANT(-0.113793659092000000)*t; - y[35] += CONSTANT(0.149911525925999990)*t; - - // [14,32]: 20,6,24, - tf = 
CONSTANT(-0.065426753820500005)*f[20]+CONSTANT(0.190188269814000000)*f[6]+CONSTANT(-0.077413979109600004)*f[24]; - tg = CONSTANT(-0.065426753820500005)*g[20]+CONSTANT(0.190188269814000000)*g[6]+CONSTANT(-0.077413979109600004)*g[24]; - y[14] += tf*g[32]+tg*f[32]; - y[32] += tf*g[14]+tg*f[14]; - t = f[14]*g[32]+f[32]*g[14]; - y[20] += CONSTANT(-0.065426753820500005)*t; - y[6] += CONSTANT(0.190188269814000000)*t; - y[24] += CONSTANT(-0.077413979109600004)*t; - - // [15,15]: 0,6,20, - tf = CONSTANT(0.282094791766999970)*f[0]+CONSTANT(-0.210261043508000010)*f[6]+CONSTANT(0.076934943209800002)*f[20]; - tg = CONSTANT(0.282094791766999970)*g[0]+CONSTANT(-0.210261043508000010)*g[6]+CONSTANT(0.076934943209800002)*g[20]; - y[15] += tf*g[15]+tg*f[15]; - t = f[15]*g[15]; - y[0] += CONSTANT(0.282094791766999970)*t; - y[6] += CONSTANT(-0.210261043508000010)*t; - y[20] += CONSTANT(0.076934943209800002)*t; - - // [15,21]: 14,32,34, - tf = CONSTANT(-0.099322584600699995)*f[14]+CONSTANT(0.126698363970000010)*f[32]+CONSTANT(-0.131668802180999990)*f[34]; - tg = CONSTANT(-0.099322584600699995)*g[14]+CONSTANT(0.126698363970000010)*g[32]+CONSTANT(-0.131668802180999990)*g[34]; - y[15] += tf*g[21]+tg*f[21]; - y[21] += tf*g[15]+tg*f[15]; - t = f[15]*g[21]+f[21]*g[15]; - y[14] += CONSTANT(-0.099322584600699995)*t; - y[32] += CONSTANT(0.126698363970000010)*t; - y[34] += CONSTANT(-0.131668802180999990)*t; - - // [15,22]: 13,3,31,35, - tf = CONSTANT(0.133255230518000010)*f[13]+CONSTANT(-0.043528171378199997)*f[3]+CONSTANT(-0.101584686311000000)*f[31]+CONSTANT(-0.098140130732499997)*f[35]; - tg = CONSTANT(0.133255230518000010)*g[13]+CONSTANT(-0.043528171378199997)*g[3]+CONSTANT(-0.101584686311000000)*g[31]+CONSTANT(-0.098140130732499997)*g[35]; - y[15] += tf*g[22]+tg*f[22]; - y[22] += tf*g[15]+tg*f[15]; - t = f[15]*g[22]+f[22]*g[15]; - y[13] += CONSTANT(0.133255230518000010)*t; - y[3] += CONSTANT(-0.043528171378199997)*t; - y[31] += CONSTANT(-0.101584686311000000)*t; - y[35] += CONSTANT(-0.098140130732499997)*t; - - // [15,23]: 12,2,30, - tf = CONSTANT(-0.203550726872999990)*f[12]+CONSTANT(0.162867503964999990)*f[2]+CONSTANT(0.098140130728100003)*f[30]; - tg = CONSTANT(-0.203550726872999990)*g[12]+CONSTANT(0.162867503964999990)*g[2]+CONSTANT(0.098140130728100003)*g[30]; - y[15] += tf*g[23]+tg*f[23]; - y[23] += tf*g[15]+tg*f[15]; - t = f[15]*g[23]+f[23]*g[15]; - y[12] += CONSTANT(-0.203550726872999990)*t; - y[2] += CONSTANT(0.162867503964999990)*t; - y[30] += CONSTANT(0.098140130728100003)*t; - - // [15,33]: 6,20, - tf = CONSTANT(0.126792179874999990)*f[6]+CONSTANT(-0.196280261464999990)*f[20]; - tg = CONSTANT(0.126792179874999990)*g[6]+CONSTANT(-0.196280261464999990)*g[20]; - y[15] += tf*g[33]+tg*f[33]; - y[33] += tf*g[15]+tg*f[15]; - t = f[15]*g[33]+f[33]*g[15]; - y[6] += CONSTANT(0.126792179874999990)*t; - y[20] += CONSTANT(-0.196280261464999990)*t; - - // [16,16]: 0,6,20, - tf = CONSTANT(0.282094791763999990)*f[0]+CONSTANT(-0.229375683829000000)*f[6]+CONSTANT(0.106525305981000000)*f[20]; - tg = CONSTANT(0.282094791763999990)*g[0]+CONSTANT(-0.229375683829000000)*g[6]+CONSTANT(0.106525305981000000)*g[20]; - y[16] += tf*g[16]+tg*f[16]; - t = f[16]*g[16]; - y[0] += CONSTANT(0.282094791763999990)*t; - y[6] += CONSTANT(-0.229375683829000000)*t; - y[20] += CONSTANT(0.106525305981000000)*t; - - // [16,18]: 8,22, - tf = CONSTANT(-0.075080816693699995)*f[8]+CONSTANT(0.135045473380000000)*f[22]; - tg = CONSTANT(-0.075080816693699995)*g[8]+CONSTANT(0.135045473380000000)*g[22]; - y[16] += tf*g[18]+tg*f[18]; - y[18] += 
tf*g[16]+tg*f[16]; - t = f[16]*g[18]+f[18]*g[16]; - y[8] += CONSTANT(-0.075080816693699995)*t; - y[22] += CONSTANT(0.135045473380000000)*t; - - // [16,23]: 19,5, - tf = CONSTANT(-0.119098912754999990)*f[19]+CONSTANT(0.140463346187999990)*f[5]; - tg = CONSTANT(-0.119098912754999990)*g[19]+CONSTANT(0.140463346187999990)*g[5]; - y[16] += tf*g[23]+tg*f[23]; - y[23] += tf*g[16]+tg*f[16]; - t = f[16]*g[23]+f[23]*g[16]; - y[19] += CONSTANT(-0.119098912754999990)*t; - y[5] += CONSTANT(0.140463346187999990)*t; - - // [16,26]: 12,2,30, - tf = CONSTANT(-0.207723503645000000)*f[12]+CONSTANT(0.147319200325000010)*f[2]+CONSTANT(0.130197596199999990)*f[30]; - tg = CONSTANT(-0.207723503645000000)*g[12]+CONSTANT(0.147319200325000010)*g[2]+CONSTANT(0.130197596199999990)*g[30]; - y[16] += tf*g[26]+tg*f[26]; - y[26] += tf*g[16]+tg*f[16]; - t = f[16]*g[26]+f[26]*g[16]; - y[12] += CONSTANT(-0.207723503645000000)*t; - y[2] += CONSTANT(0.147319200325000010)*t; - y[30] += CONSTANT(0.130197596199999990)*t; - - // [16,28]: 14,32, - tf = CONSTANT(-0.077413979111300005)*f[14]+CONSTANT(0.128376561115000010)*f[32]; - tg = CONSTANT(-0.077413979111300005)*g[14]+CONSTANT(0.128376561115000010)*g[32]; - y[16] += tf*g[28]+tg*f[28]; - y[28] += tf*g[16]+tg*f[16]; - t = f[16]*g[28]+f[28]*g[16]; - y[14] += CONSTANT(-0.077413979111300005)*t; - y[32] += CONSTANT(0.128376561115000010)*t; - - // [16,29]: 15,33,35, - tf = CONSTANT(0.035835708931099997)*f[15]+CONSTANT(-0.118853600623999990)*f[33]+CONSTANT(-0.053152946071899999)*f[35]; - tg = CONSTANT(0.035835708931099997)*g[15]+CONSTANT(-0.118853600623999990)*g[33]+CONSTANT(-0.053152946071899999)*g[35]; - y[16] += tf*g[29]+tg*f[29]; - y[29] += tf*g[16]+tg*f[16]; - t = f[16]*g[29]+f[29]*g[16]; - y[15] += CONSTANT(0.035835708931099997)*t; - y[33] += CONSTANT(-0.118853600623999990)*t; - y[35] += CONSTANT(-0.053152946071899999)*t; - - // [16,31]: 27,9,25, - tf = CONSTANT(-0.118853600623999990)*f[27]+CONSTANT(0.035835708931099997)*f[9]+CONSTANT(0.053152946071899999)*f[25]; - tg = CONSTANT(-0.118853600623999990)*g[27]+CONSTANT(0.035835708931099997)*g[9]+CONSTANT(0.053152946071899999)*g[25]; - y[16] += tf*g[31]+tg*f[31]; - y[31] += tf*g[16]+tg*f[16]; - t = f[16]*g[31]+f[31]*g[16]; - y[27] += CONSTANT(-0.118853600623999990)*t; - y[9] += CONSTANT(0.035835708931099997)*t; - y[25] += CONSTANT(0.053152946071899999)*t; - - // [17,17]: 0,6,20, - tf = CONSTANT(0.282094791768999990)*f[0]+CONSTANT(-0.057343920955899998)*f[6]+CONSTANT(-0.159787958979000000)*f[20]; - tg = CONSTANT(0.282094791768999990)*g[0]+CONSTANT(-0.057343920955899998)*g[6]+CONSTANT(-0.159787958979000000)*g[20]; - y[17] += tf*g[17]+tg*f[17]; - t = f[17]*g[17]; - y[0] += CONSTANT(0.282094791768999990)*t; - y[6] += CONSTANT(-0.057343920955899998)*t; - y[20] += CONSTANT(-0.159787958979000000)*t; - - // [17,19]: 8,22,24, - tf = CONSTANT(-0.112621225039000000)*f[8]+CONSTANT(0.045015157794100001)*f[22]+CONSTANT(0.119098912753000000)*f[24]; - tg = CONSTANT(-0.112621225039000000)*g[8]+CONSTANT(0.045015157794100001)*g[22]+CONSTANT(0.119098912753000000)*g[24]; - y[17] += tf*g[19]+tg*f[19]; - y[19] += tf*g[17]+tg*f[17]; - t = f[17]*g[19]+f[19]*g[17]; - y[8] += CONSTANT(-0.112621225039000000)*t; - y[22] += CONSTANT(0.045015157794100001)*t; - y[24] += CONSTANT(0.119098912753000000)*t; - - // [17,21]: 16,4,18, - tf = CONSTANT(-0.119098912754999990)*f[16]+CONSTANT(-0.112621225039000000)*f[4]+CONSTANT(0.045015157794399997)*f[18]; - tg = CONSTANT(-0.119098912754999990)*g[16]+CONSTANT(-0.112621225039000000)*g[4]+CONSTANT(0.045015157794399997)*g[18]; - 
y[17] += tf*g[21]+tg*f[21]; - y[21] += tf*g[17]+tg*f[17]; - t = f[17]*g[21]+f[21]*g[17]; - y[16] += CONSTANT(-0.119098912754999990)*t; - y[4] += CONSTANT(-0.112621225039000000)*t; - y[18] += CONSTANT(0.045015157794399997)*t; - - // [17,26]: 3,13,31, - tf = CONSTANT(0.208340811096000000)*f[3]+CONSTANT(0.029982305185199998)*f[13]+CONSTANT(-0.118853600623999990)*f[31]; - tg = CONSTANT(0.208340811096000000)*g[3]+CONSTANT(0.029982305185199998)*g[13]+CONSTANT(-0.118853600623999990)*g[31]; - y[17] += tf*g[26]+tg*f[26]; - y[26] += tf*g[17]+tg*f[17]; - t = f[17]*g[26]+f[26]*g[17]; - y[3] += CONSTANT(0.208340811096000000)*t; - y[13] += CONSTANT(0.029982305185199998)*t; - y[31] += CONSTANT(-0.118853600623999990)*t; - - // [17,27]: 12,2,30, - tf = CONSTANT(-0.103861751821000010)*f[12]+CONSTANT(0.196425600433000000)*f[2]+CONSTANT(-0.130197596204999990)*f[30]; - tg = CONSTANT(-0.103861751821000010)*g[12]+CONSTANT(0.196425600433000000)*g[2]+CONSTANT(-0.130197596204999990)*g[30]; - y[17] += tf*g[27]+tg*f[27]; - y[27] += tf*g[17]+tg*f[17]; - t = f[17]*g[27]+f[27]*g[17]; - y[12] += CONSTANT(-0.103861751821000010)*t; - y[2] += CONSTANT(0.196425600433000000)*t; - y[30] += CONSTANT(-0.130197596204999990)*t; - - // [17,28]: 13,3,31,35, - tf = CONSTANT(0.121172043789000000)*f[13]+CONSTANT(-0.060142811686500000)*f[3]+CONSTANT(0.034310079156700000)*f[31]+CONSTANT(0.099440056652200001)*f[35]; - tg = CONSTANT(0.121172043789000000)*g[13]+CONSTANT(-0.060142811686500000)*g[3]+CONSTANT(0.034310079156700000)*g[31]+CONSTANT(0.099440056652200001)*g[35]; - y[17] += tf*g[28]+tg*f[28]; - y[28] += tf*g[17]+tg*f[17]; - t = f[17]*g[28]+f[28]*g[17]; - y[13] += CONSTANT(0.121172043789000000)*t; - y[3] += CONSTANT(-0.060142811686500000)*t; - y[31] += CONSTANT(0.034310079156700000)*t; - y[35] += CONSTANT(0.099440056652200001)*t; - - // [17,32]: 11,1,25,29, - tf = CONSTANT(0.121172043788000010)*f[11]+CONSTANT(-0.060142811686900000)*f[1]+CONSTANT(-0.099440056652700004)*f[25]+CONSTANT(0.034310079156599997)*f[29]; - tg = CONSTANT(0.121172043788000010)*g[11]+CONSTANT(-0.060142811686900000)*g[1]+CONSTANT(-0.099440056652700004)*g[25]+CONSTANT(0.034310079156599997)*g[29]; - y[17] += tf*g[32]+tg*f[32]; - y[32] += tf*g[17]+tg*f[17]; - t = f[17]*g[32]+f[32]*g[17]; - y[11] += CONSTANT(0.121172043788000010)*t; - y[1] += CONSTANT(-0.060142811686900000)*t; - y[25] += CONSTANT(-0.099440056652700004)*t; - y[29] += CONSTANT(0.034310079156599997)*t; - - // [17,34]: 29,11,1, - tf = CONSTANT(0.118853600623000000)*f[29]+CONSTANT(-0.029982305185400002)*f[11]+CONSTANT(-0.208340811100000000)*f[1]; - tg = CONSTANT(0.118853600623000000)*g[29]+CONSTANT(-0.029982305185400002)*g[11]+CONSTANT(-0.208340811100000000)*g[1]; - y[17] += tf*g[34]+tg*f[34]; - y[34] += tf*g[17]+tg*f[17]; - t = f[17]*g[34]+f[34]*g[17]; - y[29] += CONSTANT(0.118853600623000000)*t; - y[11] += CONSTANT(-0.029982305185400002)*t; - y[1] += CONSTANT(-0.208340811100000000)*t; - - // [18,18]: 6,0,20,24, - tf = CONSTANT(0.065535909662600006)*f[6]+CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.083698454702400005)*f[20]+CONSTANT(-0.135045473384000000)*f[24]; - tg = CONSTANT(0.065535909662600006)*g[6]+CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.083698454702400005)*g[20]+CONSTANT(-0.135045473384000000)*g[24]; - y[18] += tf*g[18]+tg*f[18]; - t = f[18]*g[18]; - y[6] += CONSTANT(0.065535909662600006)*t; - y[0] += CONSTANT(0.282094791771999980)*t; - y[20] += CONSTANT(-0.083698454702400005)*t; - y[24] += CONSTANT(-0.135045473384000000)*t; - - // [18,19]: 7,21,23, - tf = 
CONSTANT(0.090297865407399994)*f[7]+CONSTANT(0.102084782359000000)*f[21]+CONSTANT(-0.045015157794399997)*f[23]; - tg = CONSTANT(0.090297865407399994)*g[7]+CONSTANT(0.102084782359000000)*g[21]+CONSTANT(-0.045015157794399997)*g[23]; - y[18] += tf*g[19]+tg*f[19]; - y[19] += tf*g[18]+tg*f[18]; - t = f[18]*g[19]+f[19]*g[18]; - y[7] += CONSTANT(0.090297865407399994)*t; - y[21] += CONSTANT(0.102084782359000000)*t; - y[23] += CONSTANT(-0.045015157794399997)*t; - - // [18,25]: 15,33, - tf = CONSTANT(-0.098140130731999994)*f[15]+CONSTANT(0.130197596202000000)*f[33]; - tg = CONSTANT(-0.098140130731999994)*g[15]+CONSTANT(0.130197596202000000)*g[33]; - y[18] += tf*g[25]+tg*f[25]; - y[25] += tf*g[18]+tg*f[18]; - t = f[18]*g[25]+f[25]*g[18]; - y[15] += CONSTANT(-0.098140130731999994)*t; - y[33] += CONSTANT(0.130197596202000000)*t; - - // [18,26]: 14,32, - tf = CONSTANT(0.101358691174000000)*f[14]+CONSTANT(0.084042186965900004)*f[32]; - tg = CONSTANT(0.101358691174000000)*g[14]+CONSTANT(0.084042186965900004)*g[32]; - y[18] += tf*g[26]+tg*f[26]; - y[26] += tf*g[18]+tg*f[18]; - t = f[18]*g[26]+f[26]*g[18]; - y[14] += CONSTANT(0.101358691174000000)*t; - y[32] += CONSTANT(0.084042186965900004)*t; - - // [18,27]: 13,3,35, - tf = CONSTANT(0.101990215611000000)*f[13]+CONSTANT(0.183739324705999990)*f[3]+CONSTANT(-0.130197596202000000)*f[35]; - tg = CONSTANT(0.101990215611000000)*g[13]+CONSTANT(0.183739324705999990)*g[3]+CONSTANT(-0.130197596202000000)*g[35]; - y[18] += tf*g[27]+tg*f[27]; - y[27] += tf*g[18]+tg*f[18]; - t = f[18]*g[27]+f[27]*g[18]; - y[13] += CONSTANT(0.101990215611000000)*t; - y[3] += CONSTANT(0.183739324705999990)*t; - y[35] += CONSTANT(-0.130197596202000000)*t; - - // [18,28]: 2,12,30,34, - tf = CONSTANT(0.225033795606000010)*f[2]+CONSTANT(0.022664492358099999)*f[12]+CONSTANT(-0.099440056651100006)*f[30]+CONSTANT(-0.084042186968800003)*f[34]; - tg = CONSTANT(0.225033795606000010)*g[2]+CONSTANT(0.022664492358099999)*g[12]+CONSTANT(-0.099440056651100006)*g[30]+CONSTANT(-0.084042186968800003)*g[34]; - y[18] += tf*g[28]+tg*f[28]; - y[28] += tf*g[18]+tg*f[18]; - t = f[18]*g[28]+f[28]*g[18]; - y[2] += CONSTANT(0.225033795606000010)*t; - y[12] += CONSTANT(0.022664492358099999)*t; - y[30] += CONSTANT(-0.099440056651100006)*t; - y[34] += CONSTANT(-0.084042186968800003)*t; - - // [18,29]: 3,13,15,31, - tf = CONSTANT(-0.085054779966799998)*f[3]+CONSTANT(0.075189952564900006)*f[13]+CONSTANT(0.101584686310000010)*f[15]+CONSTANT(0.097043558538999999)*f[31]; - tg = CONSTANT(-0.085054779966799998)*g[3]+CONSTANT(0.075189952564900006)*g[13]+CONSTANT(0.101584686310000010)*g[15]+CONSTANT(0.097043558538999999)*g[31]; - y[18] += tf*g[29]+tg*f[29]; - y[29] += tf*g[18]+tg*f[18]; - t = f[18]*g[29]+f[29]*g[18]; - y[3] += CONSTANT(-0.085054779966799998)*t; - y[13] += CONSTANT(0.075189952564900006)*t; - y[15] += CONSTANT(0.101584686310000010)*t; - y[31] += CONSTANT(0.097043558538999999)*t; - - // [19,19]: 6,8,0,20,22, - tf = CONSTANT(0.139263808033999990)*f[6]+CONSTANT(-0.141889406570999990)*f[8]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.068480553847200004)*f[20]+CONSTANT(-0.102084782360000000)*f[22]; - tg = CONSTANT(0.139263808033999990)*g[6]+CONSTANT(-0.141889406570999990)*g[8]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.068480553847200004)*g[20]+CONSTANT(-0.102084782360000000)*g[22]; - y[19] += tf*g[19]+tg*f[19]; - t = f[19]*g[19]; - y[6] += CONSTANT(0.139263808033999990)*t; - y[8] += CONSTANT(-0.141889406570999990)*t; - y[0] += CONSTANT(0.282094791773999990)*t; - y[20] += CONSTANT(0.068480553847200004)*t; - 
y[22] += CONSTANT(-0.102084782360000000)*t; - - // [19,25]: 34, - tf = CONSTANT(-0.130197596205999990)*f[34]; - tg = CONSTANT(-0.130197596205999990)*g[34]; - y[19] += tf*g[25]+tg*f[25]; - y[25] += tf*g[19]+tg*f[19]; - t = f[19]*g[25]+f[25]*g[19]; - y[34] += CONSTANT(-0.130197596205999990)*t; - - // [19,26]: 15,35, - tf = CONSTANT(-0.131668802182000000)*f[15]+CONSTANT(0.130197596204999990)*f[35]; - tg = CONSTANT(-0.131668802182000000)*g[15]+CONSTANT(0.130197596204999990)*g[35]; - y[19] += tf*g[26]+tg*f[26]; - y[26] += tf*g[19]+tg*f[19]; - t = f[19]*g[26]+f[26]*g[19]; - y[15] += CONSTANT(-0.131668802182000000)*t; - y[35] += CONSTANT(0.130197596204999990)*t; - - // [19,27]: 14,32, - tf = CONSTANT(0.025339672793899998)*f[14]+CONSTANT(0.084042186967699994)*f[32]; - tg = CONSTANT(0.025339672793899998)*g[14]+CONSTANT(0.084042186967699994)*g[32]; - y[19] += tf*g[27]+tg*f[27]; - y[27] += tf*g[19]+tg*f[19]; - t = f[19]*g[27]+f[27]*g[19]; - y[14] += CONSTANT(0.025339672793899998)*t; - y[32] += CONSTANT(0.084042186967699994)*t; - - // [19,28]: 13,3,15,31,33, - tf = CONSTANT(0.104682806111000000)*f[13]+CONSTANT(0.159122922869999990)*f[3]+CONSTANT(-0.126698363970000010)*f[15]+CONSTANT(0.090775936911399999)*f[31]+CONSTANT(-0.084042186968400004)*f[33]; - tg = CONSTANT(0.104682806111000000)*g[13]+CONSTANT(0.159122922869999990)*g[3]+CONSTANT(-0.126698363970000010)*g[15]+CONSTANT(0.090775936911399999)*g[31]+CONSTANT(-0.084042186968400004)*g[33]; - y[19] += tf*g[28]+tg*f[28]; - y[28] += tf*g[19]+tg*f[19]; - t = f[19]*g[28]+f[28]*g[19]; - y[13] += CONSTANT(0.104682806111000000)*t; - y[3] += CONSTANT(0.159122922869999990)*t; - y[15] += CONSTANT(-0.126698363970000010)*t; - y[31] += CONSTANT(0.090775936911399999)*t; - y[33] += CONSTANT(-0.084042186968400004)*t; - - // [19,29]: 12,14,2,30,32, - tf = CONSTANT(0.115089467124000010)*f[12]+CONSTANT(-0.097749909977199997)*f[14]+CONSTANT(0.240571246744999990)*f[2]+CONSTANT(0.053152946072499999)*f[30]+CONSTANT(-0.090775936912099994)*f[32]; - tg = CONSTANT(0.115089467124000010)*g[12]+CONSTANT(-0.097749909977199997)*g[14]+CONSTANT(0.240571246744999990)*g[2]+CONSTANT(0.053152946072499999)*g[30]+CONSTANT(-0.090775936912099994)*g[32]; - y[19] += tf*g[29]+tg*f[29]; - y[29] += tf*g[19]+tg*f[19]; - t = f[19]*g[29]+f[29]*g[19]; - y[12] += CONSTANT(0.115089467124000010)*t; - y[14] += CONSTANT(-0.097749909977199997)*t; - y[2] += CONSTANT(0.240571246744999990)*t; - y[30] += CONSTANT(0.053152946072499999)*t; - y[32] += CONSTANT(-0.090775936912099994)*t; - - // [20,20]: 6,0,20, - tf = CONSTANT(0.163839797503000010)*f[6]+CONSTANT(0.282094802232000010)*f[0]; - tg = CONSTANT(0.163839797503000010)*g[6]+CONSTANT(0.282094802232000010)*g[0]; - y[20] += tf*g[20]+tg*f[20]; - t = f[20]*g[20]; - y[6] += CONSTANT(0.163839797503000010)*t; - y[0] += CONSTANT(0.282094802232000010)*t; - y[20] += CONSTANT(0.136961139005999990)*t; - - // [21,21]: 6,20,0,8,22, - tf = CONSTANT(0.139263808033999990)*f[6]+CONSTANT(0.068480553847200004)*f[20]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.141889406570999990)*f[8]+CONSTANT(0.102084782360000000)*f[22]; - tg = CONSTANT(0.139263808033999990)*g[6]+CONSTANT(0.068480553847200004)*g[20]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.141889406570999990)*g[8]+CONSTANT(0.102084782360000000)*g[22]; - y[21] += tf*g[21]+tg*f[21]; - t = f[21]*g[21]; - y[6] += CONSTANT(0.139263808033999990)*t; - y[20] += CONSTANT(0.068480553847200004)*t; - y[0] += CONSTANT(0.282094791773999990)*t; - y[8] += CONSTANT(0.141889406570999990)*t; - y[22] += CONSTANT(0.102084782360000000)*t; - - 
// [21,23]: 8,22,24, - tf = CONSTANT(-0.112621225039000000)*f[8]+CONSTANT(0.045015157794100001)*f[22]+CONSTANT(-0.119098912753000000)*f[24]; - tg = CONSTANT(-0.112621225039000000)*g[8]+CONSTANT(0.045015157794100001)*g[22]+CONSTANT(-0.119098912753000000)*g[24]; - y[21] += tf*g[23]+tg*f[23]; - y[23] += tf*g[21]+tg*f[21]; - t = f[21]*g[23]+f[23]*g[21]; - y[8] += CONSTANT(-0.112621225039000000)*t; - y[22] += CONSTANT(0.045015157794100001)*t; - y[24] += CONSTANT(-0.119098912753000000)*t; - - // [21,26]: 9,25, - tf = CONSTANT(-0.131668802182000000)*f[9]+CONSTANT(-0.130197596204999990)*f[25]; - tg = CONSTANT(-0.131668802182000000)*g[9]+CONSTANT(-0.130197596204999990)*g[25]; - y[21] += tf*g[26]+tg*f[26]; - y[26] += tf*g[21]+tg*f[21]; - t = f[21]*g[26]+f[26]*g[21]; - y[9] += CONSTANT(-0.131668802182000000)*t; - y[25] += CONSTANT(-0.130197596204999990)*t; - - // [21,28]: 27,1,11,9,29, - tf = CONSTANT(0.084042186968400004)*f[27]+CONSTANT(0.159122922869999990)*f[1]+CONSTANT(0.104682806111000000)*f[11]+CONSTANT(0.126698363970000010)*f[9]+CONSTANT(0.090775936911399999)*f[29]; - tg = CONSTANT(0.084042186968400004)*g[27]+CONSTANT(0.159122922869999990)*g[1]+CONSTANT(0.104682806111000000)*g[11]+CONSTANT(0.126698363970000010)*g[9]+CONSTANT(0.090775936911399999)*g[29]; - y[21] += tf*g[28]+tg*f[28]; - y[28] += tf*g[21]+tg*f[21]; - t = f[21]*g[28]+f[28]*g[21]; - y[27] += CONSTANT(0.084042186968400004)*t; - y[1] += CONSTANT(0.159122922869999990)*t; - y[11] += CONSTANT(0.104682806111000000)*t; - y[9] += CONSTANT(0.126698363970000010)*t; - y[29] += CONSTANT(0.090775936911399999)*t; - - // [21,31]: 14,2,30,12,32, - tf = CONSTANT(0.097749909977199997)*f[14]+CONSTANT(0.240571246744999990)*f[2]+CONSTANT(0.053152946072499999)*f[30]+CONSTANT(0.115089467124000010)*f[12]+CONSTANT(0.090775936912099994)*f[32]; - tg = CONSTANT(0.097749909977199997)*g[14]+CONSTANT(0.240571246744999990)*g[2]+CONSTANT(0.053152946072499999)*g[30]+CONSTANT(0.115089467124000010)*g[12]+CONSTANT(0.090775936912099994)*g[32]; - y[21] += tf*g[31]+tg*f[31]; - y[31] += tf*g[21]+tg*f[21]; - t = f[21]*g[31]+f[31]*g[21]; - y[14] += CONSTANT(0.097749909977199997)*t; - y[2] += CONSTANT(0.240571246744999990)*t; - y[30] += CONSTANT(0.053152946072499999)*t; - y[12] += CONSTANT(0.115089467124000010)*t; - y[32] += CONSTANT(0.090775936912099994)*t; - - // [21,33]: 32,14, - tf = CONSTANT(0.084042186967699994)*f[32]+CONSTANT(0.025339672793899998)*f[14]; - tg = CONSTANT(0.084042186967699994)*g[32]+CONSTANT(0.025339672793899998)*g[14]; - y[21] += tf*g[33]+tg*f[33]; - y[33] += tf*g[21]+tg*f[21]; - t = f[21]*g[33]+f[33]*g[21]; - y[32] += CONSTANT(0.084042186967699994)*t; - y[14] += CONSTANT(0.025339672793899998)*t; - - // [21,34]: 35, - tf = CONSTANT(-0.130197596205999990)*f[35]; - tg = CONSTANT(-0.130197596205999990)*g[35]; - y[21] += tf*g[34]+tg*f[34]; - y[34] += tf*g[21]+tg*f[21]; - t = f[21]*g[34]+f[34]*g[21]; - y[35] += CONSTANT(-0.130197596205999990)*t; - - // [22,22]: 6,20,0,24, - tf = CONSTANT(0.065535909662600006)*f[6]+CONSTANT(-0.083698454702400005)*f[20]+CONSTANT(0.282094791771999980)*f[0]+CONSTANT(0.135045473384000000)*f[24]; - tg = CONSTANT(0.065535909662600006)*g[6]+CONSTANT(-0.083698454702400005)*g[20]+CONSTANT(0.282094791771999980)*g[0]+CONSTANT(0.135045473384000000)*g[24]; - y[22] += tf*g[22]+tg*f[22]; - t = f[22]*g[22]; - y[6] += CONSTANT(0.065535909662600006)*t; - y[20] += CONSTANT(-0.083698454702400005)*t; - y[0] += CONSTANT(0.282094791771999980)*t; - y[24] += CONSTANT(0.135045473384000000)*t; - - // [22,26]: 10,28, - tf = 
CONSTANT(0.101358691174000000)*f[10]+CONSTANT(0.084042186965900004)*f[28]; - tg = CONSTANT(0.101358691174000000)*g[10]+CONSTANT(0.084042186965900004)*g[28]; - y[22] += tf*g[26]+tg*f[26]; - y[26] += tf*g[22]+tg*f[22]; - t = f[22]*g[26]+f[26]*g[22]; - y[10] += CONSTANT(0.101358691174000000)*t; - y[28] += CONSTANT(0.084042186965900004)*t; - - // [22,27]: 1,11,25, - tf = CONSTANT(0.183739324704000010)*f[1]+CONSTANT(0.101990215611000000)*f[11]+CONSTANT(0.130197596200999990)*f[25]; - tg = CONSTANT(0.183739324704000010)*g[1]+CONSTANT(0.101990215611000000)*g[11]+CONSTANT(0.130197596200999990)*g[25]; - y[22] += tf*g[27]+tg*f[27]; - y[27] += tf*g[22]+tg*f[22]; - t = f[22]*g[27]+f[27]*g[22]; - y[1] += CONSTANT(0.183739324704000010)*t; - y[11] += CONSTANT(0.101990215611000000)*t; - y[25] += CONSTANT(0.130197596200999990)*t; - - // [22,32]: 2,30,12,34, - tf = CONSTANT(0.225033795606000010)*f[2]+CONSTANT(-0.099440056651100006)*f[30]+CONSTANT(0.022664492358099999)*f[12]+CONSTANT(0.084042186968800003)*f[34]; - tg = CONSTANT(0.225033795606000010)*g[2]+CONSTANT(-0.099440056651100006)*g[30]+CONSTANT(0.022664492358099999)*g[12]+CONSTANT(0.084042186968800003)*g[34]; - y[22] += tf*g[32]+tg*f[32]; - y[32] += tf*g[22]+tg*f[22]; - t = f[22]*g[32]+f[32]*g[22]; - y[2] += CONSTANT(0.225033795606000010)*t; - y[30] += CONSTANT(-0.099440056651100006)*t; - y[12] += CONSTANT(0.022664492358099999)*t; - y[34] += CONSTANT(0.084042186968800003)*t; - - // [22,33]: 3,13,35, - tf = CONSTANT(0.183739324704000010)*f[3]+CONSTANT(0.101990215611000000)*f[13]+CONSTANT(0.130197596200999990)*f[35]; - tg = CONSTANT(0.183739324704000010)*g[3]+CONSTANT(0.101990215611000000)*g[13]+CONSTANT(0.130197596200999990)*g[35]; - y[22] += tf*g[33]+tg*f[33]; - y[33] += tf*g[22]+tg*f[22]; - t = f[22]*g[33]+f[33]*g[22]; - y[3] += CONSTANT(0.183739324704000010)*t; - y[13] += CONSTANT(0.101990215611000000)*t; - y[35] += CONSTANT(0.130197596200999990)*t; - - // [23,23]: 6,20,0, - tf = CONSTANT(-0.057343920955899998)*f[6]+CONSTANT(-0.159787958979000000)*f[20]+CONSTANT(0.282094791768999990)*f[0]; - tg = CONSTANT(-0.057343920955899998)*g[6]+CONSTANT(-0.159787958979000000)*g[20]+CONSTANT(0.282094791768999990)*g[0]; - y[23] += tf*g[23]+tg*f[23]; - t = f[23]*g[23]; - y[6] += CONSTANT(-0.057343920955899998)*t; - y[20] += CONSTANT(-0.159787958979000000)*t; - y[0] += CONSTANT(0.282094791768999990)*t; - - // [23,26]: 1,11,29, - tf = CONSTANT(0.208340811096000000)*f[1]+CONSTANT(0.029982305185199998)*f[11]+CONSTANT(-0.118853600623999990)*f[29]; - tg = CONSTANT(0.208340811096000000)*g[1]+CONSTANT(0.029982305185199998)*g[11]+CONSTANT(-0.118853600623999990)*g[29]; - y[23] += tf*g[26]+tg*f[26]; - y[26] += tf*g[23]+tg*f[23]; - t = f[23]*g[26]+f[26]*g[23]; - y[1] += CONSTANT(0.208340811096000000)*t; - y[11] += CONSTANT(0.029982305185199998)*t; - y[29] += CONSTANT(-0.118853600623999990)*t; - - // [23,28]: 25,11,1,29, - tf = CONSTANT(-0.099440056652200001)*f[25]+CONSTANT(-0.121172043789000000)*f[11]+CONSTANT(0.060142811686500000)*f[1]+CONSTANT(-0.034310079156700000)*f[29]; - tg = CONSTANT(-0.099440056652200001)*g[25]+CONSTANT(-0.121172043789000000)*g[11]+CONSTANT(0.060142811686500000)*g[1]+CONSTANT(-0.034310079156700000)*g[29]; - y[23] += tf*g[28]+tg*f[28]; - y[28] += tf*g[23]+tg*f[23]; - t = f[23]*g[28]+f[28]*g[23]; - y[25] += CONSTANT(-0.099440056652200001)*t; - y[11] += CONSTANT(-0.121172043789000000)*t; - y[1] += CONSTANT(0.060142811686500000)*t; - y[29] += CONSTANT(-0.034310079156700000)*t; - - // [23,32]: 31,13,3,35, - tf = 
CONSTANT(0.034310079156599997)*f[31]+CONSTANT(0.121172043788000010)*f[13]+CONSTANT(-0.060142811686900000)*f[3]+CONSTANT(-0.099440056652700004)*f[35]; - tg = CONSTANT(0.034310079156599997)*g[31]+CONSTANT(0.121172043788000010)*g[13]+CONSTANT(-0.060142811686900000)*g[3]+CONSTANT(-0.099440056652700004)*g[35]; - y[23] += tf*g[32]+tg*f[32]; - y[32] += tf*g[23]+tg*f[23]; - t = f[23]*g[32]+f[32]*g[23]; - y[31] += CONSTANT(0.034310079156599997)*t; - y[13] += CONSTANT(0.121172043788000010)*t; - y[3] += CONSTANT(-0.060142811686900000)*t; - y[35] += CONSTANT(-0.099440056652700004)*t; - - // [23,33]: 2,30,12, - tf = CONSTANT(0.196425600433000000)*f[2]+CONSTANT(-0.130197596204999990)*f[30]+CONSTANT(-0.103861751821000010)*f[12]; - tg = CONSTANT(0.196425600433000000)*g[2]+CONSTANT(-0.130197596204999990)*g[30]+CONSTANT(-0.103861751821000010)*g[12]; - y[23] += tf*g[33]+tg*f[33]; - y[33] += tf*g[23]+tg*f[23]; - t = f[23]*g[33]+f[33]*g[23]; - y[2] += CONSTANT(0.196425600433000000)*t; - y[30] += CONSTANT(-0.130197596204999990)*t; - y[12] += CONSTANT(-0.103861751821000010)*t; - - // [23,34]: 3,13,31, - tf = CONSTANT(0.208340811100000000)*f[3]+CONSTANT(0.029982305185400002)*f[13]+CONSTANT(-0.118853600623000000)*f[31]; - tg = CONSTANT(0.208340811100000000)*g[3]+CONSTANT(0.029982305185400002)*g[13]+CONSTANT(-0.118853600623000000)*g[31]; - y[23] += tf*g[34]+tg*f[34]; - y[34] += tf*g[23]+tg*f[23]; - t = f[23]*g[34]+f[34]*g[23]; - y[3] += CONSTANT(0.208340811100000000)*t; - y[13] += CONSTANT(0.029982305185400002)*t; - y[31] += CONSTANT(-0.118853600623000000)*t; - - // [24,24]: 6,0,20, - tf = CONSTANT(-0.229375683829000000)*f[6]+CONSTANT(0.282094791763999990)*f[0]+CONSTANT(0.106525305981000000)*f[20]; - tg = CONSTANT(-0.229375683829000000)*g[6]+CONSTANT(0.282094791763999990)*g[0]+CONSTANT(0.106525305981000000)*g[20]; - y[24] += tf*g[24]+tg*f[24]; - t = f[24]*g[24]; - y[6] += CONSTANT(-0.229375683829000000)*t; - y[0] += CONSTANT(0.282094791763999990)*t; - y[20] += CONSTANT(0.106525305981000000)*t; - - // [24,29]: 9,27,25, - tf = CONSTANT(-0.035835708931400000)*f[9]+CONSTANT(0.118853600623000000)*f[27]+CONSTANT(0.053152946071199997)*f[25]; - tg = CONSTANT(-0.035835708931400000)*g[9]+CONSTANT(0.118853600623000000)*g[27]+CONSTANT(0.053152946071199997)*g[25]; - y[24] += tf*g[29]+tg*f[29]; - y[29] += tf*g[24]+tg*f[24]; - t = f[24]*g[29]+f[29]*g[24]; - y[9] += CONSTANT(-0.035835708931400000)*t; - y[27] += CONSTANT(0.118853600623000000)*t; - y[25] += CONSTANT(0.053152946071199997)*t; - - // [24,31]: 15,33,35, - tf = CONSTANT(0.035835708931400000)*f[15]+CONSTANT(-0.118853600623000000)*f[33]+CONSTANT(0.053152946071199997)*f[35]; - tg = CONSTANT(0.035835708931400000)*g[15]+CONSTANT(-0.118853600623000000)*g[33]+CONSTANT(0.053152946071199997)*g[35]; - y[24] += tf*g[31]+tg*f[31]; - y[31] += tf*g[24]+tg*f[24]; - t = f[24]*g[31]+f[31]*g[24]; - y[15] += CONSTANT(0.035835708931400000)*t; - y[33] += CONSTANT(-0.118853600623000000)*t; - y[35] += CONSTANT(0.053152946071199997)*t; - - // [24,34]: 12,30,2, - tf = CONSTANT(-0.207723503645000000)*f[12]+CONSTANT(0.130197596199999990)*f[30]+CONSTANT(0.147319200325000010)*f[2]; - tg = CONSTANT(-0.207723503645000000)*g[12]+CONSTANT(0.130197596199999990)*g[30]+CONSTANT(0.147319200325000010)*g[2]; - y[24] += tf*g[34]+tg*f[34]; - y[34] += tf*g[24]+tg*f[24]; - t = f[24]*g[34]+f[34]*g[24]; - y[12] += CONSTANT(-0.207723503645000000)*t; - y[30] += CONSTANT(0.130197596199999990)*t; - y[2] += CONSTANT(0.147319200325000010)*t; - - // [25,25]: 0,6,20, - tf = 
CONSTANT(0.282094791761999970)*f[0]+CONSTANT(-0.242608896358999990)*f[6]+CONSTANT(0.130197596198000000)*f[20]; - tg = CONSTANT(0.282094791761999970)*g[0]+CONSTANT(-0.242608896358999990)*g[6]+CONSTANT(0.130197596198000000)*g[20]; - y[25] += tf*g[25]+tg*f[25]; - t = f[25]*g[25]; - y[0] += CONSTANT(0.282094791761999970)*t; - y[6] += CONSTANT(-0.242608896358999990)*t; - y[20] += CONSTANT(0.130197596198000000)*t; - - // [26,26]: 6,20,0, - tf = CONSTANT(-0.097043558542400002)*f[6]+CONSTANT(-0.130197596207000000)*f[20]+CONSTANT(0.282094791766000000)*f[0]; - tg = CONSTANT(-0.097043558542400002)*g[6]+CONSTANT(-0.130197596207000000)*g[20]+CONSTANT(0.282094791766000000)*g[0]; - y[26] += tf*g[26]+tg*f[26]; - t = f[26]*g[26]; - y[6] += CONSTANT(-0.097043558542400002)*t; - y[20] += CONSTANT(-0.130197596207000000)*t; - y[0] += CONSTANT(0.282094791766000000)*t; - - // [27,27]: 0,20,6, - tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.130197596204999990)*f[20]+CONSTANT(0.016173926423100001)*f[6]; - tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.130197596204999990)*g[20]+CONSTANT(0.016173926423100001)*g[6]; - y[27] += tf*g[27]+tg*f[27]; - t = f[27]*g[27]; - y[0] += CONSTANT(0.282094791770000020)*t; - y[20] += CONSTANT(-0.130197596204999990)*t; - y[6] += CONSTANT(0.016173926423100001)*t; - - // [28,28]: 6,0,20,24, - tf = CONSTANT(0.097043558538800007)*f[6]+CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.021699599367299999)*f[20]+CONSTANT(-0.128376561118000000)*f[24]; - tg = CONSTANT(0.097043558538800007)*g[6]+CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.021699599367299999)*g[20]+CONSTANT(-0.128376561118000000)*g[24]; - y[28] += tf*g[28]+tg*f[28]; - t = f[28]*g[28]; - y[6] += CONSTANT(0.097043558538800007)*t; - y[0] += CONSTANT(0.282094791771999980)*t; - y[20] += CONSTANT(-0.021699599367299999)*t; - y[24] += CONSTANT(-0.128376561118000000)*t; - - // [29,29]: 20,6,0,22,8, - tf = CONSTANT(0.086798397468799998)*f[20]+CONSTANT(0.145565337808999990)*f[6]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(-0.097043558539500002)*f[22]+CONSTANT(-0.140070311615000000)*f[8]; - tg = CONSTANT(0.086798397468799998)*g[20]+CONSTANT(0.145565337808999990)*g[6]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(-0.097043558539500002)*g[22]+CONSTANT(-0.140070311615000000)*g[8]; - y[29] += tf*g[29]+tg*f[29]; - t = f[29]*g[29]; - y[20] += CONSTANT(0.086798397468799998)*t; - y[6] += CONSTANT(0.145565337808999990)*t; - y[0] += CONSTANT(0.282094791773999990)*t; - y[22] += CONSTANT(-0.097043558539500002)*t; - y[8] += CONSTANT(-0.140070311615000000)*t; - - // [30,30]: 0,20,6, - tf = CONSTANT(0.282094804531000000)*f[0]+CONSTANT(0.130197634486000000)*f[20]+CONSTANT(0.161739292769000010)*f[6]; - tg = CONSTANT(0.282094804531000000)*g[0]+CONSTANT(0.130197634486000000)*g[20]+CONSTANT(0.161739292769000010)*g[6]; - y[30] += tf*g[30]+tg*f[30]; - t = f[30]*g[30]; - y[0] += CONSTANT(0.282094804531000000)*t; - y[20] += CONSTANT(0.130197634486000000)*t; - y[6] += CONSTANT(0.161739292769000010)*t; - - // [31,31]: 6,8,20,22,0, - tf = CONSTANT(0.145565337808999990)*f[6]+CONSTANT(0.140070311615000000)*f[8]+CONSTANT(0.086798397468799998)*f[20]+CONSTANT(0.097043558539500002)*f[22]+CONSTANT(0.282094791773999990)*f[0]; - tg = CONSTANT(0.145565337808999990)*g[6]+CONSTANT(0.140070311615000000)*g[8]+CONSTANT(0.086798397468799998)*g[20]+CONSTANT(0.097043558539500002)*g[22]+CONSTANT(0.282094791773999990)*g[0]; - y[31] += tf*g[31]+tg*f[31]; - t = f[31]*g[31]; - y[6] += CONSTANT(0.145565337808999990)*t; - y[8] += CONSTANT(0.140070311615000000)*t; - y[20] += 
CONSTANT(0.086798397468799998)*t; - y[22] += CONSTANT(0.097043558539500002)*t; - y[0] += CONSTANT(0.282094791773999990)*t; - - // [32,32]: 0,24,20,6, - tf = CONSTANT(0.282094791771999980)*f[0]+CONSTANT(0.128376561118000000)*f[24]+CONSTANT(-0.021699599367299999)*f[20]+CONSTANT(0.097043558538800007)*f[6]; - tg = CONSTANT(0.282094791771999980)*g[0]+CONSTANT(0.128376561118000000)*g[24]+CONSTANT(-0.021699599367299999)*g[20]+CONSTANT(0.097043558538800007)*g[6]; - y[32] += tf*g[32]+tg*f[32]; - t = f[32]*g[32]; - y[0] += CONSTANT(0.282094791771999980)*t; - y[24] += CONSTANT(0.128376561118000000)*t; - y[20] += CONSTANT(-0.021699599367299999)*t; - y[6] += CONSTANT(0.097043558538800007)*t; - - // [33,33]: 6,20,0, - tf = CONSTANT(0.016173926423100001)*f[6]+CONSTANT(-0.130197596204999990)*f[20]+CONSTANT(0.282094791770000020)*f[0]; - tg = CONSTANT(0.016173926423100001)*g[6]+CONSTANT(-0.130197596204999990)*g[20]+CONSTANT(0.282094791770000020)*g[0]; - y[33] += tf*g[33]+tg*f[33]; - t = f[33]*g[33]; - y[6] += CONSTANT(0.016173926423100001)*t; - y[20] += CONSTANT(-0.130197596204999990)*t; - y[0] += CONSTANT(0.282094791770000020)*t; - - // [34,34]: 20,6,0, - tf = CONSTANT(-0.130197596207000000)*f[20]+CONSTANT(-0.097043558542400002)*f[6]+CONSTANT(0.282094791766000000)*f[0]; - tg = CONSTANT(-0.130197596207000000)*g[20]+CONSTANT(-0.097043558542400002)*g[6]+CONSTANT(0.282094791766000000)*g[0]; - y[34] += tf*g[34]+tg*f[34]; - t = f[34]*g[34]; - y[20] += CONSTANT(-0.130197596207000000)*t; - y[6] += CONSTANT(-0.097043558542400002)*t; - y[0] += CONSTANT(0.282094791766000000)*t; - - // [35,35]: 6,0,20, - tf = CONSTANT(-0.242608896358999990)*f[6]+CONSTANT(0.282094791761999970)*f[0]+CONSTANT(0.130197596198000000)*f[20]; - tg = CONSTANT(-0.242608896358999990)*g[6]+CONSTANT(0.282094791761999970)*g[0]+CONSTANT(0.130197596198000000)*g[20]; - y[35] += tf*g[35]+tg*f[35]; - t = f[35]*g[35]; - y[6] += CONSTANT(-0.242608896358999990)*t; - y[0] += CONSTANT(0.282094791761999970)*t; - y[20] += CONSTANT(0.130197596198000000)*t; - - // multiply count=2527 - - return y; -} - - -//------------------------------------------------------------------------------------- -// Evaluates a directional light and returns spectral SH data. The output -// vector is computed so that if the intensity of R/G/B is unit the resulting -// exit radiance of a point directly under the light on a diffuse object with -// an albedo of 1 would be 1.0. This will compute 3 spectral samples, resultR -// has to be specified, while resultG and resultB are optional. -// -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204988.aspx -//------------------------------------------------------------------------------------- -bool XM_CALLCONV XMSHEvalDirectionalLight( _In_ size_t order, - _In_ FXMVECTOR dir, - _In_ FXMVECTOR color, - _Out_writes_(order*order) float *resultR, - _Out_writes_opt_(order*order) float *resultG, - _Out_writes_opt_(order*order) float *resultB ) -{ - if ( !resultR ) - return false; - - if ( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ) - return false; - - XMFLOAT3A clr; - XMStoreFloat3A( &clr, color ); - - float fTmp[ XM_SH_MAXORDER * XM_SH_MAXORDER ]; - - XMSHEvalDirection(fTmp,order,dir); // evaluate the BF in this direction... 
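// The scale computed next can be read as follows: CosWtInt(order) adds up the
// per-band weights of the clamped-cosine (Lambertian) kernel, so fNorm =
// XM_PI / CosWtInt(order) renormalizes the projected light so that a
// unit-intensity directional light produces an exit radiance of 1 on a
// unit-albedo diffuse surface (matching the contract in the header comment).
// Worked example: at order 2, CosWtInt returns 0.25f + 0.5f = 0.75f, so
// fNorm = XM_PI / 0.75f = 4*XM_PI/3 (about 4.18879f).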
- - // now compute "normalization" and scale vector for each valid spectral band - const float fNorm = XM_PI / CosWtInt(order); - - const size_t numcoeff = order*order; - - const float fRScale = fNorm * clr.x; - - for( size_t i=0; i < numcoeff; ++i) - { - resultR[i] = fTmp[i] * fRScale; - } - - if (resultG) - { - const float fGScale = fNorm * clr.y; - - for( size_t i=0; i < numcoeff; ++i) - { - resultG[i] = fTmp[i] * fGScale; - } - } - - if (resultB) - { - const float fBScale = fNorm * clr.z; - - for( size_t i=0; i < numcoeff; ++i) - { - resultB[i] = fTmp[i]*fBScale; - } - } - - return true; -} - - -//------------------------------------------------------------------------------------- -// Evaluates a spherical light and returns spectral SH data. There is no -// normalization of the intensity of the light like there is for directional -// lights, care has to be taken when specifying the intensities. This will -// compute 3 spectral samples, resultR has to be specified, while resultG and -// resultB are optional. -// -// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205451.aspx -//------------------------------------------------------------------------------------- -bool XM_CALLCONV XMSHEvalSphericalLight( _In_ size_t order, - _In_ FXMVECTOR pos, - _In_ float radius, - _In_ FXMVECTOR color, - _Out_writes_(order*order) float *resultR, - _Out_writes_opt_(order*order) float *resultG, - _Out_writes_opt_(order*order) float *resultB ) -{ - if ( !resultR ) - return false; - - if ( radius < 0.f ) - return false; - - const float fDist = XMVectorGetX( XMVector3Length( pos ) ); - - // WARNING: fDist should not be < radius - otherwise light contains origin - - //const float fSinConeAngle = (fDist <= radius) ? 0.99999f : radius/fDist; - const float fConeAngle = (fDist <= radius) ? (XM_PIDIV2) : asinf(radius/fDist); - - XMVECTOR dir = XMVector3Normalize( pos ); - - float fTmpDir[ XM_SH_MAXORDER* XM_SH_MAXORDER]; // rotation "vector" - float fTmpL0[ XM_SH_MAXORDER ]; - - // - // Sphere at distance fDist, the cone angle is determined by looking at the - // right triangle with one side (the hypotenuse) being the vector from the - // origin to the center of the sphere, another side is from the origin to - // a point on the sphere whose normal is perpendicular to the given side (this - // is one of the points on the cone that is defined by the projection of the sphere - // through the origin - we want to find the angle of this cone) and the final - // side being from the center of the sphere to the point of tangency (the two - // sides connected to this are at a right angle by construction.) - // From trig we know that sin(theta) = ||opposite||/||hypotenuse||, where - // ||opposite|| = Radius, ||hypotenuse|| = fDist - // theta is the angle of the cone that subtends the sphere from the origin - // - - // no default normalization is done for this case, have to be careful how - // you represent the coefficients...
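// Worked example of the geometry above: a sphere of radius 1 whose center is
// 2 units from the origin gives sin(theta) = radius/fDist = 0.5f, so
// fConeAngle = asinf(0.5f) = XM_PI/6 (about 0.5236f). When the origin lies
// inside the sphere (fDist <= radius) there is no tangent cone, and the code
// above conservatively clamps the angle to a full hemisphere, XM_PIDIV2.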
- - const float fNewNorm = 1.0f;///(fSinConeAngle*fSinConeAngle); - - ComputeCapInt(order,fConeAngle,fTmpL0); - - XMFLOAT3A vd; - XMStoreFloat3( &vd, dir ); - - const float fX = vd.x; - const float fY = vd.y; - const float fZ = vd.z; - - switch (order) - { - case 2: - sh_eval_basis_1(fX,fY,fZ,fTmpDir); - break; - - case 3: - sh_eval_basis_2(fX,fY,fZ,fTmpDir); - break; - - case 4: - sh_eval_basis_3(fX,fY,fZ,fTmpDir); - break; - - case 5: - sh_eval_basis_4(fX,fY,fZ,fTmpDir); - break; - - case 6: - sh_eval_basis_5(fX,fY,fZ,fTmpDir); - break; - - default: - assert( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ); - return false; - } - - XMFLOAT3A clr; - XMStoreFloat3A( &clr, color ); - - for( size_t i=0; i < order; ++i) - { - const size_t cNumCoefs = 2*i + 1; - const size_t cStart = i*i; - const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i]; - for( size_t j=0; j < cNumCoefs; ++j) - resultR[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.x; - } - - if (resultG) - { - for( size_t i=0; i < order; ++i) - { - const size_t cNumCoefs = 2*i + 1; - const size_t cStart = i*i; - const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i]; - for( size_t j=0; j < cNumCoefs; ++j) - resultG[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.y; - } - } - - if (resultB) - { - for( size_t i=0; i < order; ++i) - { - const size_t cNumCoefs = 2*i + 1; - const size_t cStart = i*i; - const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i]; - for( size_t j=0; j < cNumCoefs; ++j) - resultB[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.z; - } - } - - return true; -} - - -//------------------------------------------------------------------------------------- -// Evaluates a cone light (a spherical cap of constant intensity) and returns -// spectral SH data. This will compute 3 spectral samples, resultR has to be -// specified, while resultG and resultB are optional. -//------------------------------------------------------------------------------------- -bool XM_CALLCONV XMSHEvalConeLight( _In_ size_t order, - _In_ FXMVECTOR dir, - _In_ float radius, - _In_ FXMVECTOR color, - _Out_writes_(order*order) float *resultR, - _Out_writes_opt_(order*order) float *resultG, - _Out_writes_opt_(order*order) float *resultB ) -{ - if ( !resultR ) - return false; - - if ( radius < 0.f || radius > (XM_PI*1.00001f) ) - return false; - - if (radius < 0.0001f) - { - // turn it into a pure directional light... - return XMSHEvalDirectionalLight(order, dir,color,resultR,resultG,resultB); - } - else - { - float fTmpL0[ XM_SH_MAXORDER ]; - float fTmpDir[ XM_SH_MAXORDER * XM_SH_MAXORDER ]; - - const float fConeAngle = radius; - const float fAngCheck = (fConeAngle > XM_PIDIV2) ? (XM_PIDIV2) : fConeAngle; - - const float fNewNorm = 1.0f/(sinf(fAngCheck)*sinf(fAngCheck)); - - ComputeCapInt(order,fConeAngle,fTmpL0); - - XMFLOAT3A vd; - XMStoreFloat3( &vd, dir ); - - const float fX = vd.x; - const float fY = vd.y; - const float fZ = vd.z; - - switch (order) - { - case 2: - sh_eval_basis_1(fX,fY,fZ,fTmpDir); - break; - - case 3: - sh_eval_basis_2(fX,fY,fZ,fTmpDir); - break; - - case 4: - sh_eval_basis_3(fX,fY,fZ,fTmpDir); - break; - - case 5: - sh_eval_basis_4(fX,fY,fZ,fTmpDir); - break; - - case 6: - sh_eval_basis_5(fX,fY,fZ,fTmpDir); - break; - - default: - assert( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ); - return false; - } - - XMFLOAT3A clr; - XMStoreFloat3A( &clr, color ); - - for( size_t i=0; i < order; ++i) - { - const size_t cNumCoefs = 2*i + 1; - const size_t cStart = i*i; - const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i]; - for( size_t j=0; j < cNumCoefs; ++j) - resultR[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.x; - } - - if (resultG) - { - for( size_t i=0; i < order; ++i) - { - const size_t cNumCoefs = 2*i + 1; - const size_t cStart = i*i; - const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i]; - for( size_t j=0; j < cNumCoefs; ++j) - resultG[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.y; - } - } - - if (resultB) - { - for( size_t i=0; i < order; ++i) - { - const size_t cNumCoefs = 2*i + 1; - const size_t cStart = i*i; - const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i]; - for( size_t j=0; j < cNumCoefs; ++j) - resultB[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.z; - } - } - } - - return true; -} - - -//------------------------------------------------------------------------------------- -// Evaluates a hemisphere light (a linear blend between a top "sky" color and a -// bottom "ground" color about dir) and returns spectral SH data. This will -// compute 3 spectral samples, resultR has to be specified, while resultG and -// resultB are optional. -//------------------------------------------------------------------------------------- -bool XM_CALLCONV XMSHEvalHemisphereLight( _In_ size_t order, - _In_ FXMVECTOR dir, - _In_ FXMVECTOR topColor, - _In_ FXMVECTOR bottomColor, - _Out_writes_(order*order) float *resultR, - _Out_writes_opt_(order*order) float *resultG, - _Out_writes_opt_(order*order) float *resultB ) -{ - if ( !resultR ) - return false; - - if ( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ) - return false; - - // separate "R/G/B colors... - - float fTmpDir[ XM_SH_MAXORDER * XM_SH_MAXORDER]; // rotation "vector" - float fTmpL0[ XM_SH_MAXORDER ]; - - const float fNewNorm = 3.0f/2.0f; // normalizes things for 1 sky color, 0 ground color... - - XMFLOAT3A vd; - XMStoreFloat3( &vd, dir ); - - const float fX = vd.x; - const float fY = vd.y; - const float fZ = vd.z; - - sh_eval_basis_1(fX,fY,fZ,fTmpDir); - - XMFLOAT3A clrTop; - XMStoreFloat3A( &clrTop, topColor ); - - XMFLOAT3A clrBottom; - XMStoreFloat3A( &clrBottom, bottomColor ); - - float fA = clrTop.x; - float fAvrg = (clrTop.x + clrBottom.x)*0.5f; - - fTmpL0[0] = fAvrg*2.0f*SHEvalHemisphereLight_fSqrtPi; - fTmpL0[1] = (fA - fAvrg)*2.0f*SHEvalHemisphereLight_fSqrtPi3; - - size_t i = 0; - for( ; i<2; ++i) - { - _Analysis_assume_(i < order); - const size_t cNumCoefs = 2*i + 1; - const size_t cStart = i*i; - const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i]; - for( size_t j=0; j + +using namespace DirectX; + +#pragma warning( disable : 4619 4456 ) + +namespace +{ + #pragma prefast(disable:246, "generated code by maple (nested const variable definitions)") + + static const float fExtraNormFac[XM_SH_MAXORDER] = { 2.0f*sqrtf(XM_PI), 2.0f/3.0f*sqrtf(3.0f*XM_PI), 2.0f/5.0f*sqrtf(5.0f*XM_PI), 2.0f/7.0f*sqrtf(7.0f*XM_PI), 2.0f/3.0f*sqrtf(XM_PI), 2.0f/11.0f*sqrtf(11.0f*XM_PI) }; + + // computes the integral of a constant function over a solid angular + // extent. No error checking - only used internally. This function + // only returns the Yl0 coefficients, since the rest are zero for + // circularly symmetric functions.
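// In closed form, pR[l] below is the integral of y_l0 over a spherical cap of
// half-angle 'angle': pR[l] = 2*pi * Int_0^angle y_l0(t) sin(t) dt. Checking
// the first two bands against the expressions in ComputeCapInt:
//   l=0: 2*pi * (1/(2*sqrt(pi))) * (1 - cos(a)) = sqrt(pi) * (1 - cos(a))
//   l=1: 2*pi * sqrt(3/(4*pi)) * sin^2(a)/2 = (sqrt(3*pi)/2) * sin^2(a)
// which match pR[0] = ComputeCapInt_t1 - t3 and pR[1] in the code below.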
+ static const float ComputeCapInt_t1 = sqrtf(0.3141593E1f); + static const float ComputeCapInt_t5 = sqrtf(3.0f); + static const float ComputeCapInt_t11 = sqrtf(5.0f); + static const float ComputeCapInt_t18 = sqrtf(7.0f); + static const float ComputeCapInt_t32 = sqrtf(11.0f); + + static inline void ComputeCapInt(const size_t order, float angle, float *pR) + { + const float t2 = cosf(angle); + const float t3 = ComputeCapInt_t1*t2; + const float t7 = sinf(angle); + const float t8 = t7*t7; + + + pR[0] = -t3+ComputeCapInt_t1; + pR[1] = ComputeCapInt_t5*ComputeCapInt_t1*t8/2.0f; + + if (order > 2) + { + const float t13 = t2*t2; + + pR[2] = -ComputeCapInt_t11*ComputeCapInt_t1*t2*(t13-1.0f)/2.0f; + if (order > 3) + { + const float t19 = ComputeCapInt_t18*ComputeCapInt_t1; + const float t20 = t13*t13; + + pR[3] = -5.0f/8.0f*t19*t20+3.0f/4.0f*t19*t13-t19/8.0f; + if (order > 4) + { + + + pR[4] = -3.0f/8.0f*t3*(7.0f*t20-10.0f*t13+3.0f); + if (order > 5) + { + const float t33 = ComputeCapInt_t32*ComputeCapInt_t1; + pR[5] = -21.0f/16.0f*t33*t20*t13+35.0f/16.0f*t33*t20-15.0f/16.0f*t33*t13+t33/16.0f; + } + } + } + } + } + + // input pF only consists of Yl0 values, normalizes coefficients for directional + // lights. + static inline float CosWtInt(const size_t order) + { + const float fCW0 = 0.25f; + const float fCW1 = 0.5f; + const float fCW2 = 5.0f/16.0f; + //const float fCW3 = 0.0f; + const float fCW4 = -3.0f/32.0f; + //const float fCW5 = 0.0f; + + // order has to be at least linear... + + float fRet = fCW0 + fCW1; + + if (order > 2) fRet += fCW2; + if (order > 4) fRet += fCW4; + + // odd degrees >= 3 evaluate to zero integrated against cosine... + + return fRet; + } + + static const float SHEvalHemisphereLight_fSqrtPi = sqrtf(XM_PI); + static const float SHEvalHemisphereLight_fSqrtPi3 = sqrtf(XM_PI/3.0f); + + typedef float REAL; + #define CONSTANT(x) (x ## f) + + // routine generated programmatically for evaluating SH basis for degree 1 + // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) + // output is vector b with SH basis evaluated at (x,y,z). + // + inline static void sh_eval_basis_1(REAL x,REAL y,REAL z,REAL b[4]) + { + /* m=0 */ + + // l=0 + const REAL p_0_0 = CONSTANT(0.282094791773878140); + b[ 0] = p_0_0; // l=0,m=0 + // l=1 + const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; + b[ 2] = p_1_0; // l=1,m=0 + + + /* m=1 */ + + const REAL s1 = y; + const REAL c1 = x; + + // l=1 + const REAL p_1_1 = CONSTANT(-0.488602511902919920); + b[ 1] = p_1_1*s1; // l=1,m=-1 + b[ 3] = p_1_1*c1; // l=1,m=+1 + } + + // routine generated programmatically for evaluating SH basis for degree 2 + // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) + // output is vector b with SH basis evaluated at (x,y,z). 
+ // + inline static void sh_eval_basis_2(REAL x,REAL y,REAL z,REAL b[9]) + { + const REAL z2 = z*z; + + + /* m=0 */ + + // l=0 + const REAL p_0_0 = CONSTANT(0.282094791773878140); + b[ 0] = p_0_0; // l=0,m=0 + // l=1 + const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; + b[ 2] = p_1_0; // l=1,m=0 + // l=2 + const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); + b[ 6] = p_2_0; // l=2,m=0 + + + /* m=1 */ + + const REAL s1 = y; + const REAL c1 = x; + + // l=1 + const REAL p_1_1 = CONSTANT(-0.488602511902919920); + b[ 1] = p_1_1*s1; // l=1,m=-1 + b[ 3] = p_1_1*c1; // l=1,m=+1 + // l=2 + const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; + b[ 5] = p_2_1*s1; // l=2,m=-1 + b[ 7] = p_2_1*c1; // l=2,m=+1 + + + /* m=2 */ + + const REAL s2 = x*s1 + y*c1; + const REAL c2 = x*c1 - y*s1; + + // l=2 + const REAL p_2_2 = CONSTANT(0.546274215296039590); + b[ 4] = p_2_2*s2; // l=2,m=-2 + b[ 8] = p_2_2*c2; // l=2,m=+2 + } + + // routine generated programmatically for evaluating SH basis for degree 3 + // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) + // output is vector b with SH basis evaluated at (x,y,z). + // + static void sh_eval_basis_3(REAL x,REAL y,REAL z,REAL b[16]) + { + const REAL z2 = z*z; + + + /* m=0 */ + + // l=0 + const REAL p_0_0 = CONSTANT(0.282094791773878140); + b[ 0] = p_0_0; // l=0,m=0 + // l=1 + const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; + b[ 2] = p_1_0; // l=1,m=0 + // l=2 + const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); + b[ 6] = p_2_0; // l=2,m=0 + // l=3 + const REAL p_3_0 = z*(CONSTANT(1.865881662950577000)*z2 + CONSTANT(-1.119528997770346200)); + b[ 12] = p_3_0; // l=3,m=0 + + + /* m=1 */ + + const REAL s1 = y; + const REAL c1 = x; + + // l=1 + const REAL p_1_1 = CONSTANT(-0.488602511902919920); + b[ 1] = p_1_1*s1; // l=1,m=-1 + b[ 3] = p_1_1*c1; // l=1,m=+1 + // l=2 + const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; + b[ 5] = p_2_1*s1; // l=2,m=-1 + b[ 7] = p_2_1*c1; // l=2,m=+1 + // l=3 + const REAL p_3_1 = CONSTANT(-2.285228997322328800)*z2 + CONSTANT(0.457045799464465770); + b[ 11] = p_3_1*s1; // l=3,m=-1 + b[ 13] = p_3_1*c1; // l=3,m=+1 + + + /* m=2 */ + + const REAL s2 = x*s1 + y*c1; + const REAL c2 = x*c1 - y*s1; + + // l=2 + const REAL p_2_2 = CONSTANT(0.546274215296039590); + b[ 4] = p_2_2*s2; // l=2,m=-2 + b[ 8] = p_2_2*c2; // l=2,m=+2 + // l=3 + const REAL p_3_2 = CONSTANT(1.445305721320277100)*z; + b[ 10] = p_3_2*s2; // l=3,m=-2 + b[ 14] = p_3_2*c2; // l=3,m=+2 + + + /* m=3 */ + + const REAL s3 = x*s2 + y*c2; + const REAL c3 = x*c2 - y*s2; + + // l=3 + const REAL p_3_3 = CONSTANT(-0.590043589926643520); + b[ 9] = p_3_3*s3; // l=3,m=-3 + b[ 15] = p_3_3*c3; // l=3,m=+3 + } + + // routine generated programmatically for evaluating SH basis for degree 4 + // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) + // output is vector b with SH basis evaluated at (x,y,z). 
+ // + static void sh_eval_basis_4(REAL x,REAL y,REAL z,REAL b[25]) + { + const REAL z2 = z*z; + + + /* m=0 */ + + // l=0 + const REAL p_0_0 = CONSTANT(0.282094791773878140); + b[ 0] = p_0_0; // l=0,m=0 + // l=1 + const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; + b[ 2] = p_1_0; // l=1,m=0 + // l=2 + const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); + b[ 6] = p_2_0; // l=2,m=0 + // l=3 + const REAL p_3_0 = z*(CONSTANT(1.865881662950577000)*z2 + CONSTANT(-1.119528997770346200)); + b[ 12] = p_3_0; // l=3,m=0 + // l=4 + const REAL p_4_0 = CONSTANT(1.984313483298443000)*z*p_3_0 + CONSTANT(-1.006230589874905300)*p_2_0; + b[ 20] = p_4_0; // l=4,m=0 + + + /* m=1 */ + + const REAL s1 = y; + const REAL c1 = x; + + // l=1 + const REAL p_1_1 = CONSTANT(-0.488602511902919920); + b[ 1] = p_1_1*s1; // l=1,m=-1 + b[ 3] = p_1_1*c1; // l=1,m=+1 + // l=2 + const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; + b[ 5] = p_2_1*s1; // l=2,m=-1 + b[ 7] = p_2_1*c1; // l=2,m=+1 + // l=3 + const REAL p_3_1 = CONSTANT(-2.285228997322328800)*z2 + CONSTANT(0.457045799464465770); + b[ 11] = p_3_1*s1; // l=3,m=-1 + b[ 13] = p_3_1*c1; // l=3,m=+1 + // l=4 + const REAL p_4_1 = z*(CONSTANT(-4.683325804901024000)*z2 + CONSTANT(2.007139630671867200)); + b[ 19] = p_4_1*s1; // l=4,m=-1 + b[ 21] = p_4_1*c1; // l=4,m=+1 + + + /* m=2 */ + + const REAL s2 = x*s1 + y*c1; + const REAL c2 = x*c1 - y*s1; + + // l=2 + const REAL p_2_2 = CONSTANT(0.546274215296039590); + b[ 4] = p_2_2*s2; // l=2,m=-2 + b[ 8] = p_2_2*c2; // l=2,m=+2 + // l=3 + const REAL p_3_2 = CONSTANT(1.445305721320277100)*z; + b[ 10] = p_3_2*s2; // l=3,m=-2 + b[ 14] = p_3_2*c2; // l=3,m=+2 + // l=4 + const REAL p_4_2 = CONSTANT(3.311611435151459800)*z2 + CONSTANT(-0.473087347878779980); + b[ 18] = p_4_2*s2; // l=4,m=-2 + b[ 22] = p_4_2*c2; // l=4,m=+2 + + + /* m=3 */ + + const REAL s3 = x*s2 + y*c2; + const REAL c3 = x*c2 - y*s2; + + // l=3 + const REAL p_3_3 = CONSTANT(-0.590043589926643520); + b[ 9] = p_3_3*s3; // l=3,m=-3 + b[ 15] = p_3_3*c3; // l=3,m=+3 + // l=4 + const REAL p_4_3 = CONSTANT(-1.770130769779930200)*z; + b[ 17] = p_4_3*s3; // l=4,m=-3 + b[ 23] = p_4_3*c3; // l=4,m=+3 + + + /* m=4 */ + + const REAL s4 = x*s3 + y*c3; + const REAL c4 = x*c3 - y*s3; + + // l=4 + const REAL p_4_4 = CONSTANT(0.625835735449176030); + b[ 16] = p_4_4*s4; // l=4,m=-4 + b[ 24] = p_4_4*c4; // l=4,m=+4 + } + + // routine generated programmatically for evaluating SH basis for degree 5 + // inputs (x,y,z) are a point on the sphere (i.e., must be unit length) + // output is vector b with SH basis evaluated at (x,y,z). 
+ // + static void sh_eval_basis_5(REAL x,REAL y,REAL z,REAL b[36]) + { + const REAL z2 = z*z; + + + /* m=0 */ + + // l=0 + const REAL p_0_0 = CONSTANT(0.282094791773878140); + b[ 0] = p_0_0; // l=0,m=0 + // l=1 + const REAL p_1_0 = CONSTANT(0.488602511902919920)*z; + b[ 2] = p_1_0; // l=1,m=0 + // l=2 + const REAL p_2_0 = CONSTANT(0.946174695757560080)*z2 + CONSTANT(-0.315391565252520050); + b[ 6] = p_2_0; // l=2,m=0 + // l=3 + const REAL p_3_0 = z*(CONSTANT(1.865881662950577000)*z2 + CONSTANT(-1.119528997770346200)); + b[ 12] = p_3_0; // l=3,m=0 + // l=4 + const REAL p_4_0 = CONSTANT(1.984313483298443000)*z*p_3_0 + CONSTANT(-1.006230589874905300)*p_2_0; + b[ 20] = p_4_0; // l=4,m=0 + // l=5 + const REAL p_5_0 = CONSTANT(1.989974874213239700)*z*p_4_0 + CONSTANT(-1.002853072844814000)*p_3_0; + b[ 30] = p_5_0; // l=5,m=0 + + + /* m=1 */ + + const REAL s1 = y; + const REAL c1 = x; + + // l=1 + const REAL p_1_1 = CONSTANT(-0.488602511902919920); + b[ 1] = p_1_1*s1; // l=1,m=-1 + b[ 3] = p_1_1*c1; // l=1,m=+1 + // l=2 + const REAL p_2_1 = CONSTANT(-1.092548430592079200)*z; + b[ 5] = p_2_1*s1; // l=2,m=-1 + b[ 7] = p_2_1*c1; // l=2,m=+1 + // l=3 + const REAL p_3_1 = CONSTANT(-2.285228997322328800)*z2 + CONSTANT(0.457045799464465770); + b[ 11] = p_3_1*s1; // l=3,m=-1 + b[ 13] = p_3_1*c1; // l=3,m=+1 + // l=4 + const REAL p_4_1 = z*(CONSTANT(-4.683325804901024000)*z2 + CONSTANT(2.007139630671867200)); + b[ 19] = p_4_1*s1; // l=4,m=-1 + b[ 21] = p_4_1*c1; // l=4,m=+1 + // l=5 + const REAL p_5_1 = CONSTANT(2.031009601158990200)*z*p_4_1 + CONSTANT(-0.991031208965114650)*p_3_1; + b[ 29] = p_5_1*s1; // l=5,m=-1 + b[ 31] = p_5_1*c1; // l=5,m=+1 + + + /* m=2 */ + + const REAL s2 = x*s1 + y*c1; + const REAL c2 = x*c1 - y*s1; + + // l=2 + const REAL p_2_2 = CONSTANT(0.546274215296039590); + b[ 4] = p_2_2*s2; // l=2,m=-2 + b[ 8] = p_2_2*c2; // l=2,m=+2 + // l=3 + const REAL p_3_2 = CONSTANT(1.445305721320277100)*z; + b[ 10] = p_3_2*s2; // l=3,m=-2 + b[ 14] = p_3_2*c2; // l=3,m=+2 + // l=4 + const REAL p_4_2 = CONSTANT(3.311611435151459800)*z2 + CONSTANT(-0.473087347878779980); + b[ 18] = p_4_2*s2; // l=4,m=-2 + b[ 22] = p_4_2*c2; // l=4,m=+2 + // l=5 + const REAL p_5_2 = z*(CONSTANT(7.190305177459987500)*z2 + CONSTANT(-2.396768392486662100)); + b[ 28] = p_5_2*s2; // l=5,m=-2 + b[ 32] = p_5_2*c2; // l=5,m=+2 + + + /* m=3 */ + + const REAL s3 = x*s2 + y*c2; + const REAL c3 = x*c2 - y*s2; + + // l=3 + const REAL p_3_3 = CONSTANT(-0.590043589926643520); + b[ 9] = p_3_3*s3; // l=3,m=-3 + b[ 15] = p_3_3*c3; // l=3,m=+3 + // l=4 + const REAL p_4_3 = CONSTANT(-1.770130769779930200)*z; + b[ 17] = p_4_3*s3; // l=4,m=-3 + b[ 23] = p_4_3*c3; // l=4,m=+3 + // l=5 + const REAL p_5_3 = CONSTANT(-4.403144694917253700)*z2 + CONSTANT(0.489238299435250430); + b[ 27] = p_5_3*s3; // l=5,m=-3 + b[ 33] = p_5_3*c3; // l=5,m=+3 + + + /* m=4 */ + + const REAL s4 = x*s3 + y*c3; + const REAL c4 = x*c3 - y*s3; + + // l=4 + const REAL p_4_4 = CONSTANT(0.625835735449176030); + b[ 16] = p_4_4*s4; // l=4,m=-4 + b[ 24] = p_4_4*c4; // l=4,m=+4 + // l=5 + const REAL p_5_4 = CONSTANT(2.075662314881041100)*z; + b[ 26] = p_5_4*s4; // l=5,m=-4 + b[ 34] = p_5_4*c4; // l=5,m=+4 + + + /* m=5 */ + + const REAL s5 = x*s4 + y*c4; + const REAL c5 = x*c4 - y*s4; + + // l=5 + const REAL p_5_5 = CONSTANT(-0.656382056840170150); + b[ 25] = p_5_5*s5; // l=5,m=-5 + b[ 35] = p_5_5*c5; // l=5,m=+5 + } + + static const REAL M_PIjs = (REAL) (4.0*atan(1.0)); + static const REAL maxang = (REAL) (M_PIjs/2); + static const int NSH0 = 1; + static const int NSH1 = 4; 
+ static const int NSH2 = 9; + static const int NSH3 = 16; + static const int NSH4 = 25; + static const int NSH5 = 36; + static const int NSH6 = 49; + static const int NSH7 = 64; + static const int NSH8 = 81; + static const int NSH9 = 100; + static const int NL0 = 1; + static const int NL1 = 3; + static const int NL2 = 5; + static const int NL3 = 7; + static const int NL4 = 9; + static const int NL5 = 11; + static const int NL6 = 13; + static const int NL7 = 15; + static const int NL8 = 17; + static const int NL9 = 19; + + static inline void rot(REAL ct,REAL st,REAL x,REAL y,REAL &xout,REAL &yout) + { + xout = x*ct - y*st; + yout = y*ct + x*st; + } + + static inline void rot_inv(REAL ct,REAL st,REAL x,REAL y,REAL &xout,REAL &yout) + { + xout = x*ct + y*st; + yout = y*ct - x*st; + } + + static inline void rot_1(REAL ct,REAL st,REAL ctm[1],REAL stm[1]) + { + ctm[0] = ct; + stm[0] = st; + } + + static inline void rot_2(REAL ct,REAL st,REAL ctm[2],REAL stm[2]) + { + REAL ct2 = CONSTANT(2.0)*ct; + ctm[0] = ct; + stm[0] = st; + ctm[1] = ct2*ct-CONSTANT(1.0); + stm[1] = ct2*st; + } + + static inline void rot_3(REAL ct,REAL st,REAL ctm[3],REAL stm[3]) + { + REAL ct2 = CONSTANT(2.0)*ct; + ctm[0] = ct; + stm[0] = st; + ctm[1] = ct2*ct-CONSTANT(1.0); + stm[1] = ct2*st; + ctm[2] = ct2*ctm[1] - ct; + stm[2] = ct2*stm[1] - st; + } + + static inline void rot_4(REAL ct,REAL st,REAL ctm[4],REAL stm[4]) + { + REAL ct2 = CONSTANT(2.0)*ct; + ctm[0] = ct; + stm[0] = st; + ctm[1] = ct2*ct-CONSTANT(1.0); + stm[1] = ct2*st; + ctm[2] = ct2*ctm[1] - ct; + stm[2] = ct2*stm[1] - st; + ctm[3] = ct2*ctm[2] - ctm[1]; + stm[3] = ct2*stm[2] - stm[1]; + } + + static inline void rot_5(REAL ct,REAL st,REAL ctm[5],REAL stm[5]) + { + REAL ct2 = CONSTANT(2.0)*ct; + ctm[0] = ct; + stm[0] = st; + ctm[1] = ct2*ct-CONSTANT(1.0); + stm[1] = ct2*st; + ctm[2] = ct2*ctm[1] - ct; + stm[2] = ct2*stm[1] - st; + ctm[3] = ct2*ctm[2] - ctm[1]; + stm[3] = ct2*stm[2] - stm[1]; + ctm[4] = ct2*ctm[3] - ctm[2]; + stm[4] = ct2*stm[3] - stm[2]; + } + + static inline void sh_rotz_1(REAL ctm[1],REAL stm[1],REAL y[NL1],REAL yr[NL1]) + { + yr[1] = y[1]; + rot_inv(ctm[0],stm[0],y[0],y[2],yr[0],yr[2]); + } + + static inline void sh_rotz_2(REAL ctm[2],REAL stm[2],REAL y[NL2],REAL yr[NL2]) + { + yr[2] = y[2]; + rot_inv(ctm[0],stm[0],y[1],y[3],yr[1],yr[3]); + rot_inv(ctm[1],stm[1],y[0],y[4],yr[0],yr[4]); + } + + static inline void sh_rotz_3(REAL ctm[3],REAL stm[3],REAL y[NL3],REAL yr[NL3]) + { + yr[3] = y[3]; + rot_inv(ctm[0],stm[0],y[2],y[4],yr[2],yr[4]); + rot_inv(ctm[1],stm[1],y[1],y[5],yr[1],yr[5]); + rot_inv(ctm[2],stm[2],y[0],y[6],yr[0],yr[6]); + } + + static inline void sh_rotz_4(REAL ctm[4],REAL stm[4],REAL y[NL4],REAL yr[NL4]) + { + yr[4] = y[4]; + rot_inv(ctm[0],stm[0],y[3],y[5],yr[3],yr[5]); + rot_inv(ctm[1],stm[1],y[2],y[6],yr[2],yr[6]); + rot_inv(ctm[2],stm[2],y[1],y[7],yr[1],yr[7]); + rot_inv(ctm[3],stm[3],y[0],y[8],yr[0],yr[8]); + } + + static inline void sh_rotz_5(REAL ctm[5],REAL stm[5],REAL y[NL5],REAL yr[NL5]) + { + yr[5] = y[5]; + rot_inv(ctm[0],stm[0],y[4],y[6],yr[4],yr[6]); + rot_inv(ctm[1],stm[1],y[3],y[7],yr[3],yr[7]); + rot_inv(ctm[2],stm[2],y[2],y[8],yr[2],yr[8]); + rot_inv(ctm[3],stm[3],y[1],y[9],yr[1],yr[9]); + rot_inv(ctm[4],stm[4],y[0],y[10],yr[0],yr[10]); + } + + // rotation code generated programmatically by rotatex (2000x4000 samples, eps=1e-008) + + static REAL fx_1_001 = (REAL) ( sqrt(1.0)/1.0); // 1 + static REAL fx_1_002 = (REAL) (-sqrt(1.0)/1.0); // -1.00000030843 + + static inline void sh_rotx90_1(REAL y[],REAL yr[]) + 
{ + yr[ 0] = fx_1_001*y[ 1]; + yr[ 1] = fx_1_002*y[ 0]; + yr[ 2] = fx_1_001*y[ 2]; + }; + + static inline void sh_rotx90_inv_1(REAL y[],REAL yr[]) + { + yr[ 0] = fx_1_002*y[ 1]; + yr[ 1] = fx_1_001*y[ 0]; + yr[ 2] = fx_1_001*y[ 2]; + } + + static REAL fx_2_001 = (REAL) ( sqrt(4.0)/2.0); // 1 + static REAL fx_2_002 = (REAL) (-sqrt(4.0)/2.0); // -1 + static REAL fx_2_003 = (REAL) (-sqrt(1.0)/2.0); // -0.500000257021 + static REAL fx_2_004 = (REAL) (-sqrt(3.0)/2.0); // -0.866025848959 + static REAL fx_2_005 = (REAL) ( sqrt(1.0)/2.0); // 0.5 + + static inline void sh_rotx90_2(REAL y[],REAL yr[]) + { + yr[ 0] = fx_2_001*y[ 3]; + yr[ 1] = fx_2_002*y[ 1]; + yr[ 2] = fx_2_003*y[ 2]+fx_2_004*y[ 4]; + yr[ 3] = fx_2_002*y[ 0]; + yr[ 4] = fx_2_004*y[ 2]+fx_2_005*y[ 4]; + }; + + static inline void sh_rotx90_inv_2(REAL y[],REAL yr[]) + { + yr[ 0] = fx_2_002*y[ 3]; + yr[ 1] = fx_2_002*y[ 1]; + yr[ 2] = fx_2_003*y[ 2]+fx_2_004*y[ 4]; + yr[ 3] = fx_2_001*y[ 0]; + yr[ 4] = fx_2_004*y[ 2]+fx_2_005*y[ 4]; + } + + static REAL fx_3_001 = (REAL) (-sqrt(10.0)/4.0); // -0.790569415042 + static REAL fx_3_002 = (REAL) ( sqrt(6.0)/4.0); // 0.612372435696 + static REAL fx_3_003 = (REAL) (-sqrt(16.0)/4.0); // -1 + static REAL fx_3_004 = (REAL) (-sqrt(6.0)/4.0); // -0.612372435695 + static REAL fx_3_005 = (REAL) (-sqrt(1.0)/4.0); // -0.25 + static REAL fx_3_006 = (REAL) (-sqrt(15.0)/4.0); // -0.968245836551 + static REAL fx_3_007 = (REAL) ( sqrt(1.0)/4.0); // 0.25 + static REAL fx_3_008 = (REAL) ( sqrt(10.0)/4.0); // 0.790569983984 + + static inline void sh_rotx90_3(REAL y[],REAL yr[]) + { + yr[ 0] = fx_3_001*y[ 3]+fx_3_002*y[ 5]; + yr[ 1] = fx_3_003*y[ 1]; + yr[ 2] = fx_3_004*y[ 3]+fx_3_001*y[ 5]; + yr[ 3] = fx_3_008*y[ 0]+fx_3_002*y[ 2]; + yr[ 4] = fx_3_005*y[ 4]+fx_3_006*y[ 6]; + yr[ 5] = fx_3_004*y[ 0]-fx_3_001*y[ 2]; + yr[ 6] = fx_3_006*y[ 4]+fx_3_007*y[ 6]; + }; + + static inline void sh_rotx90_inv_3(REAL y[],REAL yr[]) + { + yr[ 0] = fx_3_008*y[ 3]+fx_3_004*y[ 5]; + yr[ 1] = fx_3_003*y[ 1]; + yr[ 2] = fx_3_002*y[ 3]-fx_3_001*y[ 5]; + yr[ 3] = fx_3_001*y[ 0]+fx_3_004*y[ 2]; + yr[ 4] = fx_3_005*y[ 4]+fx_3_006*y[ 6]; + yr[ 5] = fx_3_002*y[ 0]+fx_3_001*y[ 2]; + yr[ 6] = fx_3_006*y[ 4]+fx_3_007*y[ 6]; + } + + static REAL fx_4_001 = (REAL) (-sqrt(56.0)/8.0); // -0.935414346694 + static REAL fx_4_002 = (REAL) ( sqrt(8.0)/8.0); // 0.353553390593 + static REAL fx_4_003 = (REAL) (-sqrt(36.0)/8.0); // -0.75 + static REAL fx_4_004 = (REAL) ( sqrt(28.0)/8.0); // 0.661437827766 + static REAL fx_4_005 = (REAL) (-sqrt(8.0)/8.0); // -0.353553390593 + static REAL fx_4_006 = (REAL) ( sqrt(36.0)/8.0); // 0.749999999999 + static REAL fx_4_007 = (REAL) ( sqrt(9.0)/8.0); // 0.37500034698 + static REAL fx_4_008 = (REAL) ( sqrt(20.0)/8.0); // 0.559017511622 + static REAL fx_4_009 = (REAL) ( sqrt(35.0)/8.0); // 0.739510657141 + static REAL fx_4_010 = (REAL) ( sqrt(16.0)/8.0); // 0.5 + static REAL fx_4_011 = (REAL) (-sqrt(28.0)/8.0); // -0.661437827766 + static REAL fx_4_012 = (REAL) ( sqrt(1.0)/8.0); // 0.125 + static REAL fx_4_013 = (REAL) ( sqrt(56.0)/8.0); // 0.935414346692 + + static inline void sh_rotx90_4(REAL y[],REAL yr[]) + { + yr[ 0] = fx_4_001*y[ 5]+fx_4_002*y[ 7]; + yr[ 1] = fx_4_003*y[ 1]+fx_4_004*y[ 3]; + yr[ 2] = fx_4_005*y[ 5]+fx_4_001*y[ 7]; + yr[ 3] = fx_4_004*y[ 1]+fx_4_006*y[ 3]; + yr[ 4] = fx_4_007*y[ 4]+fx_4_008*y[ 6]+fx_4_009*y[ 8]; + yr[ 5] = fx_4_013*y[ 0]+fx_4_002*y[ 2]; + yr[ 6] = fx_4_008*y[ 4]+fx_4_010*y[ 6]+fx_4_011*y[ 8]; + yr[ 7] = fx_4_005*y[ 0]-fx_4_001*y[ 2]; + yr[ 8] = fx_4_009*y[ 4]+fx_4_011*y[ 
6]+fx_4_012*y[ 8]; + }; + + static inline void sh_rotx90_inv_4(REAL y[],REAL yr[]) + { + yr[ 0] = fx_4_013*y[ 5]+fx_4_005*y[ 7]; + yr[ 1] = fx_4_003*y[ 1]+fx_4_004*y[ 3]; + yr[ 2] = fx_4_002*y[ 5]-fx_4_001*y[ 7]; + yr[ 3] = fx_4_004*y[ 1]+fx_4_006*y[ 3]; + yr[ 4] = fx_4_007*y[ 4]+fx_4_008*y[ 6]+fx_4_009*y[ 8]; + yr[ 5] = fx_4_001*y[ 0]+fx_4_005*y[ 2]; + yr[ 6] = fx_4_008*y[ 4]+fx_4_010*y[ 6]+fx_4_011*y[ 8]; + yr[ 7] = fx_4_002*y[ 0]+fx_4_001*y[ 2]; + yr[ 8] = fx_4_009*y[ 4]+fx_4_011*y[ 6]+fx_4_012*y[ 8]; + } + + static REAL fx_5_001 = (REAL) ( sqrt(126.0)/16.0); // 0.70156076002 + static REAL fx_5_002 = (REAL) (-sqrt(120.0)/16.0); // -0.684653196882 + static REAL fx_5_003 = (REAL) ( sqrt(10.0)/16.0); // 0.197642353761 + static REAL fx_5_004 = (REAL) (-sqrt(64.0)/16.0); // -0.5 + static REAL fx_5_005 = (REAL) ( sqrt(192.0)/16.0); // 0.866025403784 + static REAL fx_5_006 = (REAL) ( sqrt(70.0)/16.0); // 0.522912516584 + static REAL fx_5_007 = (REAL) ( sqrt(24.0)/16.0); // 0.306186217848 + static REAL fx_5_008 = (REAL) (-sqrt(162.0)/16.0); // -0.795495128835 + static REAL fx_5_009 = (REAL) ( sqrt(64.0)/16.0); // 0.5 + static REAL fx_5_010 = (REAL) ( sqrt(60.0)/16.0); // 0.484122918274 + static REAL fx_5_011 = (REAL) ( sqrt(112.0)/16.0); // 0.661437827763 + static REAL fx_5_012 = (REAL) ( sqrt(84.0)/16.0); // 0.572821961867 + static REAL fx_5_013 = (REAL) ( sqrt(4.0)/16.0); // 0.125 + static REAL fx_5_014 = (REAL) ( sqrt(42.0)/16.0); // 0.405046293649 + static REAL fx_5_015 = (REAL) ( sqrt(210.0)/16.0); // 0.905711046633 + static REAL fx_5_016 = (REAL) ( sqrt(169.0)/16.0); // 0.8125 + static REAL fx_5_017 = (REAL) (-sqrt(45.0)/16.0); // -0.419262745781 + static REAL fx_5_018 = (REAL) ( sqrt(1.0)/16.0); // 0.0625 + static REAL fx_5_019 = (REAL) (-sqrt(126.0)/16.0); // -0.701561553415 + static REAL fx_5_020 = (REAL) ( sqrt(120.0)/16.0); // 0.684653196881 + static REAL fx_5_021 = (REAL) (-sqrt(10.0)/16.0); // -0.197642353761 + static REAL fx_5_022 = (REAL) (-sqrt(70.0)/16.0); // -0.522913107945 + static REAL fx_5_023 = (REAL) (-sqrt(60.0)/16.0); // -0.48412346577 + + static inline void sh_rotx90_5(REAL y[],REAL yr[]) + { + yr[ 0] = fx_5_001*y[ 5]+fx_5_002*y[ 7]+fx_5_003*y[ 9]; + yr[ 1] = fx_5_004*y[ 1]+fx_5_005*y[ 3]; + yr[ 2] = fx_5_006*y[ 5]+fx_5_007*y[ 7]+fx_5_008*y[ 9]; + yr[ 3] = fx_5_005*y[ 1]+fx_5_009*y[ 3]; + yr[ 4] = fx_5_010*y[ 5]+fx_5_011*y[ 7]+fx_5_012*y[ 9]; + yr[ 5] = fx_5_019*y[ 0]+fx_5_022*y[ 2]+fx_5_023*y[ 4]; + yr[ 6] = fx_5_013*y[ 6]+fx_5_014*y[ 8]+fx_5_015*y[ 10]; + yr[ 7] = fx_5_020*y[ 0]-fx_5_007*y[ 2]-fx_5_011*y[ 4]; + yr[ 8] = fx_5_014*y[ 6]+fx_5_016*y[ 8]+fx_5_017*y[ 10]; + yr[ 9] = fx_5_021*y[ 0]-fx_5_008*y[ 2]-fx_5_012*y[ 4]; + yr[ 10] = fx_5_015*y[ 6]+fx_5_017*y[ 8]+fx_5_018*y[ 10]; + }; + + static inline void sh_rotx90_inv_5(REAL y[],REAL yr[]) + { + yr[ 0] = fx_5_019*y[ 5]+fx_5_020*y[ 7]+fx_5_021*y[ 9]; + yr[ 1] = fx_5_004*y[ 1]+fx_5_005*y[ 3]; + yr[ 2] = fx_5_022*y[ 5]-fx_5_007*y[ 7]-fx_5_008*y[ 9]; + yr[ 3] = fx_5_005*y[ 1]+fx_5_009*y[ 3]; + yr[ 4] = fx_5_023*y[ 5]-fx_5_011*y[ 7]-fx_5_012*y[ 9]; + yr[ 5] = fx_5_001*y[ 0]+fx_5_006*y[ 2]+fx_5_010*y[ 4]; + yr[ 6] = fx_5_013*y[ 6]+fx_5_014*y[ 8]+fx_5_015*y[ 10]; + yr[ 7] = fx_5_002*y[ 0]+fx_5_007*y[ 2]+fx_5_011*y[ 4]; + yr[ 8] = fx_5_014*y[ 6]+fx_5_016*y[ 8]+fx_5_017*y[ 10]; + yr[ 9] = fx_5_003*y[ 0]+fx_5_008*y[ 2]+fx_5_012*y[ 4]; + yr[ 10] = fx_5_015*y[ 6]+fx_5_017*y[ 8]+fx_5_018*y[ 10]; + } + + static inline void sh_rot_1(REAL m[3*3],REAL y[NL1],REAL yr[NL1]) + { + REAL yr0 = m[4]*y[0] - m[5]*y[1] + m[3]*y[2]; + REAL 
yr1 = m[8]*y[1] - m[7]*y[0] - m[6]*y[2]; + REAL yr2 = m[1]*y[0] - m[2]*y[1] + m[0]*y[2]; + + yr[0] = yr0; + yr[1] = yr1; + yr[2] = yr2; + } + + static inline void sh_roty_1(REAL ctm[1],REAL stm[1],REAL y[NL1],REAL yr[NL1]) + { + yr[0] = y[0]; + rot_inv(ctm[0],stm[0],y[1],y[2],yr[1],yr[2]); + } + + static inline void sh_roty_2(REAL ctm[2],REAL stm[2],REAL y[NL2],REAL yr[NL2]) + { + REAL ytmp[NL2]; + sh_rotx90_2(y,yr); + sh_rotz_2(ctm,stm,yr,ytmp); + sh_rotx90_inv_2(ytmp,yr); + } + + static inline void sh_roty_3(REAL ctm[3],REAL stm[3],REAL y[NL3],REAL yr[NL3]) + { + REAL ytmp[NL3]; + sh_rotx90_3(y,yr); + sh_rotz_3(ctm,stm,yr,ytmp); + sh_rotx90_inv_3(ytmp,yr); + } + + static inline void sh_roty_4(REAL ctm[4],REAL stm[4],REAL y[NL4],REAL yr[NL4]) + { + REAL ytmp[NL4]; + sh_rotx90_4(y,yr); + sh_rotz_4(ctm,stm,yr,ytmp); + sh_rotx90_inv_4(ytmp,yr); + } + + static inline void sh_roty_5(REAL ctm[5],REAL stm[5],REAL y[NL5],REAL yr[NL5]) + { + REAL ytmp[NL5]; + sh_rotx90_5(y,yr); + sh_rotz_5(ctm,stm,yr,ytmp); + sh_rotx90_inv_5(ytmp,yr); + } + + #define ROT_TOL CONSTANT(1e-4) + + /* + Finds cosine,sine pairs for zyz rotation (i.e. rotation R_z2 R_y R_z1 v). + The rotation is one which maps mx to (1,0,0) and mz to (0,0,1). + */ + static inline void zyz(REAL m[3*3],REAL &zc1,REAL &zs1,REAL &yc,REAL &ys,REAL &zc2,REAL &zs2) + { + REAL cz = m[8]; + + // rotate so that (cx,cy,0) aligns to (1,0,0) + REAL cxylen = (REAL) sqrtf(1.0f - cz*cz); + if (cxylen >= ROT_TOL) + { + // if above is a NaN, will do the correct thing + yc = cz; + ys = cxylen; + REAL len67inv = 1.0f/sqrtf(m[6]*m[6] + m[7]*m[7]); + zc1 = -m[6]*len67inv; + zs1 = m[7]*len67inv; + REAL len25inv = 1.0f/sqrtf(m[2]*m[2] + m[5]*m[5]); + zc2 = m[2]*len25inv; + zs2 = m[5]*len25inv; + } else { // m[6],m[7],m[8] already aligned to (0,0,1) + zc1 = 1.0; zs1 = 0.0; // identity + yc = cz; ys = 0.0; // identity + zc2 = m[0]*cz; zs2 = -m[1]; // align x axis (mx[0],mx[1],0) to (1,0,0) + } + } + + static inline void sh_rotzyz_2(REAL zc1m[2],REAL zs1m[2],REAL ycm[2],REAL ysm[2],REAL zc2m[2],REAL zs2m[2],REAL y[NL2],REAL yr[NL2]) + { + REAL ytmp[NL2]; + sh_rotz_2(zc1m,zs1m,y,yr); + sh_roty_2(ycm,ysm,yr,ytmp); + sh_rotz_2(zc2m,zs2m,ytmp,yr); + } + + static inline void sh_rotzyz_3(REAL zc1m[3],REAL zs1m[3],REAL ycm[3],REAL ysm[3],REAL zc2m[3],REAL zs2m[3],REAL y[NL3],REAL yr[NL3]) + { + REAL ytmp[NL3]; + sh_rotz_3(zc1m,zs1m,y,yr); + sh_roty_3(ycm,ysm,yr,ytmp); + sh_rotz_3(zc2m,zs2m,ytmp,yr); + } + + static inline void sh_rotzyz_4(REAL zc1m[4],REAL zs1m[4],REAL ycm[4],REAL ysm[4],REAL zc2m[4],REAL zs2m[4],REAL y[NL4],REAL yr[NL4]) + { + REAL ytmp[NL4]; + sh_rotz_4(zc1m,zs1m,y,yr); + sh_roty_4(ycm,ysm,yr,ytmp); + sh_rotz_4(zc2m,zs2m,ytmp,yr); + } + + static inline void sh_rotzyz_5(REAL zc1m[5],REAL zs1m[5],REAL ycm[5],REAL ysm[5],REAL zc2m[5],REAL zs2m[5],REAL y[NL5],REAL yr[NL5]) + { + REAL ytmp[NL5]; + sh_rotz_5(zc1m,zs1m,y,yr); + sh_roty_5(ycm,ysm,yr,ytmp); + sh_rotz_5(zc2m,zs2m,ytmp,yr); + } + + static inline void sh3_rot(REAL m[3*3],REAL zc1,REAL zs1,REAL yc,REAL ys,REAL zc2,REAL zs2,REAL y[NSH3],REAL yr[NSH3]) + { + REAL zc1m[3],zs1m[3]; + rot_3(zc1,zs1,zc1m,zs1m); + REAL ycm[3],ysm[3]; + rot_3(yc,ys,ycm,ysm); + REAL zc2m[3],zs2m[3]; + rot_3(zc2,zs2,zc2m,zs2m); + + yr[0] = y[0]; + sh_rot_1(m,y+NSH0,yr+NSH0); + sh_rotzyz_2(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH1,yr+NSH1); + sh_rotzyz_3(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH2,yr+NSH2); + } + + static inline void sh4_rot(REAL m[3*3],REAL zc1,REAL zs1,REAL yc,REAL ys,REAL zc2,REAL zs2,REAL y[NSH4],REAL yr[NSH4]) + { + REAL 
zc1m[4],zs1m[4]; + rot_4(zc1,zs1,zc1m,zs1m); + REAL ycm[4],ysm[4]; + rot_4(yc,ys,ycm,ysm); + REAL zc2m[4],zs2m[4]; + rot_4(zc2,zs2,zc2m,zs2m); + + yr[0] = y[0]; + sh_rot_1(m,y+NSH0,yr+NSH0); + sh_rotzyz_2(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH1,yr+NSH1); + sh_rotzyz_3(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH2,yr+NSH2); + sh_rotzyz_4(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH3,yr+NSH3); + } + + static inline void sh5_rot(REAL m[3*3],REAL zc1,REAL zs1,REAL yc,REAL ys,REAL zc2,REAL zs2,REAL y[NSH5],REAL yr[NSH5]) + { + REAL zc1m[5],zs1m[5]; + rot_5(zc1,zs1,zc1m,zs1m); + REAL ycm[5],ysm[5]; + rot_5(yc,ys,ycm,ysm); + REAL zc2m[5],zs2m[5]; + rot_5(zc2,zs2,zc2m,zs2m); + + yr[0] = y[0]; + sh_rot_1(m,y+NSH0,yr+NSH0); + sh_rotzyz_2(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH1,yr+NSH1); + sh_rotzyz_3(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH2,yr+NSH2); + sh_rotzyz_4(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH3,yr+NSH3); + sh_rotzyz_5(zc1m,zs1m,ycm,ysm,zc2m,zs2m,y+NSH4,yr+NSH4); + } + + inline void sh1_rot(REAL m[3*3],REAL y[NSH1],REAL yr[NSH1]) + { + yr[0] = y[0]; + sh_rot_1(m,y+NSH0,yr+NSH0); + } + + inline void sh3_rot(REAL m[3*3],REAL y[NSH3],REAL yr[NSH3]) + { + REAL zc1,zs1,yc,ys,zc2,zs2; + zyz(m,zc1,zs1,yc,ys,zc2,zs2); + sh3_rot(m,zc1,zs1,yc,ys,zc2,zs2,y,yr); + } + + inline void sh4_rot(REAL m[3*3],REAL y[NSH4],REAL yr[NSH4]) + { + REAL zc1,zs1,yc,ys,zc2,zs2; + zyz(m,zc1,zs1,yc,ys,zc2,zs2); + sh4_rot(m,zc1,zs1,yc,ys,zc2,zs2,y,yr); + } + + inline void sh5_rot(REAL m[3*3],REAL y[NSH5],REAL yr[NSH5]) + { + REAL zc1,zs1,yc,ys,zc2,zs2; + zyz(m,zc1,zs1,yc,ys,zc2,zs2); + sh5_rot(m,zc1,zs1,yc,ys,zc2,zs2,y,yr); + } + + // simple matrix vector multiply for a square matrix (only used by ZRotation) + static inline void SimpMatMul(size_t dim, const float *matrix, const float *input, float *result) + { + for(size_t iR=0; iR < dim; ++iR) + { + result[iR + 0] = matrix[iR*dim + 0] * input[0]; + for(size_t iC=1; iC < dim; ++iC) + { + result[iR] += matrix[iR*dim+ iC] * input[iC]; + } + } + } + +}; // anonymous namespace + + +namespace DirectX +{ + +//------------------------------------------------------------------------------------- +// Evaluates the Spherical Harmonic basis functions +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205448.aspx +//------------------------------------------------------------------------------------- +float* XM_CALLCONV XMSHEvalDirection( _Out_writes_(order*order) float *result, + _In_ size_t order, + _In_ FXMVECTOR dir ) +{ + if ( !result ) + return nullptr; + + XMFLOAT4A dv; + XMStoreFloat4A( &dv, dir ); + + const float fX = dv.x; + const float fY = dv.y; + const float fZ = dv.z; + + switch( order ) + { + case 2: + sh_eval_basis_1(fX,fY,fZ,result); + break; + + case 3: + sh_eval_basis_2(fX,fY,fZ,result); + break; + + case 4: + sh_eval_basis_3(fX,fY,fZ,result); + break; + + case 5: + sh_eval_basis_4(fX,fY,fZ,result); + break; + + case 6: + sh_eval_basis_5(fX,fY,fZ,result); + break; + + default: + assert( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ); + return nullptr; + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Rotates SH vector by a rotation matrix +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204992.aspx +//------------------------------------------------------------------------------------- +float* XM_CALLCONV XMSHRotate( _Out_writes_(order*order) float *result, + _In_ size_t order, + _In_ FXMMATRIX rotMatrix, + _In_reads_(order*order) const float *input ) +{ + if ( !result || !input ) + return nullptr; + 
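// XMSHRotate cannot run in place: the band rotations read 'input' repeatedly
// while 'result' is being written, so aliased buffers are rejected just below.
// A minimal usage sketch (hypothetical caller code; order 3 and a Y-axis
// rotation are assumptions for illustration):
//
//   float sh[9], shRot[9];
//   XMSHEvalDirection( sh, 3, XMVectorSet( 0.f, 0.f, 1.f, 0.f ) );
//   XMSHRotate( shRot, 3, XMMatrixRotationY( XM_PIDIV2 ), sh );  // distinct buffers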
+ + if( result == input ) + return nullptr; + + XMFLOAT3X3 mat; + XMStoreFloat3x3( &mat, rotMatrix ); + + float mRot[3*3]; + const float r00 = mRot[0*3 +0] = mat._11; + const float r10 = mRot[1*3 +0] = mat._12; + const float r20 = mRot[2*3 +0] = mat._13; + + const float r01 = mRot[0*3 +1] = mat._21; + const float r11 = mRot[1*3 +1] = mat._22; + const float r21 = mRot[2*3 +1] = mat._23; + + const float r02 = mRot[0*3 +2] = mat._31; + const float r12 = mRot[1*3 +2] = mat._32; + const float r22 = mRot[2*3 +2] = mat._33; + + result[0] = input[0]; // rotate the constant term + + switch (order) + { + case 2: + { + // do linear by hand... + + result[1] = r11*input[1] - r12*input[2] + r10*input[3]; + result[2] = -r21*input[1] + r22*input[2] - r20*input[3]; + result[3] = r01*input[1] -r02*input[2] + r00*input[3]; + } + break; + + case 3: + { + float R[25]; + // do linear by hand... + + result[1] = r11*input[1] - r12*input[2] + r10*input[3]; + result[2] = -r21*input[1] + r22*input[2] - r20*input[3]; + result[3] = r01*input[1] -r02*input[2] + r00*input[3]; + + // direct code for quadratics is faster than ZYZ recurrence relations + + const float t41 = r01 * r00; + const float t43 = r11 * r10; + const float t48 = r11 * r12; + const float t50 = r01 * r02; + const float t55 = r02 * r02; + const float t57 = r22 * r22; + const float t58 = r12 * r12; + const float t61 = r00 * r02; + const float t63 = r10 * r12; + const float t68 = r10 * r10; + const float t70 = r01 * r01; + const float t72 = r11 * r11; + const float t74 = r00 * r00; + const float t76 = r21 * r21; + const float t78 = r20 * r20; + + const float v173 = 0.1732050808e1f; + const float v577 = 0.5773502693e0f; + const float v115 = 0.1154700539e1f; + const float v288 = 0.2886751347e0f; + const float v866 = 0.8660254040e0f; + + R[0] = r11 * r00 + r01 * r10; + R[1] = - r01 * r12 - r11 * r02; + R[2] = v173 * r02 * r12; + R[3] = - r10 * r02 - r00 * r12; + R[4] = r00 * r10 - r01 * r11; + R[5] = - r11 * r20 - r21 * r10; + R[6] = r11 * r22 + r21 * r12; + R[7] = -v173 * r22 * r12; + R[8] = r20 * r12 + r10 * r22; + R[9] = - r10 * r20 + r11 * r21; + R[10] = - v577* (t41 + t43) + v115 * r21 * r20; + R[11] = v577* (t48 + t50) - v115 * r21 * r22; + R[12] = -0.5000000000e0f * (t55 + t58) + t57; + R[13] = v577 * (t61 + t63) - v115 * r20 * r22; + R[14] = v288 * (t70 - t68 + t72 - t74) - v577 * (t76 - t78); + R[15] = - r01 * r20 - r21 * r00; + R[16] = r01 * r22 + r21 * r02; + R[17] = -v173 * r22 * r02; + R[18] = r00 * r22 + r20 * r02; + R[19] = - r00 * r20 + r01 * r21; + R[20] = t41 - t43; + R[21] = - t50 + t48; + R[22] = v866 * (t55 - t58); + R[23] = t63 - t61; + R[24] = 0.5000000000e0f *( t74 - t68 - t70 + t72); + + // blow the matrix multiply out by hand, looping is inefficient on a P4...
+ for(unsigned int iR=0; iR<5;iR++) + { + const unsigned int uBase = iR*5; + result[4 + iR] = R[uBase + 0]*input[4] + R[uBase + 1]*input[5] + R[uBase + 2]*input[6] + R[uBase + 3]*input[7] + R[uBase + 4]*input[8]; + } + } + break; + + case 4: + sh3_rot(mRot,const_cast<float *>(input),result); + break; + + case 5: + sh4_rot(mRot,const_cast<float *>(input),result); + break; + + case 6: + sh5_rot(mRot,const_cast<float *>(input),result); + break; + + default: + assert( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ); + return nullptr; + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Rotates the SH vector in the Z axis by an angle +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205461.aspx +//------------------------------------------------------------------------------------- +float* XMSHRotateZ( _Out_writes_(order*order) float *result, + _In_ size_t order, + _In_ float angle, + _In_reads_(order*order) const float *input ) +{ + if ( !result || !input ) + return nullptr; + + if( result == input ) + return nullptr; + + if ( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ) + return nullptr; + + float R[(2*(XM_SH_MAXORDER-1) + 1)*(2* (XM_SH_MAXORDER-1) + 1)]; // used to store rotation matrices... + + // these are actually very sparse matrices, most of the entries are zeros... + + const float ca = cosf(angle); + const float sa = sinf(angle); + + const float t1 = ca; + const float t2 = sa; + R[0] = t1; + R[1] = 0.0f; + R[2] = t2; + R[3] = 0.0f; + R[4] = 1.0f; + R[5] = 0.0f; + R[6] = -t2; + R[7] = 0.0f; + R[8] = t1; + + result[0] = input[0]; + SimpMatMul(3,R,input+1,result+1); + + if (order > 2) + { + for(int j=0;j<5*5;j++) R[j] = 0.0f; + const float t1 = sa; + const float t2 = t1*t1; + const float t3 = ca; + const float t4 = t3*t3; + const float t5 = -t2+t4; + const float t7 = 2.0f*t3*t1; + R[0] = t5; + R[4] = t7; + R[6] = t3; + R[8] = t1; + R[12] = 1.0f; + R[16] = -t1; + R[18] = t3; + R[20] = -t7; + R[24] = t5; + + SimpMatMul(5,R,input+4,result+4); // un-roll matrix/vector multiply + if (order > 3) + { + for(int j=0;j<7*7;j++) R[j] = 0.0f; + const float t1 = ca; + const float t2 = t1*t1; + const float t4 = sa; + const float t5 = t4*t4; + const float t8 = t2*t1-3.0f*t1*t5; + const float t12 = 3.0f*t4*t2-t5*t4; + const float t13 = -t5+t2; + const float t15 = 2.0f*t1*t4; + R[0] = t8; + R[6] = t12; + R[8] = t13; + R[12] = t15; + R[16] = t1; + R[18] = t4; + R[24] = 1.0f; + R[30] = -t4; + R[32] = t1; + R[36] = -t15; + R[40] = t13; + R[42] = -t12; + R[48] = t8; + SimpMatMul(7,R,input+9,result+9); + if (order > 4) + { + for(int j=0;j<=9*9;j++) R[j] = 0.0f; + const float t1 = ca; + const float t2 = t1*t1; + const float t3 = t2*t2; + const float t4 = sa; + const float t5 = t4*t4; + const float t6 = t5*t5; + const float t9 = t3+t6-6.0f*t5*t2; + const float t10 = t5*t4; + const float t12 = t2*t1; + const float t14 = -t10*t1+t4*t12; + const float t17 = t12-3.0f*t1*t5; + const float t20 = 3.0f*t4*t2-t10; + const float t21 = -t5+t2; + const float t23 = 2.0f*t1*t4; + R[0] = t9; + R[8] = 4.0f*t14; + R[10] = t17; + R[16] = t20; + R[20] = t21; + R[24] = t23; + R[30] = t1; + R[32] = t4; + R[40] = 1.0f; + R[48] = -t4; + R[50] = t1; + R[56] = -t23; + R[60] = t21; + R[64] = -t20; + R[70] = t17; + R[72] = -4.0f*t14; + R[80] = t9; + + SimpMatMul(9,R,input+16,result+16); + if (order > 5) + { + for(int j=0;j<11*11;j++) R[j] = 0.0f; + const float t1 = ca; + const float t2 = sa; + const float t3 = t2*t2; + const float t4 = t3*t3; + const float t7 = t1*t1; +
const float t8 = t7*t1; + const float t11 = t7*t7; + const float t13 = 5.0f*t1*t4-10.0f*t3*t8+t11*t1; + const float t14 = t3*t2; + const float t20 = -10.0f*t14*t7+5.0f*t2*t11+t4*t2; + const float t23 = t11+t4-6.0f*t3*t7; + const float t26 = -t14*t1+t2*t8; + const float t29 = t8-3.0f*t1*t3; + const float t32 = 3.0f*t2*t7-t14; + const float t33 = -t3+t7; + const float t35 = 2.0f*t1*t2; + R[0] = t13; + R[10] = t20; + R[12] = t23; + R[20] = 4.0f*t26; + R[24] = t29; + R[30] = t32; + R[36] = t33; + R[40] = t35; + R[48] = t1; + R[50] = t2; + R[60] = 1.0f; + R[70] = -t2; + R[72] = t1; + R[80] = -t35; + R[84] = t33; + R[90] = -t32; + R[96] = t29; + R[100] = -4.0f*t26; + R[108] = t23; + R[110] = -t20; + R[120] = t13; + SimpMatMul(11,R,input+25,result+25); + } + } + } + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Adds two SH vectors, result[i] = inputA[i] + inputB[i]; +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205438.aspx +//------------------------------------------------------------------------------------- +float* XMSHAdd( _Out_writes_(order*order) float *result, + _In_ size_t order, + _In_reads_(order*order) const float *inputA, + _In_reads_(order*order) const float *inputB ) +{ + if ( !result || !inputA || !inputB ) + return nullptr; + + const size_t numcoeff = order*order; + + for( size_t i=0; i < numcoeff; ++i ) + { + result[i] = inputA[i] + inputB[i]; + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Scales a SH vector, result[i] = input[i] * scale; +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204994.aspx +//------------------------------------------------------------------------------------- +float* XMSHScale( _Out_writes_(order*order) float *result, + _In_ size_t order, + _In_reads_(order*order) const float *input, + _In_ float scale ) +{ + if ( !result || !input ) + return nullptr; + + const size_t numcoeff = order*order; + + for( size_t i=0; i < numcoeff; ++i ) + { + result[i] = scale * input[i]; + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Computes the dot product of two SH vectors +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205446.aspx +//------------------------------------------------------------------------------------- +float XMSHDot( _In_ size_t order, _In_reads_(order*order) const float *inputA, _In_reads_(order*order) const float *inputB ) +{ + if ( !inputA || !inputB ) + return 0.f; + + float result = inputA[0] * inputB[0]; + + const size_t numcoeff = order*order; + + for( size_t i=1; i < numcoeff; ++i ) + { + result += inputA[i] * inputB[i]; + } + + return result; +} + + +//------------------------------------------------------------------------------------- +// Computes the product of two functions represented using SH (f and g), where: +// result[i] = int(y_i(s) * f(s) * g(s)), where y_i(s) is the ith SH basis +// function, f(s) and g(s) are SH functions (sum_i(y_i(s)*c_i)). The order O +// determines the lengths of the arrays, where there should always be O^2 +// coefficients. In general the product of two SH functions of order O generates +// an SH function of order 2*O - 1, but we truncate the result. This means +// that the product commutes (f*g == g*f) but doesn't associate +// (f*(g*h) != (f*g)*h).
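// A concrete consequence of the truncation: the product of two order-3
// (9 coefficient) functions is in general an order-5 (25 coefficient)
// function, and XMSHMultiply3 keeps only the first 9 projections. The
// generated tables in the bodies below enumerate, for each pair [i,j], the
// nonzero triple-product integrals int(y_i * y_j * y_k) -- the Gaunt
// coefficients. A minimal usage sketch (hypothetical caller code):
//
//   float f[9], g[9], h[9];
//   // ... fill f and g with order-3 SH coefficients ...
//   XMSHMultiply3( h, f, g );   // h is f*g truncated back to order 3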
+//------------------------------------------------------------------------------------- +float* XMSHMultiply( _Out_writes_(order*order) float *result, + _In_ size_t order, + _In_reads_(order*order) const float *inputF, + _In_reads_(order*order) const float *inputG ) +{ + switch( order ) + { + case 2: + return XMSHMultiply2( result, inputF, inputG ); + + case 3: + return XMSHMultiply3( result, inputF, inputG ); + + case 4: + return XMSHMultiply4( result, inputF, inputG ); + + case 5: + return XMSHMultiply5( result, inputF, inputG ); + + case 6: + return XMSHMultiply6( result, inputF, inputG ); + + default: + assert( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ); + return nullptr; + } +} + + +//------------------------------------------------------------------------------------- +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205454.aspx +//------------------------------------------------------------------------------------- +float* XMSHMultiply2( _Out_writes_(4) float *y, + _In_reads_(4) const float *f, + _In_reads_(4) const float *g ) +{ + if ( !y || !f || !g ) + return nullptr; + + REAL tf,tg,t; + // [0,0]: 0, + y[0] = CONSTANT(0.282094792935999980)*f[0]*g[0]; + + // [1,1]: 0, + tf = CONSTANT(0.282094791773000010)*f[0]; + tg = CONSTANT(0.282094791773000010)*g[0]; + y[1] = tf*g[1]+tg*f[1]; + t = f[1]*g[1]; + y[0] += CONSTANT(0.282094791773000010)*t; + + // [2,2]: 0, + tf = CONSTANT(0.282094795249000000)*f[0]; + tg = CONSTANT(0.282094795249000000)*g[0]; + y[2] = tf*g[2]+tg*f[2]; + t = f[2]*g[2]; + y[0] += CONSTANT(0.282094795249000000)*t; + + // [3,3]: 0, + tf = CONSTANT(0.282094791773000010)*f[0]; + tg = CONSTANT(0.282094791773000010)*g[0]; + y[3] = tf*g[3]+tg*f[3]; + t = f[3]*g[3]; + y[0] += CONSTANT(0.282094791773000010)*t; + + // multiply count=20 + + return y; +} + + +//------------------------------------------------------------------------------------- +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232906.aspx +//------------------------------------------------------------------------------------- +float* XMSHMultiply3( _Out_writes_(9) float *y, + _In_reads_(9) const float *f, + _In_reads_(9) const float *g ) +{ + if ( !y || !f || !g ) + return nullptr; + + REAL tf,tg,t; + // [0,0]: 0, + y[0] = CONSTANT(0.282094792935999980)*f[0]*g[0]; + + // [1,1]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(-0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(-0.218509686119999990)*g[8]; + y[1] = tf*g[1]+tg*f[1]; + t = f[1]*g[1]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] = CONSTANT(-0.126156626101000010)*t; + y[8] = CONSTANT(-0.218509686119999990)*t; + + // [1,2]: 5, + tf = CONSTANT(0.218509686118000010)*f[5]; + tg = CONSTANT(0.218509686118000010)*g[5]; + y[1] += tf*g[2]+tg*f[2]; + y[2] = tf*g[1]+tg*f[1]; + t = f[1]*g[2]+f[2]*g[1]; + y[5] = CONSTANT(0.218509686118000010)*t; + + // [1,3]: 4, + tf = CONSTANT(0.218509686114999990)*f[4]; + tg = CONSTANT(0.218509686114999990)*g[4]; + y[1] += tf*g[3]+tg*f[3]; + y[3] = tf*g[1]+tg*f[1]; + t = f[1]*g[3]+f[3]*g[1]; + y[4] = CONSTANT(0.218509686114999990)*t; + + // [2,2]: 0,6, + tf = CONSTANT(0.282094795249000000)*f[0]+CONSTANT(0.252313259986999990)*f[6]; + tg = CONSTANT(0.282094795249000000)*g[0]+CONSTANT(0.252313259986999990)*g[6]; + y[2] += tf*g[2]+tg*f[2]; + t = f[2]*g[2]; + y[0] += CONSTANT(0.282094795249000000)*t; + y[6] += CONSTANT(0.252313259986999990)*t; + + // [2,3]: 7, + tf = 
CONSTANT(0.218509686118000010)*f[7]; + tg = CONSTANT(0.218509686118000010)*g[7]; + y[2] += tf*g[3]+tg*f[3]; + y[3] += tf*g[2]+tg*f[2]; + t = f[2]*g[3]+f[3]*g[2]; + y[7] = CONSTANT(0.218509686118000010)*t; + + // [3,3]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(0.218509686119999990)*g[8]; + y[3] += tf*g[3]+tg*f[3]; + t = f[3]*g[3]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] += CONSTANT(-0.126156626101000010)*t; + y[8] += CONSTANT(0.218509686119999990)*t; + + // [4,4]: 0,6, + tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]; + tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]; + y[4] += tf*g[4]+tg*f[4]; + t = f[4]*g[4]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + + // [4,5]: 7, + tf = CONSTANT(0.156078347226000000)*f[7]; + tg = CONSTANT(0.156078347226000000)*g[7]; + y[4] += tf*g[5]+tg*f[5]; + y[5] += tf*g[4]+tg*f[4]; + t = f[4]*g[5]+f[5]*g[4]; + y[7] += CONSTANT(0.156078347226000000)*t; + + // [5,5]: 0,6,8, + tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.090111875786499998)*f[6]+CONSTANT(-0.156078347227999990)*f[8]; + tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.090111875786499998)*g[6]+CONSTANT(-0.156078347227999990)*g[8]; + y[5] += tf*g[5]+tg*f[5]; + t = f[5]*g[5]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[6] += CONSTANT(0.090111875786499998)*t; + y[8] += CONSTANT(-0.156078347227999990)*t; + + // [6,6]: 0,6, + tf = CONSTANT(0.282094797560000000)*f[0]; + tg = CONSTANT(0.282094797560000000)*g[0]; + y[6] += tf*g[6]+tg*f[6]; + t = f[6]*g[6]; + y[0] += CONSTANT(0.282094797560000000)*t; + y[6] += CONSTANT(0.180223764527000010)*t; + + // [7,7]: 0,6,8, + tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.090111875786499998)*f[6]+CONSTANT(0.156078347227999990)*f[8]; + tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.090111875786499998)*g[6]+CONSTANT(0.156078347227999990)*g[8]; + y[7] += tf*g[7]+tg*f[7]; + t = f[7]*g[7]; + y[0] += CONSTANT(0.282094791773999990)*t; + y[6] += CONSTANT(0.090111875786499998)*t; + y[8] += CONSTANT(0.156078347227999990)*t; + + // [8,8]: 0,6, + tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]; + tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]; + y[8] += tf*g[8]+tg*f[8]; + t = f[8]*g[8]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + + // multiply count=120 + + return y; +} + + +//------------------------------------------------------------------------------------- +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232907.aspx +//------------------------------------------------------------------------------------- +float* XMSHMultiply4( _Out_writes_(16) float *y, + _In_reads_(16) const float *f, + _In_reads_(16) const float *g ) +{ + if ( !y || !f || !g ) + return nullptr; + + REAL tf,tg,t; + // [0,0]: 0, + y[0] = CONSTANT(0.282094792935999980)*f[0]*g[0]; + + // [1,1]: 0,6,8, + tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(-0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(-0.218509686119999990)*g[8]; + y[1] = tf*g[1]+tg*f[1]; + t = f[1]*g[1]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] = CONSTANT(-0.126156626101000010)*t; + y[8] = 
+
+
+//-------------------------------------------------------------------------------------
+// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232907.aspx
+//-------------------------------------------------------------------------------------
+float* XMSHMultiply4( _Out_writes_(16) float *y,
+                      _In_reads_(16) const float *f,
+                      _In_reads_(16) const float *g )
+{
+    if ( !y || !f || !g )
+        return nullptr;
+
+    REAL tf,tg,t;
+    // [0,0]: 0,
+    y[0] = CONSTANT(0.282094792935999980)*f[0]*g[0];
+
+    // [1,1]: 0,6,8,
+    tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(-0.218509686119999990)*f[8];
+    tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(-0.218509686119999990)*g[8];
+    y[1] = tf*g[1]+tg*f[1];
+    t = f[1]*g[1];
+    y[0] += CONSTANT(0.282094791773000010)*t;
+    y[6] = CONSTANT(-0.126156626101000010)*t;
+    y[8] = CONSTANT(-0.218509686119999990)*t;
+
+    // [1,4]: 3,13,15,
+    tf = CONSTANT(0.218509686114999990)*f[3]+CONSTANT(-0.058399170082300000)*f[13]+CONSTANT(-0.226179013157999990)*f[15];
+    tg = CONSTANT(0.218509686114999990)*g[3]+CONSTANT(-0.058399170082300000)*g[13]+CONSTANT(-0.226179013157999990)*g[15];
+    y[1] += tf*g[4]+tg*f[4];
+    y[4] = tf*g[1]+tg*f[1];
+    t = f[1]*g[4]+f[4]*g[1];
+    y[3] = CONSTANT(0.218509686114999990)*t;
+    y[13] = CONSTANT(-0.058399170082300000)*t;
+    y[15] = CONSTANT(-0.226179013157999990)*t;
+
+    // [1,5]: 2,12,14,
+    tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12]+CONSTANT(-0.184674390923000000)*f[14];
+    tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12]+CONSTANT(-0.184674390923000000)*g[14];
+    y[1] += tf*g[5]+tg*f[5];
+    y[5] = tf*g[1]+tg*f[1];
+    t = f[1]*g[5]+f[5]*g[1];
+    y[2] = CONSTANT(0.218509686118000010)*t;
+    y[12] = CONSTANT(-0.143048168103000000)*t;
+    y[14] = CONSTANT(-0.184674390923000000)*t;
+
+    // [1,6]: 11,
+    tf = CONSTANT(0.202300659402999990)*f[11];
+    tg = CONSTANT(0.202300659402999990)*g[11];
+    y[1] += tf*g[6]+tg*f[6];
+    y[6] += tf*g[1]+tg*f[1];
+    t = f[1]*g[6]+f[6]*g[1];
+    y[11] = CONSTANT(0.202300659402999990)*t;
+
+    // [1,8]: 9,11,
+    tf = CONSTANT(0.226179013155000000)*f[9]+CONSTANT(0.058399170081799998)*f[11];
+    tg = CONSTANT(0.226179013155000000)*g[9]+CONSTANT(0.058399170081799998)*g[11];
+    y[1] += tf*g[8]+tg*f[8];
+    y[8] += tf*g[1]+tg*f[1];
+    t = f[1]*g[8]+f[8]*g[1];
+    y[9] = CONSTANT(0.226179013155000000)*t;
+    y[11] += CONSTANT(0.058399170081799998)*t;
+
+    // [2,2]: 0,6,
+    tf = CONSTANT(0.282094795249000000)*f[0]+CONSTANT(0.252313259986999990)*f[6];
+    tg = CONSTANT(0.282094795249000000)*g[0]+CONSTANT(0.252313259986999990)*g[6];
+    y[2] += tf*g[2]+tg*f[2];
+    t = f[2]*g[2];
+    y[0] += CONSTANT(0.282094795249000000)*t;
+    y[6] += CONSTANT(0.252313259986999990)*t;
+
+    // [2,6]: 12,
+    tf = CONSTANT(0.247766706973999990)*f[12];
+    tg = CONSTANT(0.247766706973999990)*g[12];
+    y[2] += tf*g[6]+tg*f[6];
+    y[6] += tf*g[2]+tg*f[2];
+    t = f[2]*g[6]+f[6]*g[2];
+    y[12] += CONSTANT(0.247766706973999990)*t;
+
+    // [3,3]: 0,6,8,
+    tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(0.218509686119999990)*f[8];
+    tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(0.218509686119999990)*g[8];
+    y[3] += tf*g[3]+tg*f[3];
+    t = f[3]*g[3];
+    y[0] += CONSTANT(0.282094791773000010)*t;
+    y[6] += CONSTANT(-0.126156626101000010)*t;
+    y[8] += CONSTANT(0.218509686119999990)*t;
+
+    // [3,6]: 13,
+    tf = CONSTANT(0.202300659402999990)*f[13];
+    tg = CONSTANT(0.202300659402999990)*g[13];
+    y[3] += tf*g[6]+tg*f[6];
+    y[6] += tf*g[3]+tg*f[3];
+    t = f[3]*g[6]+f[6]*g[3];
+    y[13] += CONSTANT(0.202300659402999990)*t;
+
+    // [3,7]: 2,12,14,
+    tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12]+CONSTANT(0.184674390923000000)*f[14];
+    tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12]+CONSTANT(0.184674390923000000)*g[14];
+    y[3] += tf*g[7]+tg*f[7];
+    y[7] = tf*g[3]+tg*f[3];
+    t = f[3]*g[7]+f[7]*g[3];
+    y[2] += CONSTANT(0.218509686118000010)*t;
+    y[12] += CONSTANT(-0.143048168103000000)*t;
+    y[14] += CONSTANT(0.184674390923000000)*t;
+
+    // [3,8]: 13,15,
+    tf = CONSTANT(-0.058399170081799998)*f[13]+CONSTANT(0.226179013155000000)*f[15];
+    tg = CONSTANT(-0.058399170081799998)*g[13]+CONSTANT(0.226179013155000000)*g[15];
+    y[3] += tf*g[8]+tg*f[8];
+    y[8] += tf*g[3]+tg*f[3];
+    t = f[3]*g[8]+f[8]*g[3];
+    y[13] += CONSTANT(-0.058399170081799998)*t;
+    y[15] += CONSTANT(0.226179013155000000)*t;
+
+    // [4,4]: 0,6,
+    tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6];
+    tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6];
+    y[4] += tf*g[4]+tg*f[4];
+    t = f[4]*g[4];
+    y[0] += CONSTANT(0.282094791770000020)*t;
+    y[6] += CONSTANT(-0.180223751576000010)*t;
+
+    // [4,5]: 7,
+    tf = CONSTANT(0.156078347226000000)*f[7];
+    tg = CONSTANT(0.156078347226000000)*g[7];
+    y[4] += tf*g[5]+tg*f[5];
+    y[5] += tf*g[4]+tg*f[4];
+    t = f[4]*g[5]+f[5]*g[4];
+    y[7] += CONSTANT(0.156078347226000000)*t;
+
+    // [4,9]: 3,13,
+    tf = CONSTANT(0.226179013157999990)*f[3]+CONSTANT(-0.094031597258400004)*f[13];
+    tg = CONSTANT(0.226179013157999990)*g[3]+CONSTANT(-0.094031597258400004)*g[13];
+    y[4] += tf*g[9]+tg*f[9];
+    y[9] += tf*g[4]+tg*f[4];
+    t = f[4]*g[9]+f[9]*g[4];
+    y[3] += CONSTANT(0.226179013157999990)*t;
+    y[13] += CONSTANT(-0.094031597258400004)*t;
+
+    // [4,10]: 2,12,
+    tf = CONSTANT(0.184674390919999990)*f[2]+CONSTANT(-0.188063194517999990)*f[12];
+    tg = CONSTANT(0.184674390919999990)*g[2]+CONSTANT(-0.188063194517999990)*g[12];
+    y[4] += tf*g[10]+tg*f[10];
+    y[10] = tf*g[4]+tg*f[4];
+    t = f[4]*g[10]+f[10]*g[4];
+    y[2] += CONSTANT(0.184674390919999990)*t;
+    y[12] += CONSTANT(-0.188063194517999990)*t;
+
+    // [4,11]: 3,13,15,
+    tf = CONSTANT(-0.058399170082300000)*f[3]+CONSTANT(0.145673124078000010)*f[13]+CONSTANT(0.094031597258400004)*f[15];
+    tg = CONSTANT(-0.058399170082300000)*g[3]+CONSTANT(0.145673124078000010)*g[13]+CONSTANT(0.094031597258400004)*g[15];
+    y[4] += tf*g[11]+tg*f[11];
+    y[11] += tf*g[4]+tg*f[4];
+    t = f[4]*g[11]+f[11]*g[4];
+    y[3] += CONSTANT(-0.058399170082300000)*t;
+    y[13] += CONSTANT(0.145673124078000010)*t;
+    y[15] += CONSTANT(0.094031597258400004)*t;
+
+    // [5,5]: 0,6,8,
+    tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.090111875786499998)*f[6]+CONSTANT(-0.156078347227999990)*f[8];
+    tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.090111875786499998)*g[6]+CONSTANT(-0.156078347227999990)*g[8];
+    y[5] += tf*g[5]+tg*f[5];
+    t = f[5]*g[5];
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[6] += CONSTANT(0.090111875786499998)*t;
+    y[8] += CONSTANT(-0.156078347227999990)*t;
+
+    // [5,9]: 14,
+    tf = CONSTANT(0.148677009677999990)*f[14];
+    tg = CONSTANT(0.148677009677999990)*g[14];
+    y[5] += tf*g[9]+tg*f[9];
+    y[9] += tf*g[5]+tg*f[5];
+    t = f[5]*g[9]+f[9]*g[5];
+    y[14] += CONSTANT(0.148677009677999990)*t;
+
+    // [5,10]: 3,13,15,
+    tf = CONSTANT(0.184674390919999990)*f[3]+CONSTANT(0.115164716490000000)*f[13]+CONSTANT(-0.148677009678999990)*f[15];
+    tg = CONSTANT(0.184674390919999990)*g[3]+CONSTANT(0.115164716490000000)*g[13]+CONSTANT(-0.148677009678999990)*g[15];
+    y[5] += tf*g[10]+tg*f[10];
+    y[10] += tf*g[5]+tg*f[5];
+    t = f[5]*g[10]+f[10]*g[5];
+    y[3] += CONSTANT(0.184674390919999990)*t;
+    y[13] += CONSTANT(0.115164716490000000)*t;
+    y[15] += CONSTANT(-0.148677009678999990)*t;
+
+    // [5,11]: 2,12,14,
+    tf = CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.059470803871800003)*f[12]+CONSTANT(-0.115164716491000000)*f[14];
+    tg = CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.059470803871800003)*g[12]+CONSTANT(-0.115164716491000000)*g[14];
+    y[5] += tf*g[11]+tg*f[11];
+    y[11] += tf*g[5]+tg*f[5];
+    t = f[5]*g[11]+f[11]*g[5];
+    y[2] += CONSTANT(0.233596680327000010)*t;
+    y[12] += CONSTANT(0.059470803871800003)*t;
+    y[14] += CONSTANT(-0.115164716491000000)*t;
+
+    // [6,6]: 0,6,
+    tf = CONSTANT(0.282094797560000000)*f[0];
+    tg = CONSTANT(0.282094797560000000)*g[0];
+    y[6] += tf*g[6]+tg*f[6];
+    t = f[6]*g[6];
+    y[0] += CONSTANT(0.282094797560000000)*t;
+    y[6] += CONSTANT(0.180223764527000010)*t;
+
+    // [7,7]: 6,0,8,
+    tf = CONSTANT(0.090111875786499998)*f[6]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.156078347227999990)*f[8];
+    tg = CONSTANT(0.090111875786499998)*g[6]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.156078347227999990)*g[8];
+    y[7] += tf*g[7]+tg*f[7];
+    t = f[7]*g[7];
+    y[6] += CONSTANT(0.090111875786499998)*t;
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[8] += CONSTANT(0.156078347227999990)*t;
+
+    // [7,10]: 9,1,11,
+    tf = CONSTANT(0.148677009678999990)*f[9]+CONSTANT(0.184674390919999990)*f[1]+CONSTANT(0.115164716490000000)*f[11];
+    tg = CONSTANT(0.148677009678999990)*g[9]+CONSTANT(0.184674390919999990)*g[1]+CONSTANT(0.115164716490000000)*g[11];
+    y[7] += tf*g[10]+tg*f[10];
+    y[10] += tf*g[7]+tg*f[7];
+    t = f[7]*g[10]+f[10]*g[7];
+    y[9] += CONSTANT(0.148677009678999990)*t;
+    y[1] += CONSTANT(0.184674390919999990)*t;
+    y[11] += CONSTANT(0.115164716490000000)*t;
+
+    // [7,13]: 12,2,14,
+    tf = CONSTANT(0.059470803871800003)*f[12]+CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.115164716491000000)*f[14];
+    tg = CONSTANT(0.059470803871800003)*g[12]+CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.115164716491000000)*g[14];
+    y[7] += tf*g[13]+tg*f[13];
+    y[13] += tf*g[7]+tg*f[7];
+    t = f[7]*g[13]+f[13]*g[7];
+    y[12] += CONSTANT(0.059470803871800003)*t;
+    y[2] += CONSTANT(0.233596680327000010)*t;
+    y[14] += CONSTANT(0.115164716491000000)*t;
+
+    // [7,14]: 15,
+    tf = CONSTANT(0.148677009677999990)*f[15];
+    tg = CONSTANT(0.148677009677999990)*g[15];
+    y[7] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[7]+tg*f[7];
+    t = f[7]*g[14]+f[14]*g[7];
+    y[15] += CONSTANT(0.148677009677999990)*t;
+
+    // [8,8]: 0,6,
+    tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6];
+    tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6];
+    y[8] += tf*g[8]+tg*f[8];
+    t = f[8]*g[8];
+    y[0] += CONSTANT(0.282094791770000020)*t;
+    y[6] += CONSTANT(-0.180223751576000010)*t;
+
+    // [8,9]: 11,
+    tf = CONSTANT(-0.094031597259499999)*f[11];
+    tg = CONSTANT(-0.094031597259499999)*g[11];
+    y[8] += tf*g[9]+tg*f[9];
+    y[9] += tf*g[8]+tg*f[8];
+    t = f[8]*g[9]+f[9]*g[8];
+    y[11] += CONSTANT(-0.094031597259499999)*t;
+
+    // [8,13]: 15,
+    tf = CONSTANT(-0.094031597259499999)*f[15];
+    tg = CONSTANT(-0.094031597259499999)*g[15];
+    y[8] += tf*g[13]+tg*f[13];
+    y[13] += tf*g[8]+tg*f[8];
+    t = f[8]*g[13]+f[13]*g[8];
+    y[15] += CONSTANT(-0.094031597259499999)*t;
+
+    // [8,14]: 2,12,
+    tf = CONSTANT(0.184674390919999990)*f[2]+CONSTANT(-0.188063194517999990)*f[12];
+    tg = CONSTANT(0.184674390919999990)*g[2]+CONSTANT(-0.188063194517999990)*g[12];
+    y[8] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[8]+tg*f[8];
+    t = f[8]*g[14]+f[14]*g[8];
+    y[2] += CONSTANT(0.184674390919999990)*t;
+    y[12] += CONSTANT(-0.188063194517999990)*t;
+
+    // [9,9]: 6,0,
+    tf = CONSTANT(-0.210261043508000010)*f[6]+CONSTANT(0.282094791766999970)*f[0];
+    tg = CONSTANT(-0.210261043508000010)*g[6]+CONSTANT(0.282094791766999970)*g[0];
+    y[9] += tf*g[9]+tg*f[9];
+    t = f[9]*g[9];
+    y[6] += CONSTANT(-0.210261043508000010)*t;
+    y[0] += CONSTANT(0.282094791766999970)*t;
+
+    // [10,10]: 0,
+    tf = CONSTANT(0.282094791771999980)*f[0];
+    tg = CONSTANT(0.282094791771999980)*g[0];
+    y[10] += tf*g[10]+tg*f[10];
+    t = f[10]*g[10];
+    y[0] += CONSTANT(0.282094791771999980)*t;
+
+    // [11,11]: 0,6,8,
+    tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.126156626101000010)*f[6]+CONSTANT(-0.145673124078999990)*f[8];
+    tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.126156626101000010)*g[6]+CONSTANT(-0.145673124078999990)*g[8];
+    y[11] += tf*g[11]+tg*f[11];
+    t = f[11]*g[11];
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[6] += CONSTANT(0.126156626101000010)*t;
+    y[8] += CONSTANT(-0.145673124078999990)*t;
+
+    // [12,12]: 0,6,
+    tf = CONSTANT(0.282094799871999980)*f[0]+CONSTANT(0.168208852954000010)*f[6];
+    tg = CONSTANT(0.282094799871999980)*g[0]+CONSTANT(0.168208852954000010)*g[6];
+    y[12] += tf*g[12]+tg*f[12];
+    t = f[12]*g[12];
+    y[0] += CONSTANT(0.282094799871999980)*t;
+    y[6] += CONSTANT(0.168208852954000010)*t;
+
+    // [13,13]: 0,8,6,
+    tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.145673124078999990)*f[8]+CONSTANT(0.126156626101000010)*f[6];
+    tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.145673124078999990)*g[8]+CONSTANT(0.126156626101000010)*g[6];
+    y[13] += tf*g[13]+tg*f[13];
+    t = f[13]*g[13];
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[8] += CONSTANT(0.145673124078999990)*t;
+    y[6] += CONSTANT(0.126156626101000010)*t;
+
+    // [14,14]: 0,
+    tf = CONSTANT(0.282094791771999980)*f[0];
+    tg = CONSTANT(0.282094791771999980)*g[0];
+    y[14] += tf*g[14]+tg*f[14];
+    t = f[14]*g[14];
+    y[0] += CONSTANT(0.282094791771999980)*t;
+
+    // [15,15]: 0,6,
+    tf = CONSTANT(0.282094791766999970)*f[0]+CONSTANT(-0.210261043508000010)*f[6];
+    tg = CONSTANT(0.282094791766999970)*g[0]+CONSTANT(-0.210261043508000010)*g[6];
+    y[15] += tf*g[15]+tg*f[15];
+    t = f[15]*g[15];
+    y[0] += CONSTANT(0.282094791766999970)*t;
+    y[6] += CONSTANT(-0.210261043508000010)*t;
+
+    // multiply count=399
+
+    return y;
+}
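+
+// Minimal usage sketch for XMSHMultiply4 (illustrative only -- how f and g get
+// filled is up to the caller and is not part of this file):
+//
+//     float f[16], g[16], y[16];
+//     // ... project two spherical functions into f and g elsewhere ...
+//     if ( !XMSHMultiply4( y, f, g ) )
+//     {
+//         // a null argument was passed; y was not written
+//     }
+//
+// On success the return value is simply y, so calls can be nested or chained.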
+
+
+//-------------------------------------------------------------------------------------
+// http://msdn.microsoft.com/en-us/library/windows/desktop/bb232908.aspx
+//-------------------------------------------------------------------------------------
+float* XMSHMultiply5( _Out_writes_(25) float *y,
+                      _In_reads_(25) const float *f,
+                      _In_reads_(25) const float *g )
+{
+    if ( !y || !f || !g )
+        return nullptr;
+
+    REAL tf,tg,t;
+    // [0,0]: 0,
+    y[0] = CONSTANT(0.282094792935999980)*f[0]*g[0];
+
+    // [1,1]: 0,6,8,
+    tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(-0.218509686119999990)*f[8];
+    tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(-0.218509686119999990)*g[8];
+    y[1] = tf*g[1]+tg*f[1];
+    t = f[1]*g[1];
+    y[0] += CONSTANT(0.282094791773000010)*t;
+    y[6] = CONSTANT(-0.126156626101000010)*t;
+    y[8] = CONSTANT(-0.218509686119999990)*t;
+
+    // [1,4]: 3,13,15,
+    tf = CONSTANT(0.218509686114999990)*f[3]+CONSTANT(-0.058399170082300000)*f[13]+CONSTANT(-0.226179013157999990)*f[15];
+    tg = CONSTANT(0.218509686114999990)*g[3]+CONSTANT(-0.058399170082300000)*g[13]+CONSTANT(-0.226179013157999990)*g[15];
+    y[1] += tf*g[4]+tg*f[4];
+    y[4] = tf*g[1]+tg*f[1];
+    t = f[1]*g[4]+f[4]*g[1];
+    y[3] = CONSTANT(0.218509686114999990)*t;
+    y[13] = CONSTANT(-0.058399170082300000)*t;
+    y[15] = CONSTANT(-0.226179013157999990)*t;
+
+    // [1,5]: 2,12,14,
+    tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12]+CONSTANT(-0.184674390923000000)*f[14];
+    tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12]+CONSTANT(-0.184674390923000000)*g[14];
+    y[1] += tf*g[5]+tg*f[5];
+    y[5] = tf*g[1]+tg*f[1];
+    t = f[1]*g[5]+f[5]*g[1];
+    y[2] = CONSTANT(0.218509686118000010)*t;
+    y[12] = CONSTANT(-0.143048168103000000)*t;
+    y[14] = CONSTANT(-0.184674390923000000)*t;
+
+    // [1,9]: 8,22,24,
+    tf = CONSTANT(0.226179013155000000)*f[8]+CONSTANT(-0.043528171378199997)*f[22]+CONSTANT(-0.230329432978999990)*f[24];
+    tg = CONSTANT(0.226179013155000000)*g[8]+CONSTANT(-0.043528171378199997)*g[22]+CONSTANT(-0.230329432978999990)*g[24];
+    y[1] += tf*g[9]+tg*f[9];
+    y[9] = tf*g[1]+tg*f[1];
+    t = f[1]*g[9]+f[9]*g[1];
+    y[8] += CONSTANT(0.226179013155000000)*t;
+    y[22] = CONSTANT(-0.043528171378199997)*t;
+    y[24] = CONSTANT(-0.230329432978999990)*t;
+
+    // [1,10]: 7,21,23,
+    tf = CONSTANT(0.184674390919999990)*f[7]+CONSTANT(-0.075393004386799994)*f[21]+CONSTANT(-0.199471140200000010)*f[23];
+    tg = CONSTANT(0.184674390919999990)*g[7]+CONSTANT(-0.075393004386799994)*g[21]+CONSTANT(-0.199471140200000010)*g[23];
+    y[1] += tf*g[10]+tg*f[10];
+    y[10] = tf*g[1]+tg*f[1];
+    t = f[1]*g[10]+f[10]*g[1];
+    y[7] = CONSTANT(0.184674390919999990)*t;
+    y[21] = CONSTANT(-0.075393004386799994)*t;
+    y[23] = CONSTANT(-0.199471140200000010)*t;
+
+    // [1,11]: 6,8,20,22,
+    tf = CONSTANT(0.202300659402999990)*f[6]+CONSTANT(0.058399170081799998)*f[8]+CONSTANT(-0.150786008773000000)*f[20]+CONSTANT(-0.168583882836999990)*f[22];
+    tg = CONSTANT(0.202300659402999990)*g[6]+CONSTANT(0.058399170081799998)*g[8]+CONSTANT(-0.150786008773000000)*g[20]+CONSTANT(-0.168583882836999990)*g[22];
+    y[1] += tf*g[11]+tg*f[11];
+    y[11] = tf*g[1]+tg*f[1];
+    t = f[1]*g[11]+f[11]*g[1];
+    y[6] += CONSTANT(0.202300659402999990)*t;
+    y[8] += CONSTANT(0.058399170081799998)*t;
+    y[20] = CONSTANT(-0.150786008773000000)*t;
+    y[22] += CONSTANT(-0.168583882836999990)*t;
+
+    // [1,12]: 19,
+    tf = CONSTANT(0.194663900273000010)*f[19];
+    tg = CONSTANT(0.194663900273000010)*g[19];
+    y[1] += tf*g[12]+tg*f[12];
+    y[12] += tf*g[1]+tg*f[1];
+    t = f[1]*g[12]+f[12]*g[1];
+    y[19] = CONSTANT(0.194663900273000010)*t;
+
+    // [1,13]: 18,
+    tf = CONSTANT(0.168583882834000000)*f[18];
+    tg = CONSTANT(0.168583882834000000)*g[18];
+    y[1] += tf*g[13]+tg*f[13];
+    y[13] += tf*g[1]+tg*f[1];
+    t = f[1]*g[13]+f[13]*g[1];
+    y[18] = CONSTANT(0.168583882834000000)*t;
+
+    // [1,14]: 17,19,
+    tf = CONSTANT(0.199471140196999990)*f[17]+CONSTANT(0.075393004386399995)*f[19];
+    tg = CONSTANT(0.199471140196999990)*g[17]+CONSTANT(0.075393004386399995)*g[19];
+    y[1] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[1]+tg*f[1];
+    t = f[1]*g[14]+f[14]*g[1];
+    y[17] = CONSTANT(0.199471140196999990)*t;
+    y[19] += CONSTANT(0.075393004386399995)*t;
+
+    // [1,15]: 16,18,
+    tf = CONSTANT(0.230329432973999990)*f[16]+CONSTANT(0.043528171377799997)*f[18];
+    tg = CONSTANT(0.230329432973999990)*g[16]+CONSTANT(0.043528171377799997)*g[18];
+    y[1] += tf*g[15]+tg*f[15];
+    y[15] += tf*g[1]+tg*f[1];
+    t = f[1]*g[15]+f[15]*g[1];
+    y[16] = CONSTANT(0.230329432973999990)*t;
+    y[18] += CONSTANT(0.043528171377799997)*t;
+
+    // [2,2]: 0,6,
+    tf = CONSTANT(0.282094795249000000)*f[0]+CONSTANT(0.252313259986999990)*f[6];
+    tg = CONSTANT(0.282094795249000000)*g[0]+CONSTANT(0.252313259986999990)*g[6];
+    y[2] += tf*g[2]+tg*f[2];
+    t = f[2]*g[2];
+    y[0] += CONSTANT(0.282094795249000000)*t;
+    y[6] += CONSTANT(0.252313259986999990)*t;
+
+    // [2,10]: 4,18,
+    tf = CONSTANT(0.184674390919999990)*f[4]+CONSTANT(0.213243618621000000)*f[18];
+    tg = CONSTANT(0.184674390919999990)*g[4]+CONSTANT(0.213243618621000000)*g[18];
+    y[2] += tf*g[10]+tg*f[10];
+    y[10] += tf*g[2]+tg*f[2];
+    t = f[2]*g[10]+f[10]*g[2];
+    y[4] += CONSTANT(0.184674390919999990)*t;
+    y[18] += CONSTANT(0.213243618621000000)*t;
+
+    // [2,12]: 6,20,
+    tf = CONSTANT(0.247766706973999990)*f[6]+CONSTANT(0.246232537174000010)*f[20];
+    tg = CONSTANT(0.247766706973999990)*g[6]+CONSTANT(0.246232537174000010)*g[20];
+    y[2] += tf*g[12]+tg*f[12];
+    y[12] += tf*g[2]+tg*f[2];
+    t = f[2]*g[12]+f[12]*g[2];
+    y[6] += CONSTANT(0.247766706973999990)*t;
+    y[20] += CONSTANT(0.246232537174000010)*t;
+
+    // [2,14]: 8,22,
+    tf = CONSTANT(0.184674390919999990)*f[8]+CONSTANT(0.213243618621000000)*f[22];
+    tg = CONSTANT(0.184674390919999990)*g[8]+CONSTANT(0.213243618621000000)*g[22];
+    y[2] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[2]+tg*f[2];
+    t = f[2]*g[14]+f[14]*g[2];
+    y[8] += CONSTANT(0.184674390919999990)*t;
+    y[22] += CONSTANT(0.213243618621000000)*t;
+
+    // [3,3]: 0,6,8,
+    tf = CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(0.218509686119999990)*f[8];
+    tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(0.218509686119999990)*g[8];
+    y[3] += tf*g[3]+tg*f[3];
+    t = f[3]*g[3];
+    y[0] += CONSTANT(0.282094791773000010)*t;
+    y[6] += CONSTANT(-0.126156626101000010)*t;
+    y[8] += CONSTANT(0.218509686119999990)*t;
+
+    // [3,7]: 2,12,14,
+    tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12]+CONSTANT(0.184674390923000000)*f[14];
+    tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12]+CONSTANT(0.184674390923000000)*g[14];
+    y[3] += tf*g[7]+tg*f[7];
+    y[7] += tf*g[3]+tg*f[3];
+    t = f[3]*g[7]+f[7]*g[3];
+    y[2] += CONSTANT(0.218509686118000010)*t;
+    y[12] += CONSTANT(-0.143048168103000000)*t;
+    y[14] += CONSTANT(0.184674390923000000)*t;
+
+    // [3,9]: 4,16,18,
+    tf = CONSTANT(0.226179013157999990)*f[4]+CONSTANT(0.230329432973999990)*f[16]+CONSTANT(-0.043528171377799997)*f[18];
+    tg = CONSTANT(0.226179013157999990)*g[4]+CONSTANT(0.230329432973999990)*g[16]+CONSTANT(-0.043528171377799997)*g[18];
+    y[3] += tf*g[9]+tg*f[9];
+    y[9] += tf*g[3]+tg*f[3];
+    t = f[3]*g[9]+f[9]*g[3];
+    y[4] += CONSTANT(0.226179013157999990)*t;
+    y[16] += CONSTANT(0.230329432973999990)*t;
+    y[18] += CONSTANT(-0.043528171377799997)*t;
+
+    // [3,10]: 5,17,19,
+    tf = CONSTANT(0.184674390919999990)*f[5]+CONSTANT(0.199471140200000010)*f[17]+CONSTANT(-0.075393004386799994)*f[19];
+    tg = CONSTANT(0.184674390919999990)*g[5]+CONSTANT(0.199471140200000010)*g[17]+CONSTANT(-0.075393004386799994)*g[19];
+    y[3] += tf*g[10]+tg*f[10];
+    y[10] += tf*g[3]+tg*f[3];
+    t = f[3]*g[10]+f[10]*g[3];
+    y[5] += CONSTANT(0.184674390919999990)*t;
+    y[17] += CONSTANT(0.199471140200000010)*t;
+    y[19] += CONSTANT(-0.075393004386799994)*t;
+
+    // [3,12]: 21,
+    tf = CONSTANT(0.194663900273000010)*f[21];
+    tg = CONSTANT(0.194663900273000010)*g[21];
+    y[3] += tf*g[12]+tg*f[12];
+    y[12] += tf*g[3]+tg*f[3];
+    t = f[3]*g[12]+f[12]*g[3];
+    y[21] += CONSTANT(0.194663900273000010)*t;
+
+    // [3,13]: 8,6,20,22,
+    tf = CONSTANT(-0.058399170081799998)*f[8]+CONSTANT(0.202300659402999990)*f[6]+CONSTANT(-0.150786008773000000)*f[20]+CONSTANT(0.168583882836999990)*f[22];
+    tg = CONSTANT(-0.058399170081799998)*g[8]+CONSTANT(0.202300659402999990)*g[6]+CONSTANT(-0.150786008773000000)*g[20]+CONSTANT(0.168583882836999990)*g[22];
+    y[3] += tf*g[13]+tg*f[13];
+    y[13] += tf*g[3]+tg*f[3];
+    t = f[3]*g[13]+f[13]*g[3];
+    y[8] += CONSTANT(-0.058399170081799998)*t;
+    y[6] += CONSTANT(0.202300659402999990)*t;
+    y[20] += CONSTANT(-0.150786008773000000)*t;
+    y[22] += CONSTANT(0.168583882836999990)*t;
+
+    // [3,14]: 21,23,
+    tf = CONSTANT(-0.075393004386399995)*f[21]+CONSTANT(0.199471140196999990)*f[23];
+    tg = CONSTANT(-0.075393004386399995)*g[21]+CONSTANT(0.199471140196999990)*g[23];
+    y[3] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[3]+tg*f[3];
+    t = f[3]*g[14]+f[14]*g[3];
+    y[21] += CONSTANT(-0.075393004386399995)*t;
+    y[23] += CONSTANT(0.199471140196999990)*t;
+
+    // [3,15]: 8,22,24,
+    tf = CONSTANT(0.226179013155000000)*f[8]+CONSTANT(-0.043528171378199997)*f[22]+CONSTANT(0.230329432978999990)*f[24];
+    tg = CONSTANT(0.226179013155000000)*g[8]+CONSTANT(-0.043528171378199997)*g[22]+CONSTANT(0.230329432978999990)*g[24];
+    y[3] += tf*g[15]+tg*f[15];
+    y[15] += tf*g[3]+tg*f[3];
+    t = f[3]*g[15]+f[15]*g[3];
+    y[8] += CONSTANT(0.226179013155000000)*t;
+    y[22] += CONSTANT(-0.043528171378199997)*t;
+    y[24] += CONSTANT(0.230329432978999990)*t;
+
+    // [4,4]: 0,6,20,24,
+    tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]+CONSTANT(0.040299255967500003)*f[20]+CONSTANT(-0.238413613505999990)*f[24];
+    tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]+CONSTANT(0.040299255967500003)*g[20]+CONSTANT(-0.238413613505999990)*g[24];
+    y[4] += tf*g[4]+tg*f[4];
+    t = f[4]*g[4];
+    y[0] += CONSTANT(0.282094791770000020)*t;
+    y[6] += CONSTANT(-0.180223751576000010)*t;
+    y[20] += CONSTANT(0.040299255967500003)*t;
+    y[24] += CONSTANT(-0.238413613505999990)*t;
+
+    // [4,5]: 7,21,23,
+    tf = CONSTANT(0.156078347226000000)*f[7]+CONSTANT(-0.063718718434399996)*f[21]+CONSTANT(-0.168583882835000000)*f[23];
+    tg = CONSTANT(0.156078347226000000)*g[7]+CONSTANT(-0.063718718434399996)*g[21]+CONSTANT(-0.168583882835000000)*g[23];
+    y[4] += tf*g[5]+tg*f[5];
+    y[5] += tf*g[4]+tg*f[4];
+    t = f[4]*g[5]+f[5]*g[4];
+    y[7] += CONSTANT(0.156078347226000000)*t;
+    y[21] += CONSTANT(-0.063718718434399996)*t;
+    y[23] += CONSTANT(-0.168583882835000000)*t;
+
+    // [4,11]: 3,13,15,
+    tf = CONSTANT(-0.058399170082300000)*f[3]+CONSTANT(0.145673124078000010)*f[13]+CONSTANT(0.094031597258400004)*f[15];
+    tg = CONSTANT(-0.058399170082300000)*g[3]+CONSTANT(0.145673124078000010)*g[13]+CONSTANT(0.094031597258400004)*g[15];
+    y[4] += tf*g[11]+tg*f[11];
+    y[11] += tf*g[4]+tg*f[4];
+    t = f[4]*g[11]+f[11]*g[4];
+    y[3] += CONSTANT(-0.058399170082300000)*t;
+    y[13] += CONSTANT(0.145673124078000010)*t;
+    y[15] += CONSTANT(0.094031597258400004)*t;
+
+    // [4,16]: 8,22,
+    tf = CONSTANT(0.238413613494000000)*f[8]+CONSTANT(-0.075080816693699995)*f[22];
+    tg = CONSTANT(0.238413613494000000)*g[8]+CONSTANT(-0.075080816693699995)*g[22];
+    y[4] += tf*g[16]+tg*f[16];
+    y[16] += tf*g[4]+tg*f[4];
+    t = f[4]*g[16]+f[16]*g[4];
+    y[8] += CONSTANT(0.238413613494000000)*t;
+    y[22] += CONSTANT(-0.075080816693699995)*t;
+
+    // [4,18]: 6,20,24,
+    tf = CONSTANT(0.156078347226000000)*f[6]+CONSTANT(-0.190364615029000010)*f[20]+CONSTANT(0.075080816691500005)*f[24];
+    tg = CONSTANT(0.156078347226000000)*g[6]+CONSTANT(-0.190364615029000010)*g[20]+CONSTANT(0.075080816691500005)*g[24];
+    y[4] += tf*g[18]+tg*f[18];
+    y[18] += tf*g[4]+tg*f[4];
+    t = f[4]*g[18]+f[18]*g[4];
+    y[6] += CONSTANT(0.156078347226000000)*t;
+    y[20] += CONSTANT(-0.190364615029000010)*t;
+    y[24] += CONSTANT(0.075080816691500005)*t;
+
+    // [4,19]: 7,21,23,
+    tf = CONSTANT(-0.063718718434399996)*f[7]+CONSTANT(0.141889406569999990)*f[21]+CONSTANT(0.112621225039000000)*f[23];
+    tg = CONSTANT(-0.063718718434399996)*g[7]+CONSTANT(0.141889406569999990)*g[21]+CONSTANT(0.112621225039000000)*g[23];
+    y[4] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[4]+tg*f[4];
+    t = f[4]*g[19]+f[19]*g[4];
+    y[7] += CONSTANT(-0.063718718434399996)*t;
+    y[21] += CONSTANT(0.141889406569999990)*t;
+    y[23] += CONSTANT(0.112621225039000000)*t;
+
+    // [5,5]: 0,6,8,20,22,
+    tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.090111875786499998)*f[6]+CONSTANT(-0.156078347227999990)*f[8]+CONSTANT(-0.161197023870999990)*f[20]+CONSTANT(-0.180223751574000000)*f[22];
+    tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.090111875786499998)*g[6]+CONSTANT(-0.156078347227999990)*g[8]+CONSTANT(-0.161197023870999990)*g[20]+CONSTANT(-0.180223751574000000)*g[22];
+    y[5] += tf*g[5]+tg*f[5];
+    t = f[5]*g[5];
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[6] += CONSTANT(0.090111875786499998)*t;
+    y[8] += CONSTANT(-0.156078347227999990)*t;
+    y[20] += CONSTANT(-0.161197023870999990)*t;
+    y[22] += CONSTANT(-0.180223751574000000)*t;
+
+    // [5,11]: 2,12,14,
+    tf = CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.059470803871800003)*f[12]+CONSTANT(-0.115164716491000000)*f[14];
+    tg = CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.059470803871800003)*g[12]+CONSTANT(-0.115164716491000000)*g[14];
+    y[5] += tf*g[11]+tg*f[11];
+    y[11] += tf*g[5]+tg*f[5];
+    t = f[5]*g[11]+f[11]*g[5];
+    y[2] += CONSTANT(0.233596680327000010)*t;
+    y[12] += CONSTANT(0.059470803871800003)*t;
+    y[14] += CONSTANT(-0.115164716491000000)*t;
+
+    // [5,17]: 8,22,24,
+    tf = CONSTANT(0.168583882832999990)*f[8]+CONSTANT(0.132725386548000010)*f[22]+CONSTANT(-0.140463346189000000)*f[24];
+    tg = CONSTANT(0.168583882832999990)*g[8]+CONSTANT(0.132725386548000010)*g[22]+CONSTANT(-0.140463346189000000)*g[24];
+    y[5] += tf*g[17]+tg*f[17];
+    y[17] += tf*g[5]+tg*f[5];
+    t = f[5]*g[17]+f[17]*g[5];
+    y[8] += CONSTANT(0.168583882832999990)*t;
+    y[22] += CONSTANT(0.132725386548000010)*t;
+    y[24] += CONSTANT(-0.140463346189000000)*t;
+
+    // [5,18]: 7,21,23,
+    tf = CONSTANT(0.180223751571000010)*f[7]+CONSTANT(0.090297865407399994)*f[21]+CONSTANT(-0.132725386549000010)*f[23];
+    tg = CONSTANT(0.180223751571000010)*g[7]+CONSTANT(0.090297865407399994)*g[21]+CONSTANT(-0.132725386549000010)*g[23];
+    y[5] += tf*g[18]+tg*f[18];
+    y[18] += tf*g[5]+tg*f[5];
+    t = f[5]*g[18]+f[18]*g[5];
+    y[7] += CONSTANT(0.180223751571000010)*t;
+    y[21] += CONSTANT(0.090297865407399994)*t;
+    y[23] += CONSTANT(-0.132725386549000010)*t;
+
+    // [5,19]: 6,8,20,22,
+    tf = CONSTANT(0.220728115440999990)*f[6]+CONSTANT(0.063718718433900007)*f[8]+CONSTANT(0.044869370061299998)*f[20]+CONSTANT(-0.090297865408399999)*f[22];
+    tg = CONSTANT(0.220728115440999990)*g[6]+CONSTANT(0.063718718433900007)*g[8]+CONSTANT(0.044869370061299998)*g[20]+CONSTANT(-0.090297865408399999)*g[22];
+    y[5] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[5]+tg*f[5];
+    t = f[5]*g[19]+f[19]*g[5];
+    y[6] += CONSTANT(0.220728115440999990)*t;
+    y[8] += CONSTANT(0.063718718433900007)*t;
+    y[20] += CONSTANT(0.044869370061299998)*t;
+    y[22] += CONSTANT(-0.090297865408399999)*t;
+
+    // [6,6]: 0,6,20,
+    tf = CONSTANT(0.282094797560000000)*f[0]+CONSTANT(0.241795553185999990)*f[20];
+    tg = CONSTANT(0.282094797560000000)*g[0]+CONSTANT(0.241795553185999990)*g[20];
+    y[6] += tf*g[6]+tg*f[6];
+    t = f[6]*g[6];
+    y[0] += CONSTANT(0.282094797560000000)*t;
+    y[6] += CONSTANT(0.180223764527000010)*t;
+    y[20] += CONSTANT(0.241795553185999990)*t;
+
+    // [7,7]: 6,0,8,20,22,
+    tf = CONSTANT(0.090111875786499998)*f[6]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.156078347227999990)*f[8]+CONSTANT(-0.161197023870999990)*f[20]+CONSTANT(0.180223751574000000)*f[22];
+    tg = CONSTANT(0.090111875786499998)*g[6]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.156078347227999990)*g[8]+CONSTANT(-0.161197023870999990)*g[20]+CONSTANT(0.180223751574000000)*g[22];
+    y[7] += tf*g[7]+tg*f[7];
+    t = f[7]*g[7];
+    y[6] += CONSTANT(0.090111875786499998)*t;
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[8] += CONSTANT(0.156078347227999990)*t;
+    y[20] += CONSTANT(-0.161197023870999990)*t;
+    y[22] += CONSTANT(0.180223751574000000)*t;
+
+    // [7,13]: 12,2,14,
+    tf = CONSTANT(0.059470803871800003)*f[12]+CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.115164716491000000)*f[14];
+    tg = CONSTANT(0.059470803871800003)*g[12]+CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.115164716491000000)*g[14];
+    y[7] += tf*g[13]+tg*f[13];
+    y[13] += tf*g[7]+tg*f[7];
+    t = f[7]*g[13]+f[13]*g[7];
+    y[12] += CONSTANT(0.059470803871800003)*t;
+    y[2] += CONSTANT(0.233596680327000010)*t;
+    y[14] += CONSTANT(0.115164716491000000)*t;
+
+    // [7,17]: 16,4,18,
+    tf = CONSTANT(0.140463346187999990)*f[16]+CONSTANT(0.168583882835000000)*f[4]+CONSTANT(0.132725386549000010)*f[18];
+    tg = CONSTANT(0.140463346187999990)*g[16]+CONSTANT(0.168583882835000000)*g[4]+CONSTANT(0.132725386549000010)*g[18];
+    y[7] += tf*g[17]+tg*f[17];
+    y[17] += tf*g[7]+tg*f[7];
+    t = f[7]*g[17]+f[17]*g[7];
+    y[16] += CONSTANT(0.140463346187999990)*t;
+    y[4] += CONSTANT(0.168583882835000000)*t;
+    y[18] += CONSTANT(0.132725386549000010)*t;
+
+    // [7,21]: 8,20,6,22,
+    tf = CONSTANT(-0.063718718433900007)*f[8]+CONSTANT(0.044869370061299998)*f[20]+CONSTANT(0.220728115440999990)*f[6]+CONSTANT(0.090297865408399999)*f[22];
+    tg = CONSTANT(-0.063718718433900007)*g[8]+CONSTANT(0.044869370061299998)*g[20]+CONSTANT(0.220728115440999990)*g[6]+CONSTANT(0.090297865408399999)*g[22];
+    y[7] += tf*g[21]+tg*f[21];
+    y[21] += tf*g[7]+tg*f[7];
+    t = f[7]*g[21]+f[21]*g[7];
+    y[8] += CONSTANT(-0.063718718433900007)*t;
+    y[20] += CONSTANT(0.044869370061299998)*t;
+    y[6] += CONSTANT(0.220728115440999990)*t;
+    y[22] += CONSTANT(0.090297865408399999)*t;
+
+    // [7,23]: 8,22,24,
+    tf = CONSTANT(0.168583882832999990)*f[8]+CONSTANT(0.132725386548000010)*f[22]+CONSTANT(0.140463346189000000)*f[24];
+    tg = CONSTANT(0.168583882832999990)*g[8]+CONSTANT(0.132725386548000010)*g[22]+CONSTANT(0.140463346189000000)*g[24];
+    y[7] += tf*g[23]+tg*f[23];
+    y[23] += tf*g[7]+tg*f[7];
+    t = f[7]*g[23]+f[23]*g[7];
+    y[8] += CONSTANT(0.168583882832999990)*t;
+    y[22] += CONSTANT(0.132725386548000010)*t;
+    y[24] += CONSTANT(0.140463346189000000)*t;
+
+    // [8,8]: 0,6,20,24,
+    tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]+CONSTANT(0.040299255967500003)*f[20]+CONSTANT(0.238413613505999990)*f[24];
+    tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]+CONSTANT(0.040299255967500003)*g[20]+CONSTANT(0.238413613505999990)*g[24];
+    y[8] += tf*g[8]+tg*f[8];
+    t = f[8]*g[8];
+    y[0] += CONSTANT(0.282094791770000020)*t;
+    y[6] += CONSTANT(-0.180223751576000010)*t;
+    y[20] += CONSTANT(0.040299255967500003)*t;
+    y[24] += CONSTANT(0.238413613505999990)*t;
+
+    // [8,22]: 6,20,24,
+    tf = CONSTANT(0.156078347226000000)*f[6]+CONSTANT(-0.190364615029000010)*f[20]+CONSTANT(-0.075080816691500005)*f[24];
+    tg = CONSTANT(0.156078347226000000)*g[6]+CONSTANT(-0.190364615029000010)*g[20]+CONSTANT(-0.075080816691500005)*g[24];
+    y[8] += tf*g[22]+tg*f[22];
+    y[22] += tf*g[8]+tg*f[8];
+    t = f[8]*g[22]+f[22]*g[8];
+    y[6] += CONSTANT(0.156078347226000000)*t;
+    y[20] += CONSTANT(-0.190364615029000010)*t;
+    y[24] += CONSTANT(-0.075080816691500005)*t;
+
+    // [9,9]: 6,0,20,
+    tf = CONSTANT(-0.210261043508000010)*f[6]+CONSTANT(0.282094791766999970)*f[0]+CONSTANT(0.076934943209800002)*f[20];
+    tg = CONSTANT(-0.210261043508000010)*g[6]+CONSTANT(0.282094791766999970)*g[0]+CONSTANT(0.076934943209800002)*g[20];
+    y[9] += tf*g[9]+tg*f[9];
+    t = f[9]*g[9];
+    y[6] += CONSTANT(-0.210261043508000010)*t;
+    y[0] += CONSTANT(0.282094791766999970)*t;
+    y[20] += CONSTANT(0.076934943209800002)*t;
+
+    // [9,10]: 7,21,
+    tf = CONSTANT(0.148677009678999990)*f[7]+CONSTANT(-0.099322584599600000)*f[21];
+    tg = CONSTANT(0.148677009678999990)*g[7]+CONSTANT(-0.099322584599600000)*g[21];
+    y[9] += tf*g[10]+tg*f[10];
+    y[10] += tf*g[9]+tg*f[9];
+    t = f[9]*g[10]+f[10]*g[9];
+    y[7] += CONSTANT(0.148677009678999990)*t;
+    y[21] += CONSTANT(-0.099322584599600000)*t;
+
+    // [9,11]: 8,22,24,
+    tf = CONSTANT(-0.094031597259499999)*f[8]+CONSTANT(0.133255230518000010)*f[22]+CONSTANT(0.117520066950999990)*f[24];
+    tg = CONSTANT(-0.094031597259499999)*g[8]+CONSTANT(0.133255230518000010)*g[22]+CONSTANT(0.117520066950999990)*g[24];
+    y[9] += tf*g[11]+tg*f[11];
+    y[11] += tf*g[9]+tg*f[9];
+    t = f[9]*g[11]+f[11]*g[9];
+    y[8] += CONSTANT(-0.094031597259499999)*t;
+    y[22] += CONSTANT(0.133255230518000010)*t;
+    y[24] += CONSTANT(0.117520066950999990)*t;
+
+    // [9,13]: 4,16,18,
+    tf = CONSTANT(-0.094031597258400004)*f[4]+CONSTANT(-0.117520066953000000)*f[16]+CONSTANT(0.133255230519000010)*f[18];
+    tg = CONSTANT(-0.094031597258400004)*g[4]+CONSTANT(-0.117520066953000000)*g[16]+CONSTANT(0.133255230519000010)*g[18];
+    y[9] += tf*g[13]+tg*f[13];
+    y[13] += tf*g[9]+tg*f[9];
+    t = f[9]*g[13]+f[13]*g[9];
+    y[4] += CONSTANT(-0.094031597258400004)*t;
+    y[16] += CONSTANT(-0.117520066953000000)*t;
+    y[18] += CONSTANT(0.133255230519000010)*t;
+
+    // [9,14]: 5,19,
+    tf = CONSTANT(0.148677009677999990)*f[5]+CONSTANT(-0.099322584600699995)*f[19];
+    tg = CONSTANT(0.148677009677999990)*g[5]+CONSTANT(-0.099322584600699995)*g[19];
+    y[9] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[9]+tg*f[9];
+    t = f[9]*g[14]+f[14]*g[9];
+    y[5] += CONSTANT(0.148677009677999990)*t;
+    y[19] += CONSTANT(-0.099322584600699995)*t;
+
+    // [9,17]: 2,12,
+    tf = CONSTANT(0.162867503964999990)*f[2]+CONSTANT(-0.203550726872999990)*f[12];
+    tg = CONSTANT(0.162867503964999990)*g[2]+CONSTANT(-0.203550726872999990)*g[12];
+    y[9] += tf*g[17]+tg*f[17];
+    y[17] += tf*g[9]+tg*f[9];
+    t = f[9]*g[17]+f[17]*g[9];
+    y[2] += CONSTANT(0.162867503964999990)*t;
+    y[12] += CONSTANT(-0.203550726872999990)*t;
+
+    // [10,10]: 0,20,24,
+    tf = CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.179514867494000000)*f[20]+CONSTANT(-0.151717754049000010)*f[24];
+    tg = CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.179514867494000000)*g[20]+CONSTANT(-0.151717754049000010)*g[24];
+    y[10] += tf*g[10]+tg*f[10];
+    t = f[10]*g[10];
+    y[0] += CONSTANT(0.282094791771999980)*t;
+    y[20] += CONSTANT(-0.179514867494000000)*t;
+    y[24] += CONSTANT(-0.151717754049000010)*t;
+
+    // [10,11]: 7,21,23,
+    tf = CONSTANT(0.115164716490000000)*f[7]+CONSTANT(0.102579924281000000)*f[21]+CONSTANT(-0.067850242288900006)*f[23];
+    tg = CONSTANT(0.115164716490000000)*g[7]+CONSTANT(0.102579924281000000)*g[21]+CONSTANT(-0.067850242288900006)*g[23];
+    y[10] += tf*g[11]+tg*f[11];
+    y[11] += tf*g[10]+tg*f[10];
+    t = f[10]*g[11]+f[11]*g[10];
+    y[7] += CONSTANT(0.115164716490000000)*t;
+    y[21] += CONSTANT(0.102579924281000000)*t;
+    y[23] += CONSTANT(-0.067850242288900006)*t;
+
+    // [10,12]: 4,18,
+    tf = CONSTANT(-0.188063194517999990)*f[4]+CONSTANT(-0.044418410173299998)*f[18];
+    tg = CONSTANT(-0.188063194517999990)*g[4]+CONSTANT(-0.044418410173299998)*g[18];
+    y[10] += tf*g[12]+tg*f[12];
+    y[12] += tf*g[10]+tg*f[10];
+    t = f[10]*g[12]+f[12]*g[10];
+    y[4] += CONSTANT(-0.188063194517999990)*t;
+    y[18] += CONSTANT(-0.044418410173299998)*t;
+
+    // [10,13]: 5,17,19,
+    tf = CONSTANT(0.115164716490000000)*f[5]+CONSTANT(0.067850242288900006)*f[17]+CONSTANT(0.102579924281000000)*f[19];
+    tg = CONSTANT(0.115164716490000000)*g[5]+CONSTANT(0.067850242288900006)*g[17]+CONSTANT(0.102579924281000000)*g[19];
+    y[10] += tf*g[13]+tg*f[13];
+    y[13] += tf*g[10]+tg*f[10];
+    t = f[10]*g[13]+f[13]*g[10];
+    y[5] += CONSTANT(0.115164716490000000)*t;
+    y[17] += CONSTANT(0.067850242288900006)*t;
+    y[19] += CONSTANT(0.102579924281000000)*t;
+
+    // [10,14]: 16,
+    tf = CONSTANT(0.151717754044999990)*f[16];
+    tg = CONSTANT(0.151717754044999990)*g[16];
+    y[10] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[10]+tg*f[10];
+    t = f[10]*g[14]+f[14]*g[10];
+    y[16] += CONSTANT(0.151717754044999990)*t;
+
+    // [10,15]: 5,19,
+    tf = CONSTANT(-0.148677009678999990)*f[5]+CONSTANT(0.099322584599600000)*f[19];
+    tg = CONSTANT(-0.148677009678999990)*g[5]+CONSTANT(0.099322584599600000)*g[19];
+    y[10] += tf*g[15]+tg*f[15];
+    y[15] += tf*g[10]+tg*f[10];
+    t = f[10]*g[15]+f[15]*g[10];
+    y[5] += CONSTANT(-0.148677009678999990)*t;
+    y[19] += CONSTANT(0.099322584599600000)*t;
+
+    // [11,11]: 0,6,8,20,22,
+    tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.126156626101000010)*f[6]+CONSTANT(-0.145673124078999990)*f[8]+CONSTANT(0.025644981070299999)*f[20]+CONSTANT(-0.114687841910000000)*f[22];
+    tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.126156626101000010)*g[6]+CONSTANT(-0.145673124078999990)*g[8]+CONSTANT(0.025644981070299999)*g[20]+CONSTANT(-0.114687841910000000)*g[22];
+    y[11] += tf*g[11]+tg*f[11];
+    t = f[11]*g[11];
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[6] += CONSTANT(0.126156626101000010)*t;
+    y[8] += CONSTANT(-0.145673124078999990)*t;
+    y[20] += CONSTANT(0.025644981070299999)*t;
+    y[22] += CONSTANT(-0.114687841910000000)*t;
+
+    // [11,14]: 17,
+    tf = CONSTANT(0.067850242288500007)*f[17];
+    tg = CONSTANT(0.067850242288500007)*g[17];
+    y[11] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[11]+tg*f[11];
+    t = f[11]*g[14]+f[14]*g[11];
+    y[17] += CONSTANT(0.067850242288500007)*t;
+
+    // [11,15]: 16,
+    tf = CONSTANT(-0.117520066953000000)*f[16];
+    tg = CONSTANT(-0.117520066953000000)*g[16];
+    y[11] += tf*g[15]+tg*f[15];
+    y[15] += tf*g[11]+tg*f[11];
+    t = f[11]*g[15]+f[15]*g[11];
+    y[16] += CONSTANT(-0.117520066953000000)*t;
+
+    // [11,18]: 3,13,15,
+    tf = CONSTANT(0.168583882834000000)*f[3]+CONSTANT(0.114687841909000000)*f[13]+CONSTANT(-0.133255230519000010)*f[15];
+    tg = CONSTANT(0.168583882834000000)*g[3]+CONSTANT(0.114687841909000000)*g[13]+CONSTANT(-0.133255230519000010)*g[15];
+    y[11] += tf*g[18]+tg*f[18];
+    y[18] += tf*g[11]+tg*f[11];
+    t = f[11]*g[18]+f[18]*g[11];
+    y[3] += CONSTANT(0.168583882834000000)*t;
+    y[13] += CONSTANT(0.114687841909000000)*t;
+    y[15] += CONSTANT(-0.133255230519000010)*t;
+
+    // [11,19]: 2,14,12,
+    tf = CONSTANT(0.238413613504000000)*f[2]+CONSTANT(-0.102579924282000000)*f[14]+CONSTANT(0.099322584599300004)*f[12];
+    tg = CONSTANT(0.238413613504000000)*g[2]+CONSTANT(-0.102579924282000000)*g[14]+CONSTANT(0.099322584599300004)*g[12];
+    y[11] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[11]+tg*f[11];
+    t = f[11]*g[19]+f[19]*g[11];
+    y[2] += CONSTANT(0.238413613504000000)*t;
+    y[14] += CONSTANT(-0.102579924282000000)*t;
+    y[12] += CONSTANT(0.099322584599300004)*t;
+
+    // [12,12]: 0,6,20,
+    tf = CONSTANT(0.282094799871999980)*f[0]+CONSTANT(0.168208852954000010)*f[6]+CONSTANT(0.153869910786000010)*f[20];
+    tg = CONSTANT(0.282094799871999980)*g[0]+CONSTANT(0.168208852954000010)*g[6]+CONSTANT(0.153869910786000010)*g[20];
+    y[12] += tf*g[12]+tg*f[12];
+    t = f[12]*g[12];
+    y[0] += CONSTANT(0.282094799871999980)*t;
+    y[6] += CONSTANT(0.168208852954000010)*t;
+    y[20] += CONSTANT(0.153869910786000010)*t;
+
+    // [12,14]: 8,22,
+    tf = CONSTANT(-0.188063194517999990)*f[8]+CONSTANT(-0.044418410173299998)*f[22];
+    tg = CONSTANT(-0.188063194517999990)*g[8]+CONSTANT(-0.044418410173299998)*g[22];
+    y[12] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[12]+tg*f[12];
+    t = f[12]*g[14]+f[14]*g[12];
+    y[8] += CONSTANT(-0.188063194517999990)*t;
+    y[22] += CONSTANT(-0.044418410173299998)*t;
+
+    // [13,13]: 0,8,6,20,22,
+    tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.145673124078999990)*f[8]+CONSTANT(0.126156626101000010)*f[6]+CONSTANT(0.025644981070299999)*f[20]+CONSTANT(0.114687841910000000)*f[22];
+    tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.145673124078999990)*g[8]+CONSTANT(0.126156626101000010)*g[6]+CONSTANT(0.025644981070299999)*g[20]+CONSTANT(0.114687841910000000)*g[22];
+    y[13] += tf*g[13]+tg*f[13];
+    t = f[13]*g[13];
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[8] += CONSTANT(0.145673124078999990)*t;
+    y[6] += CONSTANT(0.126156626101000010)*t;
+    y[20] += CONSTANT(0.025644981070299999)*t;
+    y[22] += CONSTANT(0.114687841910000000)*t;
+
+    // [13,14]: 23,
+    tf = CONSTANT(0.067850242288500007)*f[23];
+    tg = CONSTANT(0.067850242288500007)*g[23];
+    y[13] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[13]+tg*f[13];
+    t = f[13]*g[14]+f[14]*g[13];
+    y[23] += CONSTANT(0.067850242288500007)*t;
+
+    // [13,15]: 8,22,24,
+    tf = CONSTANT(-0.094031597259499999)*f[8]+CONSTANT(0.133255230518000010)*f[22]+CONSTANT(-0.117520066950999990)*f[24];
+    tg = CONSTANT(-0.094031597259499999)*g[8]+CONSTANT(0.133255230518000010)*g[22]+CONSTANT(-0.117520066950999990)*g[24];
+    y[13] += tf*g[15]+tg*f[15];
+    y[15] += tf*g[13]+tg*f[13];
+    t = f[13]*g[15]+f[15]*g[13];
+    y[8] += CONSTANT(-0.094031597259499999)*t;
+    y[22] += CONSTANT(0.133255230518000010)*t;
+    y[24] += CONSTANT(-0.117520066950999990)*t;
+
+    // [13,21]: 2,12,14,
+    tf = CONSTANT(0.238413613504000000)*f[2]+CONSTANT(0.099322584599300004)*f[12]+CONSTANT(0.102579924282000000)*f[14];
+    tg = CONSTANT(0.238413613504000000)*g[2]+CONSTANT(0.099322584599300004)*g[12]+CONSTANT(0.102579924282000000)*g[14];
+    y[13] += tf*g[21]+tg*f[21];
+    y[21] += tf*g[13]+tg*f[13];
+    t = f[13]*g[21]+f[21]*g[13];
+    y[2] += CONSTANT(0.238413613504000000)*t;
+    y[12] += CONSTANT(0.099322584599300004)*t;
+    y[14] += CONSTANT(0.102579924282000000)*t;
+
+    // [14,14]: 0,20,24,
+    tf = CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.179514867494000000)*f[20]+CONSTANT(0.151717754049000010)*f[24];
+    tg = CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.179514867494000000)*g[20]+CONSTANT(0.151717754049000010)*g[24];
+    y[14] += tf*g[14]+tg*f[14];
+    t = f[14]*g[14];
+    y[0] += CONSTANT(0.282094791771999980)*t;
+    y[20] += CONSTANT(-0.179514867494000000)*t;
+    y[24] += CONSTANT(0.151717754049000010)*t;
+
+    // [14,15]: 7,21,
+    tf = CONSTANT(0.148677009677999990)*f[7]+CONSTANT(-0.099322584600699995)*f[21];
+    tg = CONSTANT(0.148677009677999990)*g[7]+CONSTANT(-0.099322584600699995)*g[21];
+    y[14] += tf*g[15]+tg*f[15];
+    y[15] += tf*g[14]+tg*f[14];
+    t = f[14]*g[15]+f[15]*g[14];
+    y[7] += CONSTANT(0.148677009677999990)*t;
+    y[21] += CONSTANT(-0.099322584600699995)*t;
+
+    // [15,15]: 0,6,20,
+    tf = CONSTANT(0.282094791766999970)*f[0]+CONSTANT(-0.210261043508000010)*f[6]+CONSTANT(0.076934943209800002)*f[20];
+    tg = CONSTANT(0.282094791766999970)*g[0]+CONSTANT(-0.210261043508000010)*g[6]+CONSTANT(0.076934943209800002)*g[20];
+    y[15] += tf*g[15]+tg*f[15];
+    t = f[15]*g[15];
+    y[0] += CONSTANT(0.282094791766999970)*t;
+    y[6] += CONSTANT(-0.210261043508000010)*t;
+    y[20] += CONSTANT(0.076934943209800002)*t;
+
+    // [15,23]: 12,2,
+    tf = CONSTANT(-0.203550726872999990)*f[12]+CONSTANT(0.162867503964999990)*f[2];
+    tg = CONSTANT(-0.203550726872999990)*g[12]+CONSTANT(0.162867503964999990)*g[2];
+    y[15] += tf*g[23]+tg*f[23];
+    y[23] += tf*g[15]+tg*f[15];
+    t = f[15]*g[23]+f[23]*g[15];
+    y[12] += CONSTANT(-0.203550726872999990)*t;
+    y[2] += CONSTANT(0.162867503964999990)*t;
+
+    // [16,16]: 0,6,20,
+    tf = CONSTANT(0.282094791763999990)*f[0]+CONSTANT(-0.229375683829000000)*f[6]+CONSTANT(0.106525305981000000)*f[20];
+    tg = CONSTANT(0.282094791763999990)*g[0]+CONSTANT(-0.229375683829000000)*g[6]+CONSTANT(0.106525305981000000)*g[20];
+    y[16] += tf*g[16]+tg*f[16];
+    t = f[16]*g[16];
+    y[0] += CONSTANT(0.282094791763999990)*t;
+    y[6] += CONSTANT(-0.229375683829000000)*t;
+    y[20] += CONSTANT(0.106525305981000000)*t;
+
+    // [16,18]: 8,22,
+    tf = CONSTANT(-0.075080816693699995)*f[8]+CONSTANT(0.135045473380000000)*f[22];
+    tg = CONSTANT(-0.075080816693699995)*g[8]+CONSTANT(0.135045473380000000)*g[22];
+    y[16] += tf*g[18]+tg*f[18];
+    y[18] += tf*g[16]+tg*f[16];
+    t = f[16]*g[18]+f[18]*g[16];
+    y[8] += CONSTANT(-0.075080816693699995)*t;
+    y[22] += CONSTANT(0.135045473380000000)*t;
+
+    // [16,23]: 19,5,
+    tf = CONSTANT(-0.119098912754999990)*f[19]+CONSTANT(0.140463346187999990)*f[5];
+    tg = CONSTANT(-0.119098912754999990)*g[19]+CONSTANT(0.140463346187999990)*g[5];
+    y[16] += tf*g[23]+tg*f[23];
+    y[23] += tf*g[16]+tg*f[16];
+    t = f[16]*g[23]+f[23]*g[16];
+    y[19] += CONSTANT(-0.119098912754999990)*t;
+    y[5] += CONSTANT(0.140463346187999990)*t;
+
+    // [17,17]: 0,6,20,
+    tf = CONSTANT(0.282094791768999990)*f[0]+CONSTANT(-0.057343920955899998)*f[6]+CONSTANT(-0.159787958979000000)*f[20];
+    tg = CONSTANT(0.282094791768999990)*g[0]+CONSTANT(-0.057343920955899998)*g[6]+CONSTANT(-0.159787958979000000)*g[20];
+    y[17] += tf*g[17]+tg*f[17];
+    t = f[17]*g[17];
+    y[0] += CONSTANT(0.282094791768999990)*t;
+    y[6] += CONSTANT(-0.057343920955899998)*t;
+    y[20] += CONSTANT(-0.159787958979000000)*t;
+
+    // [17,19]: 8,22,24,
+    tf = CONSTANT(-0.112621225039000000)*f[8]+CONSTANT(0.045015157794100001)*f[22]+CONSTANT(0.119098912753000000)*f[24];
+    tg = CONSTANT(-0.112621225039000000)*g[8]+CONSTANT(0.045015157794100001)*g[22]+CONSTANT(0.119098912753000000)*g[24];
+    y[17] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[17]+tg*f[17];
+    t = f[17]*g[19]+f[19]*g[17];
+    y[8] += CONSTANT(-0.112621225039000000)*t;
+    y[22] += CONSTANT(0.045015157794100001)*t;
+    y[24] += CONSTANT(0.119098912753000000)*t;
+
+    // [17,21]: 16,4,18,
+    tf = CONSTANT(-0.119098912754999990)*f[16]+CONSTANT(-0.112621225039000000)*f[4]+CONSTANT(0.045015157794399997)*f[18];
+    tg = CONSTANT(-0.119098912754999990)*g[16]+CONSTANT(-0.112621225039000000)*g[4]+CONSTANT(0.045015157794399997)*g[18];
+    y[17] += tf*g[21]+tg*f[21];
+    y[21] += tf*g[17]+tg*f[17];
+    t = f[17]*g[21]+f[21]*g[17];
+    y[16] += CONSTANT(-0.119098912754999990)*t;
+    y[4] += CONSTANT(-0.112621225039000000)*t;
+    y[18] += CONSTANT(0.045015157794399997)*t;
+
+    // [18,18]: 6,0,20,24,
+    tf = CONSTANT(0.065535909662600006)*f[6]+CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.083698454702400005)*f[20]+CONSTANT(-0.135045473384000000)*f[24];
+    tg = CONSTANT(0.065535909662600006)*g[6]+CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.083698454702400005)*g[20]+CONSTANT(-0.135045473384000000)*g[24];
+    y[18] += tf*g[18]+tg*f[18];
+    t = f[18]*g[18];
+    y[6] += CONSTANT(0.065535909662600006)*t;
+    y[0] += CONSTANT(0.282094791771999980)*t;
+    y[20] += CONSTANT(-0.083698454702400005)*t;
+    y[24] += CONSTANT(-0.135045473384000000)*t;
+
+    // [18,19]: 7,21,23,
+    tf = CONSTANT(0.090297865407399994)*f[7]+CONSTANT(0.102084782359000000)*f[21]+CONSTANT(-0.045015157794399997)*f[23];
+    tg = CONSTANT(0.090297865407399994)*g[7]+CONSTANT(0.102084782359000000)*g[21]+CONSTANT(-0.045015157794399997)*g[23];
+    y[18] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[18]+tg*f[18];
+    t = f[18]*g[19]+f[19]*g[18];
+    y[7] += CONSTANT(0.090297865407399994)*t;
+    y[21] += CONSTANT(0.102084782359000000)*t;
+    y[23] += CONSTANT(-0.045015157794399997)*t;
+
+    // [19,19]: 6,8,0,20,22,
+    tf = CONSTANT(0.139263808033999990)*f[6]+CONSTANT(-0.141889406570999990)*f[8]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.068480553847200004)*f[20]+CONSTANT(-0.102084782360000000)*f[22];
+    tg = CONSTANT(0.139263808033999990)*g[6]+CONSTANT(-0.141889406570999990)*g[8]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.068480553847200004)*g[20]+CONSTANT(-0.102084782360000000)*g[22];
+    y[19] += tf*g[19]+tg*f[19];
+    t = f[19]*g[19];
+    y[6] += CONSTANT(0.139263808033999990)*t;
+    y[8] += CONSTANT(-0.141889406570999990)*t;
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[20] += CONSTANT(0.068480553847200004)*t;
+    y[22] += CONSTANT(-0.102084782360000000)*t;
+
+    // [20,20]: 6,0,20,
+    tf = CONSTANT(0.163839797503000010)*f[6]+CONSTANT(0.282094802232000010)*f[0];
+    tg = CONSTANT(0.163839797503000010)*g[6]+CONSTANT(0.282094802232000010)*g[0];
+    y[20] += tf*g[20]+tg*f[20];
+    t = f[20]*g[20];
+    y[6] += CONSTANT(0.163839797503000010)*t;
+    y[0] += CONSTANT(0.282094802232000010)*t;
+    y[20] += CONSTANT(0.136961139005999990)*t;
+
+    // [21,21]: 6,20,0,8,22,
+    tf = CONSTANT(0.139263808033999990)*f[6]+CONSTANT(0.068480553847200004)*f[20]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.141889406570999990)*f[8]+CONSTANT(0.102084782360000000)*f[22];
+    tg = CONSTANT(0.139263808033999990)*g[6]+CONSTANT(0.068480553847200004)*g[20]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.141889406570999990)*g[8]+CONSTANT(0.102084782360000000)*g[22];
+    y[21] += tf*g[21]+tg*f[21];
+    t = f[21]*g[21];
+    y[6] += CONSTANT(0.139263808033999990)*t;
+    y[20] += CONSTANT(0.068480553847200004)*t;
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[8] += CONSTANT(0.141889406570999990)*t;
+    y[22] += CONSTANT(0.102084782360000000)*t;
+
+    // [21,23]: 8,22,24,
+    tf = CONSTANT(-0.112621225039000000)*f[8]+CONSTANT(0.045015157794100001)*f[22]+CONSTANT(-0.119098912753000000)*f[24];
+    tg = CONSTANT(-0.112621225039000000)*g[8]+CONSTANT(0.045015157794100001)*g[22]+CONSTANT(-0.119098912753000000)*g[24];
+    y[21] += tf*g[23]+tg*f[23];
+    y[23] += tf*g[21]+tg*f[21];
+    t = f[21]*g[23]+f[23]*g[21];
+    y[8] += CONSTANT(-0.112621225039000000)*t;
+    y[22] += CONSTANT(0.045015157794100001)*t;
+    y[24] += CONSTANT(-0.119098912753000000)*t;
+
+    // [22,22]: 6,20,0,24,
+    tf = CONSTANT(0.065535909662600006)*f[6]+CONSTANT(-0.083698454702400005)*f[20]+CONSTANT(0.282094791771999980)*f[0]+CONSTANT(0.135045473384000000)*f[24];
+    tg = CONSTANT(0.065535909662600006)*g[6]+CONSTANT(-0.083698454702400005)*g[20]+CONSTANT(0.282094791771999980)*g[0]+CONSTANT(0.135045473384000000)*g[24];
+    y[22] += tf*g[22]+tg*f[22];
+    t = f[22]*g[22];
+    y[6] += CONSTANT(0.065535909662600006)*t;
+    y[20] += CONSTANT(-0.083698454702400005)*t;
+    y[0] += CONSTANT(0.282094791771999980)*t;
+    y[24] += CONSTANT(0.135045473384000000)*t;
+
+    // [23,23]: 6,20,0,
+    tf = CONSTANT(-0.057343920955899998)*f[6]+CONSTANT(-0.159787958979000000)*f[20]+CONSTANT(0.282094791768999990)*f[0];
+    tg = CONSTANT(-0.057343920955899998)*g[6]+CONSTANT(-0.159787958979000000)*g[20]+CONSTANT(0.282094791768999990)*g[0];
+    y[23] += tf*g[23]+tg*f[23];
+    t = f[23]*g[23];
+    y[6] += CONSTANT(-0.057343920955899998)*t;
+    y[20] += CONSTANT(-0.159787958979000000)*t;
+    y[0] += CONSTANT(0.282094791768999990)*t;
+
+    // [24,24]: 6,0,20,
+    tf = CONSTANT(-0.229375683829000000)*f[6]+CONSTANT(0.282094791763999990)*f[0]+CONSTANT(0.106525305981000000)*f[20];
+    tg = CONSTANT(-0.229375683829000000)*g[6]+CONSTANT(0.282094791763999990)*g[0]+CONSTANT(0.106525305981000000)*g[20];
+    y[24] += tf*g[24]+tg*f[24];
+    t = f[24]*g[24];
+    y[6] += CONSTANT(-0.229375683829000000)*t;
+    y[0] += CONSTANT(0.282094791763999990)*t;
+    y[20] += CONSTANT(0.106525305981000000)*t;
+
+    // multiply count=1135
+
+    return y;
+}
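+
+// The kernels grow quickly with order: the emitted scalar multiply counts are
+// 120 for XMSHMultiply3, 399 for XMSHMultiply4, and 1135 for XMSHMultiply5
+// (see the "multiply count" comments), so callers doing many products per
+// frame may prefer the lowest order that meets their quality needs.
+// XMSHMultiply6 below follows the same pattern over 36 coefficients.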
CONSTANT(0.230329432973999990)*f[15]+CONSTANT(-0.034723468517399998)*f[33]+CONSTANT(-0.232932108051999990)*f[35]; + tg = CONSTANT(0.230329432973999990)*g[15]+CONSTANT(-0.034723468517399998)*g[33]+CONSTANT(-0.232932108051999990)*g[35]; + y[1] += tf*g[16]+tg*f[16]; + y[16] = tf*g[1]+tg*f[1]; + t = f[1]*g[16]+f[16]*g[1]; + y[15] += CONSTANT(0.230329432973999990)*t; + y[33] = CONSTANT(-0.034723468517399998)*t; + y[35] = CONSTANT(-0.232932108051999990)*t; + + // [1,18]: 15,13,31,33, + tf = CONSTANT(0.043528171377799997)*f[15]+CONSTANT(0.168583882834000000)*f[13]+CONSTANT(-0.085054779966799998)*f[31]+CONSTANT(-0.183739324705999990)*f[33]; + tg = CONSTANT(0.043528171377799997)*g[15]+CONSTANT(0.168583882834000000)*g[13]+CONSTANT(-0.085054779966799998)*g[31]+CONSTANT(-0.183739324705999990)*g[33]; + y[1] += tf*g[18]+tg*f[18]; + y[18] = tf*g[1]+tg*f[1]; + t = f[1]*g[18]+f[18]*g[1]; + y[15] += CONSTANT(0.043528171377799997)*t; + y[13] += CONSTANT(0.168583882834000000)*t; + y[31] = CONSTANT(-0.085054779966799998)*t; + y[33] += CONSTANT(-0.183739324705999990)*t; + + // [1,19]: 14,12,30,32, + tf = CONSTANT(0.075393004386399995)*f[14]+CONSTANT(0.194663900273000010)*f[12]+CONSTANT(-0.155288072037000010)*f[30]+CONSTANT(-0.159122922869999990)*f[32]; + tg = CONSTANT(0.075393004386399995)*g[14]+CONSTANT(0.194663900273000010)*g[12]+CONSTANT(-0.155288072037000010)*g[30]+CONSTANT(-0.159122922869999990)*g[32]; + y[1] += tf*g[19]+tg*f[19]; + y[19] = tf*g[1]+tg*f[1]; + t = f[1]*g[19]+f[19]*g[1]; + y[14] = CONSTANT(0.075393004386399995)*t; + y[12] += CONSTANT(0.194663900273000010)*t; + y[30] = CONSTANT(-0.155288072037000010)*t; + y[32] = CONSTANT(-0.159122922869999990)*t; + + // [1,24]: 9,25,27, + tf = CONSTANT(-0.230329432978999990)*f[9]+CONSTANT(0.232932108049000000)*f[25]+CONSTANT(0.034723468517100002)*f[27]; + tg = CONSTANT(-0.230329432978999990)*g[9]+CONSTANT(0.232932108049000000)*g[25]+CONSTANT(0.034723468517100002)*g[27]; + y[1] += tf*g[24]+tg*f[24]; + y[24] = tf*g[1]+tg*f[1]; + t = f[1]*g[24]+f[24]*g[1]; + y[9] = CONSTANT(-0.230329432978999990)*t; + y[25] = CONSTANT(0.232932108049000000)*t; + y[27] = CONSTANT(0.034723468517100002)*t; + + // [1,29]: 22,20, + tf = CONSTANT(0.085054779965999999)*f[22]+CONSTANT(0.190188269815000010)*f[20]; + tg = CONSTANT(0.085054779965999999)*g[22]+CONSTANT(0.190188269815000010)*g[20]; + y[1] += tf*g[29]+tg*f[29]; + y[29] = tf*g[1]+tg*f[1]; + t = f[1]*g[29]+f[29]*g[1]; + y[22] += CONSTANT(0.085054779965999999)*t; + y[20] += CONSTANT(0.190188269815000010)*t; + + // [2,2]: 0,6, + tf = CONSTANT(0.282094795249000000)*f[0]+CONSTANT(0.252313259986999990)*f[6]; + tg = CONSTANT(0.282094795249000000)*g[0]+CONSTANT(0.252313259986999990)*g[6]; + y[2] += tf*g[2]+tg*f[2]; + t = f[2]*g[2]; + y[0] += CONSTANT(0.282094795249000000)*t; + y[6] += CONSTANT(0.252313259986999990)*t; + + // [2,12]: 6,20, + tf = CONSTANT(0.247766706973999990)*f[6]+CONSTANT(0.246232537174000010)*f[20]; + tg = CONSTANT(0.247766706973999990)*g[6]+CONSTANT(0.246232537174000010)*g[20]; + y[2] += tf*g[12]+tg*f[12]; + y[12] += tf*g[2]+tg*f[2]; + t = f[2]*g[12]+f[12]*g[2]; + y[6] += CONSTANT(0.247766706973999990)*t; + y[20] += CONSTANT(0.246232537174000010)*t; + + // [2,20]: 30, + tf = CONSTANT(0.245532020560000010)*f[30]; + tg = CONSTANT(0.245532020560000010)*g[30]; + y[2] += tf*g[20]+tg*f[20]; + y[20] += tf*g[2]+tg*f[2]; + t = f[2]*g[20]+f[20]*g[2]; + y[30] += CONSTANT(0.245532020560000010)*t; + + // [3,3]: 0,6,8, + tf = 
CONSTANT(0.282094791773000010)*f[0]+CONSTANT(-0.126156626101000010)*f[6]+CONSTANT(0.218509686119999990)*f[8]; + tg = CONSTANT(0.282094791773000010)*g[0]+CONSTANT(-0.126156626101000010)*g[6]+CONSTANT(0.218509686119999990)*g[8]; + y[3] += tf*g[3]+tg*f[3]; + t = f[3]*g[3]; + y[0] += CONSTANT(0.282094791773000010)*t; + y[6] += CONSTANT(-0.126156626101000010)*t; + y[8] += CONSTANT(0.218509686119999990)*t; + + // [3,7]: 2,12, + tf = CONSTANT(0.218509686118000010)*f[2]+CONSTANT(-0.143048168103000000)*f[12]; + tg = CONSTANT(0.218509686118000010)*g[2]+CONSTANT(-0.143048168103000000)*g[12]; + y[3] += tf*g[7]+tg*f[7]; + y[7] = tf*g[3]+tg*f[3]; + t = f[3]*g[7]+f[7]*g[3]; + y[2] += CONSTANT(0.218509686118000010)*t; + y[12] += CONSTANT(-0.143048168103000000)*t; + + // [3,13]: 8,6,20,22, + tf = CONSTANT(-0.058399170081799998)*f[8]+CONSTANT(0.202300659402999990)*f[6]+CONSTANT(-0.150786008773000000)*f[20]+CONSTANT(0.168583882836999990)*f[22]; + tg = CONSTANT(-0.058399170081799998)*g[8]+CONSTANT(0.202300659402999990)*g[6]+CONSTANT(-0.150786008773000000)*g[20]+CONSTANT(0.168583882836999990)*g[22]; + y[3] += tf*g[13]+tg*f[13]; + y[13] += tf*g[3]+tg*f[3]; + t = f[3]*g[13]+f[13]*g[3]; + y[8] += CONSTANT(-0.058399170081799998)*t; + y[6] += CONSTANT(0.202300659402999990)*t; + y[20] += CONSTANT(-0.150786008773000000)*t; + y[22] += CONSTANT(0.168583882836999990)*t; + + // [3,16]: 9,25,27, + tf = CONSTANT(0.230329432973999990)*f[9]+CONSTANT(0.232932108051999990)*f[25]+CONSTANT(-0.034723468517399998)*f[27]; + tg = CONSTANT(0.230329432973999990)*g[9]+CONSTANT(0.232932108051999990)*g[25]+CONSTANT(-0.034723468517399998)*g[27]; + y[3] += tf*g[16]+tg*f[16]; + y[16] += tf*g[3]+tg*f[3]; + t = f[3]*g[16]+f[16]*g[3]; + y[9] += CONSTANT(0.230329432973999990)*t; + y[25] += CONSTANT(0.232932108051999990)*t; + y[27] += CONSTANT(-0.034723468517399998)*t; + + // [3,21]: 12,14,30,32, + tf = CONSTANT(0.194663900273000010)*f[12]+CONSTANT(-0.075393004386399995)*f[14]+CONSTANT(-0.155288072037000010)*f[30]+CONSTANT(0.159122922869999990)*f[32]; + tg = CONSTANT(0.194663900273000010)*g[12]+CONSTANT(-0.075393004386399995)*g[14]+CONSTANT(-0.155288072037000010)*g[30]+CONSTANT(0.159122922869999990)*g[32]; + y[3] += tf*g[21]+tg*f[21]; + y[21] = tf*g[3]+tg*f[3]; + t = f[3]*g[21]+f[21]*g[3]; + y[12] += CONSTANT(0.194663900273000010)*t; + y[14] += CONSTANT(-0.075393004386399995)*t; + y[30] += CONSTANT(-0.155288072037000010)*t; + y[32] += CONSTANT(0.159122922869999990)*t; + + // [3,24]: 15,33,35, + tf = CONSTANT(0.230329432978999990)*f[15]+CONSTANT(-0.034723468517100002)*f[33]+CONSTANT(0.232932108049000000)*f[35]; + tg = CONSTANT(0.230329432978999990)*g[15]+CONSTANT(-0.034723468517100002)*g[33]+CONSTANT(0.232932108049000000)*g[35]; + y[3] += tf*g[24]+tg*f[24]; + y[24] += tf*g[3]+tg*f[3]; + t = f[3]*g[24]+f[24]*g[3]; + y[15] += CONSTANT(0.230329432978999990)*t; + y[33] += CONSTANT(-0.034723468517100002)*t; + y[35] += CONSTANT(0.232932108049000000)*t; + + // [3,31]: 20,22, + tf = CONSTANT(0.190188269815000010)*f[20]+CONSTANT(-0.085054779965999999)*f[22]; + tg = CONSTANT(0.190188269815000010)*g[20]+CONSTANT(-0.085054779965999999)*g[22]; + y[3] += tf*g[31]+tg*f[31]; + y[31] += tf*g[3]+tg*f[3]; + t = f[3]*g[31]+f[31]*g[3]; + y[20] += CONSTANT(0.190188269815000010)*t; + y[22] += CONSTANT(-0.085054779965999999)*t; + + // [4,4]: 0,6,20,24, + tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]+CONSTANT(0.040299255967500003)*f[20]+CONSTANT(-0.238413613505999990)*f[24]; + tg = 
CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]+CONSTANT(0.040299255967500003)*g[20]+CONSTANT(-0.238413613505999990)*g[24]; + y[4] += tf*g[4]+tg*f[4]; + t = f[4]*g[4]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[6] += CONSTANT(-0.180223751576000010)*t; + y[20] += CONSTANT(0.040299255967500003)*t; + y[24] += CONSTANT(-0.238413613505999990)*t; + + // [4,5]: 7,21,23, + tf = CONSTANT(0.156078347226000000)*f[7]+CONSTANT(-0.063718718434399996)*f[21]+CONSTANT(-0.168583882835000000)*f[23]; + tg = CONSTANT(0.156078347226000000)*g[7]+CONSTANT(-0.063718718434399996)*g[21]+CONSTANT(-0.168583882835000000)*g[23]; + y[4] += tf*g[5]+tg*f[5]; + y[5] += tf*g[4]+tg*f[4]; + t = f[4]*g[5]+f[5]*g[4]; + y[7] += CONSTANT(0.156078347226000000)*t; + y[21] += CONSTANT(-0.063718718434399996)*t; + y[23] = CONSTANT(-0.168583882835000000)*t; + + // [4,9]: 3,13,31,35, + tf = CONSTANT(0.226179013157999990)*f[3]+CONSTANT(-0.094031597258400004)*f[13]+CONSTANT(0.016943317729299998)*f[31]+CONSTANT(-0.245532000542000000)*f[35]; + tg = CONSTANT(0.226179013157999990)*g[3]+CONSTANT(-0.094031597258400004)*g[13]+CONSTANT(0.016943317729299998)*g[31]+CONSTANT(-0.245532000542000000)*g[35]; + y[4] += tf*g[9]+tg*f[9]; + y[9] += tf*g[4]+tg*f[4]; + t = f[4]*g[9]+f[9]*g[4]; + y[3] += CONSTANT(0.226179013157999990)*t; + y[13] += CONSTANT(-0.094031597258400004)*t; + y[31] += CONSTANT(0.016943317729299998)*t; + y[35] += CONSTANT(-0.245532000542000000)*t; + + // [4,10]: 2,12,30,34, + tf = CONSTANT(0.184674390919999990)*f[2]+CONSTANT(-0.188063194517999990)*f[12]+CONSTANT(0.053579475144400000)*f[30]+CONSTANT(-0.190188269816000010)*f[34]; + tg = CONSTANT(0.184674390919999990)*g[2]+CONSTANT(-0.188063194517999990)*g[12]+CONSTANT(0.053579475144400000)*g[30]+CONSTANT(-0.190188269816000010)*g[34]; + y[4] += tf*g[10]+tg*f[10]; + y[10] = tf*g[4]+tg*f[4]; + t = f[4]*g[10]+f[10]*g[4]; + y[2] += CONSTANT(0.184674390919999990)*t; + y[12] += CONSTANT(-0.188063194517999990)*t; + y[30] += CONSTANT(0.053579475144400000)*t; + y[34] = CONSTANT(-0.190188269816000010)*t; + + // [4,11]: 3,13,15,31,33, + tf = CONSTANT(-0.058399170082300000)*f[3]+CONSTANT(0.145673124078000010)*f[13]+CONSTANT(0.094031597258400004)*f[15]+CONSTANT(-0.065621187395699998)*f[31]+CONSTANT(-0.141757966610000010)*f[33]; + tg = CONSTANT(-0.058399170082300000)*g[3]+CONSTANT(0.145673124078000010)*g[13]+CONSTANT(0.094031597258400004)*g[15]+CONSTANT(-0.065621187395699998)*g[31]+CONSTANT(-0.141757966610000010)*g[33]; + y[4] += tf*g[11]+tg*f[11]; + y[11] += tf*g[4]+tg*f[4]; + t = f[4]*g[11]+f[11]*g[4]; + y[3] += CONSTANT(-0.058399170082300000)*t; + y[13] += CONSTANT(0.145673124078000010)*t; + y[15] += CONSTANT(0.094031597258400004)*t; + y[31] += CONSTANT(-0.065621187395699998)*t; + y[33] += CONSTANT(-0.141757966610000010)*t; + + // [4,16]: 8,22, + tf = CONSTANT(0.238413613494000000)*f[8]+CONSTANT(-0.075080816693699995)*f[22]; + tg = CONSTANT(0.238413613494000000)*g[8]+CONSTANT(-0.075080816693699995)*g[22]; + y[4] += tf*g[16]+tg*f[16]; + y[16] += tf*g[4]+tg*f[4]; + t = f[4]*g[16]+f[16]*g[4]; + y[8] += CONSTANT(0.238413613494000000)*t; + y[22] += CONSTANT(-0.075080816693699995)*t; + + // [4,18]: 6,20,24, + tf = CONSTANT(0.156078347226000000)*f[6]+CONSTANT(-0.190364615029000010)*f[20]+CONSTANT(0.075080816691500005)*f[24]; + tg = CONSTANT(0.156078347226000000)*g[6]+CONSTANT(-0.190364615029000010)*g[20]+CONSTANT(0.075080816691500005)*g[24]; + y[4] += tf*g[18]+tg*f[18]; + y[18] += tf*g[4]+tg*f[4]; + t = f[4]*g[18]+f[18]*g[4]; + y[6] += CONSTANT(0.156078347226000000)*t; + y[20] 
+    y[24] += CONSTANT(0.075080816691500005)*t;
+
+    // [4,19]: 7,21,23,
+    tf = CONSTANT(-0.063718718434399996)*f[7]+CONSTANT(0.141889406569999990)*f[21]+CONSTANT(0.112621225039000000)*f[23];
+    tg = CONSTANT(-0.063718718434399996)*g[7]+CONSTANT(0.141889406569999990)*g[21]+CONSTANT(0.112621225039000000)*g[23];
+    y[4] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[4]+tg*f[4];
+    t = f[4]*g[19]+f[19]*g[4];
+    y[7] += CONSTANT(-0.063718718434399996)*t;
+    y[21] += CONSTANT(0.141889406569999990)*t;
+    y[23] += CONSTANT(0.112621225039000000)*t;
+
+    // [4,25]: 15,33,
+    tf = CONSTANT(0.245532000542000000)*f[15]+CONSTANT(-0.062641347680800000)*f[33];
+    tg = CONSTANT(0.245532000542000000)*g[15]+CONSTANT(-0.062641347680800000)*g[33];
+    y[4] += tf*g[25]+tg*f[25];
+    y[25] += tf*g[4]+tg*f[4];
+    t = f[4]*g[25]+f[25]*g[4];
+    y[15] += CONSTANT(0.245532000542000000)*t;
+    y[33] += CONSTANT(-0.062641347680800000)*t;
+
+    // [4,26]: 14,32,
+    tf = CONSTANT(0.190188269806999990)*f[14]+CONSTANT(-0.097043558542400002)*f[32];
+    tg = CONSTANT(0.190188269806999990)*g[14]+CONSTANT(-0.097043558542400002)*g[32];
+    y[4] += tf*g[26]+tg*f[26];
+    y[26] = tf*g[4]+tg*f[4];
+    t = f[4]*g[26]+f[26]*g[4];
+    y[14] += CONSTANT(0.190188269806999990)*t;
+    y[32] += CONSTANT(-0.097043558542400002)*t;
+
+    // [4,27]: 13,31,35,
+    tf = CONSTANT(0.141757966610000010)*f[13]+CONSTANT(-0.121034582549000000)*f[31]+CONSTANT(0.062641347680800000)*f[35];
+    tg = CONSTANT(0.141757966610000010)*g[13]+CONSTANT(-0.121034582549000000)*g[31]+CONSTANT(0.062641347680800000)*g[35];
+    y[4] += tf*g[27]+tg*f[27];
+    y[27] += tf*g[4]+tg*f[4];
+    t = f[4]*g[27]+f[27]*g[4];
+    y[13] += CONSTANT(0.141757966610000010)*t;
+    y[31] += CONSTANT(-0.121034582549000000)*t;
+    y[35] += CONSTANT(0.062641347680800000)*t;
+
+    // [4,28]: 12,30,34,
+    tf = CONSTANT(0.141757966609000000)*f[12]+CONSTANT(-0.191372478254000000)*f[30]+CONSTANT(0.097043558538899996)*f[34];
+    tg = CONSTANT(0.141757966609000000)*g[12]+CONSTANT(-0.191372478254000000)*g[30]+CONSTANT(0.097043558538899996)*g[34];
+    y[4] += tf*g[28]+tg*f[28];
+    y[28] = tf*g[4]+tg*f[4];
+    t = f[4]*g[28]+f[28]*g[4];
+    y[12] += CONSTANT(0.141757966609000000)*t;
+    y[30] += CONSTANT(-0.191372478254000000)*t;
+    y[34] += CONSTANT(0.097043558538899996)*t;
+
+    // [4,29]: 13,15,31,33,
+    tf = CONSTANT(-0.065621187395699998)*f[13]+CONSTANT(-0.016943317729299998)*f[15]+CONSTANT(0.140070311613999990)*f[31]+CONSTANT(0.121034582549000000)*f[33];
+    tg = CONSTANT(-0.065621187395699998)*g[13]+CONSTANT(-0.016943317729299998)*g[15]+CONSTANT(0.140070311613999990)*g[31]+CONSTANT(0.121034582549000000)*g[33];
+    y[4] += tf*g[29]+tg*f[29];
+    y[29] += tf*g[4]+tg*f[4];
+    t = f[4]*g[29]+f[29]*g[4];
+    y[13] += CONSTANT(-0.065621187395699998)*t;
+    y[15] += CONSTANT(-0.016943317729299998)*t;
+    y[31] += CONSTANT(0.140070311613999990)*t;
+    y[33] += CONSTANT(0.121034582549000000)*t;
+
+    // [5,5]: 0,6,8,20,22,
+    tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.090111875786499998)*f[6]+CONSTANT(-0.156078347227999990)*f[8]+CONSTANT(-0.161197023870999990)*f[20]+CONSTANT(-0.180223751574000000)*f[22];
+    tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.090111875786499998)*g[6]+CONSTANT(-0.156078347227999990)*g[8]+CONSTANT(-0.161197023870999990)*g[20]+CONSTANT(-0.180223751574000000)*g[22];
+    y[5] += tf*g[5]+tg*f[5];
+    t = f[5]*g[5];
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[6] += CONSTANT(0.090111875786499998)*t;
+    y[8] += CONSTANT(-0.156078347227999990)*t;
+    y[20] += CONSTANT(-0.161197023870999990)*t;
+    y[22] += CONSTANT(-0.180223751574000000)*t;
+
+    // [5,10]: 3,13,15,31,33,
+    tf = CONSTANT(0.184674390919999990)*f[3]+CONSTANT(0.115164716490000000)*f[13]+CONSTANT(-0.148677009678999990)*f[15]+CONSTANT(-0.083004965974099995)*f[31]+CONSTANT(-0.179311220383999990)*f[33];
+    tg = CONSTANT(0.184674390919999990)*g[3]+CONSTANT(0.115164716490000000)*g[13]+CONSTANT(-0.148677009678999990)*g[15]+CONSTANT(-0.083004965974099995)*g[31]+CONSTANT(-0.179311220383999990)*g[33];
+    y[5] += tf*g[10]+tg*f[10];
+    y[10] += tf*g[5]+tg*f[5];
+    t = f[5]*g[10]+f[10]*g[5];
+    y[3] += CONSTANT(0.184674390919999990)*t;
+    y[13] += CONSTANT(0.115164716490000000)*t;
+    y[15] += CONSTANT(-0.148677009678999990)*t;
+    y[31] += CONSTANT(-0.083004965974099995)*t;
+    y[33] += CONSTANT(-0.179311220383999990)*t;
+
+    // [5,11]: 2,12,14,30,32,
+    tf = CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.059470803871800003)*f[12]+CONSTANT(-0.115164716491000000)*f[14]+CONSTANT(-0.169433177294000010)*f[30]+CONSTANT(-0.173617342585000000)*f[32];
+    tg = CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.059470803871800003)*g[12]+CONSTANT(-0.115164716491000000)*g[14]+CONSTANT(-0.169433177294000010)*g[30]+CONSTANT(-0.173617342585000000)*g[32];
+    y[5] += tf*g[11]+tg*f[11];
+    y[11] += tf*g[5]+tg*f[5];
+    t = f[5]*g[11]+f[11]*g[5];
+    y[2] += CONSTANT(0.233596680327000010)*t;
+    y[12] += CONSTANT(0.059470803871800003)*t;
+    y[14] += CONSTANT(-0.115164716491000000)*t;
+    y[30] += CONSTANT(-0.169433177294000010)*t;
+    y[32] += CONSTANT(-0.173617342585000000)*t;
+
+    // [5,14]: 9,1,27,29,
+    tf = CONSTANT(0.148677009677999990)*f[9]+CONSTANT(-0.184674390923000000)*f[1]+CONSTANT(0.179311220382000010)*f[27]+CONSTANT(0.083004965973399999)*f[29];
+    tg = CONSTANT(0.148677009677999990)*g[9]+CONSTANT(-0.184674390923000000)*g[1]+CONSTANT(0.179311220382000010)*g[27]+CONSTANT(0.083004965973399999)*g[29];
+    y[5] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[5]+tg*f[5];
+    t = f[5]*g[14]+f[14]*g[5];
+    y[9] += CONSTANT(0.148677009677999990)*t;
+    y[1] += CONSTANT(-0.184674390923000000)*t;
+    y[27] += CONSTANT(0.179311220382000010)*t;
+    y[29] += CONSTANT(0.083004965973399999)*t;
+
+    // [5,17]: 8,22,24,
+    tf = CONSTANT(0.168583882832999990)*f[8]+CONSTANT(0.132725386548000010)*f[22]+CONSTANT(-0.140463346189000000)*f[24];
+    tg = CONSTANT(0.168583882832999990)*g[8]+CONSTANT(0.132725386548000010)*g[22]+CONSTANT(-0.140463346189000000)*g[24];
+    y[5] += tf*g[17]+tg*f[17];
+    y[17] = tf*g[5]+tg*f[5];
+    t = f[5]*g[17]+f[17]*g[5];
+    y[8] += CONSTANT(0.168583882832999990)*t;
+    y[22] += CONSTANT(0.132725386548000010)*t;
+    y[24] += CONSTANT(-0.140463346189000000)*t;
+
+    // [5,18]: 7,21,23,
+    tf = CONSTANT(0.180223751571000010)*f[7]+CONSTANT(0.090297865407399994)*f[21]+CONSTANT(-0.132725386549000010)*f[23];
+    tg = CONSTANT(0.180223751571000010)*g[7]+CONSTANT(0.090297865407399994)*g[21]+CONSTANT(-0.132725386549000010)*g[23];
+    y[5] += tf*g[18]+tg*f[18];
+    y[18] += tf*g[5]+tg*f[5];
+    t = f[5]*g[18]+f[18]*g[5];
+    y[7] += CONSTANT(0.180223751571000010)*t;
+    y[21] += CONSTANT(0.090297865407399994)*t;
+    y[23] += CONSTANT(-0.132725386549000010)*t;
+
+    // [5,19]: 6,8,20,22,
+    tf = CONSTANT(0.220728115440999990)*f[6]+CONSTANT(0.063718718433900007)*f[8]+CONSTANT(0.044869370061299998)*f[20]+CONSTANT(-0.090297865408399999)*f[22];
+    tg = CONSTANT(0.220728115440999990)*g[6]+CONSTANT(0.063718718433900007)*g[8]+CONSTANT(0.044869370061299998)*g[20]+CONSTANT(-0.090297865408399999)*g[22];
+    y[5] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[5]+tg*f[5];
+    t = f[5]*g[19]+f[19]*g[5];
+    y[6] += CONSTANT(0.220728115440999990)*t;
+    y[8] += CONSTANT(0.063718718433900007)*t;
+    y[20] += CONSTANT(0.044869370061299998)*t;
+    y[22] += CONSTANT(-0.090297865408399999)*t;
+
+    // [5,26]: 15,33,35,
+    tf = CONSTANT(0.155288072035000000)*f[15]+CONSTANT(0.138662534056999990)*f[33]+CONSTANT(-0.132882365179999990)*f[35];
+    tg = CONSTANT(0.155288072035000000)*g[15]+CONSTANT(0.138662534056999990)*g[33]+CONSTANT(-0.132882365179999990)*g[35];
+    y[5] += tf*g[26]+tg*f[26];
+    y[26] += tf*g[5]+tg*f[5];
+    t = f[5]*g[26]+f[26]*g[5];
+    y[15] += CONSTANT(0.155288072035000000)*t;
+    y[33] += CONSTANT(0.138662534056999990)*t;
+    y[35] += CONSTANT(-0.132882365179999990)*t;
+
+    // [5,28]: 15,13,31,33,
+    tf = CONSTANT(0.044827805096399997)*f[15]+CONSTANT(0.173617342584000000)*f[13]+CONSTANT(0.074118242118699995)*f[31]+CONSTANT(-0.114366930522000000)*f[33];
+    tg = CONSTANT(0.044827805096399997)*g[15]+CONSTANT(0.173617342584000000)*g[13]+CONSTANT(0.074118242118699995)*g[31]+CONSTANT(-0.114366930522000000)*g[33];
+    y[5] += tf*g[28]+tg*f[28];
+    y[28] += tf*g[5]+tg*f[5];
+    t = f[5]*g[28]+f[28]*g[5];
+    y[15] += CONSTANT(0.044827805096399997)*t;
+    y[13] += CONSTANT(0.173617342584000000)*t;
+    y[31] += CONSTANT(0.074118242118699995)*t;
+    y[33] += CONSTANT(-0.114366930522000000)*t;
+
+    // [5,29]: 12,30,32,
+    tf = CONSTANT(0.214317900578999990)*f[12]+CONSTANT(0.036165998945399999)*f[30]+CONSTANT(-0.074118242119099995)*f[32];
+    tg = CONSTANT(0.214317900578999990)*g[12]+CONSTANT(0.036165998945399999)*g[30]+CONSTANT(-0.074118242119099995)*g[32];
+    y[5] += tf*g[29]+tg*f[29];
+    y[29] += tf*g[5]+tg*f[5];
+    t = f[5]*g[29]+f[29]*g[5];
+    y[12] += CONSTANT(0.214317900578999990)*t;
+    y[30] += CONSTANT(0.036165998945399999)*t;
+    y[32] += CONSTANT(-0.074118242119099995)*t;
+
+    // [5,32]: 9,27,
+    tf = CONSTANT(-0.044827805096799997)*f[9]+CONSTANT(0.114366930522000000)*f[27];
+    tg = CONSTANT(-0.044827805096799997)*g[9]+CONSTANT(0.114366930522000000)*g[27];
+    y[5] += tf*g[32]+tg*f[32];
+    y[32] += tf*g[5]+tg*f[5];
+    t = f[5]*g[32]+f[32]*g[5];
+    y[9] += CONSTANT(-0.044827805096799997)*t;
+    y[27] += CONSTANT(0.114366930522000000)*t;
+
+    // [5,34]: 9,27,25,
+    tf = CONSTANT(-0.155288072036000010)*f[9]+CONSTANT(-0.138662534059000000)*f[27]+CONSTANT(0.132882365179000010)*f[25];
+    tg = CONSTANT(-0.155288072036000010)*g[9]+CONSTANT(-0.138662534059000000)*g[27]+CONSTANT(0.132882365179000010)*g[25];
+    y[5] += tf*g[34]+tg*f[34];
+    y[34] += tf*g[5]+tg*f[5];
+    t = f[5]*g[34]+f[34]*g[5];
+    y[9] += CONSTANT(-0.155288072036000010)*t;
+    y[27] += CONSTANT(-0.138662534059000000)*t;
+    y[25] += CONSTANT(0.132882365179000010)*t;
+
+    // [6,6]: 0,6,20,
+    tf = CONSTANT(0.282094797560000000)*f[0]+CONSTANT(0.241795553185999990)*f[20];
+    tg = CONSTANT(0.282094797560000000)*g[0]+CONSTANT(0.241795553185999990)*g[20];
+    y[6] += tf*g[6]+tg*f[6];
+    t = f[6]*g[6];
+    y[0] += CONSTANT(0.282094797560000000)*t;
+    y[6] += CONSTANT(0.180223764527000010)*t;
+    y[20] += CONSTANT(0.241795553185999990)*t;
+
+    // [7,7]: 6,0,8,20,22,
+    tf = CONSTANT(0.090111875786499998)*f[6]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.156078347227999990)*f[8]+CONSTANT(-0.161197023870999990)*f[20]+CONSTANT(0.180223751574000000)*f[22];
+    tg = CONSTANT(0.090111875786499998)*g[6]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.156078347227999990)*g[8]+CONSTANT(-0.161197023870999990)*g[20]+CONSTANT(0.180223751574000000)*g[22];
+    y[7] += tf*g[7]+tg*f[7];
+    t = f[7]*g[7];
+    y[6] += CONSTANT(0.090111875786499998)*t;
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[8] += CONSTANT(0.156078347227999990)*t;
+    y[20] += CONSTANT(-0.161197023870999990)*t;
+    y[22] += CONSTANT(0.180223751574000000)*t;
+
+    // [7,10]: 9,1,11,27,29,
+    tf = CONSTANT(0.148677009678999990)*f[9]+CONSTANT(0.184674390919999990)*f[1]+CONSTANT(0.115164716490000000)*f[11]+CONSTANT(0.179311220383999990)*f[27]+CONSTANT(-0.083004965974099995)*f[29];
+    tg = CONSTANT(0.148677009678999990)*g[9]+CONSTANT(0.184674390919999990)*g[1]+CONSTANT(0.115164716490000000)*g[11]+CONSTANT(0.179311220383999990)*g[27]+CONSTANT(-0.083004965974099995)*g[29];
+    y[7] += tf*g[10]+tg*f[10];
+    y[10] += tf*g[7]+tg*f[7];
+    t = f[7]*g[10]+f[10]*g[7];
+    y[9] += CONSTANT(0.148677009678999990)*t;
+    y[1] += CONSTANT(0.184674390919999990)*t;
+    y[11] += CONSTANT(0.115164716490000000)*t;
+    y[27] += CONSTANT(0.179311220383999990)*t;
+    y[29] += CONSTANT(-0.083004965974099995)*t;
+
+    // [7,13]: 12,2,14,30,32,
+    tf = CONSTANT(0.059470803871800003)*f[12]+CONSTANT(0.233596680327000010)*f[2]+CONSTANT(0.115164716491000000)*f[14]+CONSTANT(-0.169433177294000010)*f[30]+CONSTANT(0.173617342585000000)*f[32];
+    tg = CONSTANT(0.059470803871800003)*g[12]+CONSTANT(0.233596680327000010)*g[2]+CONSTANT(0.115164716491000000)*g[14]+CONSTANT(-0.169433177294000010)*g[30]+CONSTANT(0.173617342585000000)*g[32];
+    y[7] += tf*g[13]+tg*f[13];
+    y[13] += tf*g[7]+tg*f[7];
+    t = f[7]*g[13]+f[13]*g[7];
+    y[12] += CONSTANT(0.059470803871800003)*t;
+    y[2] += CONSTANT(0.233596680327000010)*t;
+    y[14] += CONSTANT(0.115164716491000000)*t;
+    y[30] += CONSTANT(-0.169433177294000010)*t;
+    y[32] += CONSTANT(0.173617342585000000)*t;
+
+    // [7,14]: 3,15,31,33,
+    tf = CONSTANT(0.184674390923000000)*f[3]+CONSTANT(0.148677009677999990)*f[15]+CONSTANT(-0.083004965973399999)*f[31]+CONSTANT(0.179311220382000010)*f[33];
+    tg = CONSTANT(0.184674390923000000)*g[3]+CONSTANT(0.148677009677999990)*g[15]+CONSTANT(-0.083004965973399999)*g[31]+CONSTANT(0.179311220382000010)*g[33];
+    y[7] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[7]+tg*f[7];
+    t = f[7]*g[14]+f[14]*g[7];
+    y[3] += CONSTANT(0.184674390923000000)*t;
+    y[15] += CONSTANT(0.148677009677999990)*t;
+    y[31] += CONSTANT(-0.083004965973399999)*t;
+    y[33] += CONSTANT(0.179311220382000010)*t;
+
+    // [7,17]: 16,4,18,
+    tf = CONSTANT(0.140463346187999990)*f[16]+CONSTANT(0.168583882835000000)*f[4]+CONSTANT(0.132725386549000010)*f[18];
+    tg = CONSTANT(0.140463346187999990)*g[16]+CONSTANT(0.168583882835000000)*g[4]+CONSTANT(0.132725386549000010)*g[18];
+    y[7] += tf*g[17]+tg*f[17];
+    y[17] += tf*g[7]+tg*f[7];
+    t = f[7]*g[17]+f[17]*g[7];
+    y[16] += CONSTANT(0.140463346187999990)*t;
+    y[4] += CONSTANT(0.168583882835000000)*t;
+    y[18] += CONSTANT(0.132725386549000010)*t;
+
+    // [7,21]: 8,20,6,22,
+    tf = CONSTANT(-0.063718718433900007)*f[8]+CONSTANT(0.044869370061299998)*f[20]+CONSTANT(0.220728115440999990)*f[6]+CONSTANT(0.090297865408399999)*f[22];
+    tg = CONSTANT(-0.063718718433900007)*g[8]+CONSTANT(0.044869370061299998)*g[20]+CONSTANT(0.220728115440999990)*g[6]+CONSTANT(0.090297865408399999)*g[22];
+    y[7] += tf*g[21]+tg*f[21];
+    y[21] += tf*g[7]+tg*f[7];
+    t = f[7]*g[21]+f[21]*g[7];
+    y[8] += CONSTANT(-0.063718718433900007)*t;
+    y[20] += CONSTANT(0.044869370061299998)*t;
+    y[6] += CONSTANT(0.220728115440999990)*t;
+    y[22] += CONSTANT(0.090297865408399999)*t;
+
+    // [7,23]: 8,22,24,
+    tf = CONSTANT(0.168583882832999990)*f[8]+CONSTANT(0.132725386548000010)*f[22]+CONSTANT(0.140463346189000000)*f[24];
+    tg = CONSTANT(0.168583882832999990)*g[8]+CONSTANT(0.132725386548000010)*g[22]+CONSTANT(0.140463346189000000)*g[24];
+    y[7] += tf*g[23]+tg*f[23];
+    y[23] += tf*g[7]+tg*f[7];
+    t = f[7]*g[23]+f[23]*g[7];
+    y[8] += CONSTANT(0.168583882832999990)*t;
+    y[22] += CONSTANT(0.132725386548000010)*t;
+    y[24] += CONSTANT(0.140463346189000000)*t;
+
+    // [7,26]: 9,25,27,
+    tf = CONSTANT(0.155288072035000000)*f[9]+CONSTANT(0.132882365179999990)*f[25]+CONSTANT(0.138662534056999990)*f[27];
+    tg = CONSTANT(0.155288072035000000)*g[9]+CONSTANT(0.132882365179999990)*g[25]+CONSTANT(0.138662534056999990)*g[27];
+    y[7] += tf*g[26]+tg*f[26];
+    y[26] += tf*g[7]+tg*f[7];
+    t = f[7]*g[26]+f[26]*g[7];
+    y[9] += CONSTANT(0.155288072035000000)*t;
+    y[25] += CONSTANT(0.132882365179999990)*t;
+    y[27] += CONSTANT(0.138662534056999990)*t;
+
+    // [7,28]: 27,11,9,29,
+    tf = CONSTANT(0.114366930522000000)*f[27]+CONSTANT(0.173617342584000000)*f[11]+CONSTANT(-0.044827805096399997)*f[9]+CONSTANT(0.074118242118699995)*f[29];
+    tg = CONSTANT(0.114366930522000000)*g[27]+CONSTANT(0.173617342584000000)*g[11]+CONSTANT(-0.044827805096399997)*g[9]+CONSTANT(0.074118242118699995)*g[29];
+    y[7] += tf*g[28]+tg*f[28];
+    y[28] += tf*g[7]+tg*f[7];
+    t = f[7]*g[28]+f[28]*g[7];
+    y[27] += CONSTANT(0.114366930522000000)*t;
+    y[11] += CONSTANT(0.173617342584000000)*t;
+    y[9] += CONSTANT(-0.044827805096399997)*t;
+    y[29] += CONSTANT(0.074118242118699995)*t;
+
+    // [7,31]: 30,12,32,
+    tf = CONSTANT(0.036165998945399999)*f[30]+CONSTANT(0.214317900578999990)*f[12]+CONSTANT(0.074118242119099995)*f[32];
+    tg = CONSTANT(0.036165998945399999)*g[30]+CONSTANT(0.214317900578999990)*g[12]+CONSTANT(0.074118242119099995)*g[32];
+    y[7] += tf*g[31]+tg*f[31];
+    y[31] += tf*g[7]+tg*f[7];
+    t = f[7]*g[31]+f[31]*g[7];
+    y[30] += CONSTANT(0.036165998945399999)*t;
+    y[12] += CONSTANT(0.214317900578999990)*t;
+    y[32] += CONSTANT(0.074118242119099995)*t;
+
+    // [7,32]: 15,33,
+    tf = CONSTANT(-0.044827805096799997)*f[15]+CONSTANT(0.114366930522000000)*f[33];
+    tg = CONSTANT(-0.044827805096799997)*g[15]+CONSTANT(0.114366930522000000)*g[33];
+    y[7] += tf*g[32]+tg*f[32];
+    y[32] += tf*g[7]+tg*f[7];
+    t = f[7]*g[32]+f[32]*g[7];
+    y[15] += CONSTANT(-0.044827805096799997)*t;
+    y[33] += CONSTANT(0.114366930522000000)*t;
+
+    // [7,34]: 15,33,35,
+    tf = CONSTANT(0.155288072036000010)*f[15]+CONSTANT(0.138662534059000000)*f[33]+CONSTANT(0.132882365179000010)*f[35];
+    tg = CONSTANT(0.155288072036000010)*g[15]+CONSTANT(0.138662534059000000)*g[33]+CONSTANT(0.132882365179000010)*g[35];
+    y[7] += tf*g[34]+tg*f[34];
+    y[34] += tf*g[7]+tg*f[7];
+    t = f[7]*g[34]+f[34]*g[7];
+    y[15] += CONSTANT(0.155288072036000010)*t;
+    y[33] += CONSTANT(0.138662534059000000)*t;
+    y[35] += CONSTANT(0.132882365179000010)*t;
+
+    // [8,8]: 0,6,20,24,
+    tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.180223751576000010)*f[6]+CONSTANT(0.040299255967500003)*f[20]+CONSTANT(0.238413613505999990)*f[24];
+    tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.180223751576000010)*g[6]+CONSTANT(0.040299255967500003)*g[20]+CONSTANT(0.238413613505999990)*g[24];
+    y[8] += tf*g[8]+tg*f[8];
+    t = f[8]*g[8];
+    y[0] += CONSTANT(0.282094791770000020)*t;
+    y[6] += CONSTANT(-0.180223751576000010)*t;
+    y[20] += CONSTANT(0.040299255967500003)*t;
+    y[24] += CONSTANT(0.238413613505999990)*t;
+
+    // [8,9]: 1,11,25,29,
+    tf = CONSTANT(0.226179013155000000)*f[1]+CONSTANT(-0.094031597259499999)*f[11]+CONSTANT(0.245532000541000000)*f[25]+CONSTANT(0.016943317729199998)*f[29];
+    tg = CONSTANT(0.226179013155000000)*g[1]+CONSTANT(-0.094031597259499999)*g[11]+CONSTANT(0.245532000541000000)*g[25]+CONSTANT(0.016943317729199998)*g[29];
+    y[8] += tf*g[9]+tg*f[9];
+    y[9] += tf*g[8]+tg*f[8];
+    t = f[8]*g[9]+f[9]*g[8];
+    y[1] += CONSTANT(0.226179013155000000)*t;
+    y[11] += CONSTANT(-0.094031597259499999)*t;
+    y[25] += CONSTANT(0.245532000541000000)*t;
+    y[29] += CONSTANT(0.016943317729199998)*t;
+
+    // [8,14]: 2,12,30,34,
+    tf = CONSTANT(0.184674390919999990)*f[2]+CONSTANT(-0.188063194517999990)*f[12]+CONSTANT(0.053579475144400000)*f[30]+CONSTANT(0.190188269816000010)*f[34];
+    tg = CONSTANT(0.184674390919999990)*g[2]+CONSTANT(-0.188063194517999990)*g[12]+CONSTANT(0.053579475144400000)*g[30]+CONSTANT(0.190188269816000010)*g[34];
+    y[8] += tf*g[14]+tg*f[14];
+    y[14] += tf*g[8]+tg*f[8];
+    t = f[8]*g[14]+f[14]*g[8];
+    y[2] += CONSTANT(0.184674390919999990)*t;
+    y[12] += CONSTANT(-0.188063194517999990)*t;
+    y[30] += CONSTANT(0.053579475144400000)*t;
+    y[34] += CONSTANT(0.190188269816000010)*t;
+
+    // [8,15]: 13,3,31,35,
+    tf = CONSTANT(-0.094031597259499999)*f[13]+CONSTANT(0.226179013155000000)*f[3]+CONSTANT(0.016943317729199998)*f[31]+CONSTANT(0.245532000541000000)*f[35];
+    tg = CONSTANT(-0.094031597259499999)*g[13]+CONSTANT(0.226179013155000000)*g[3]+CONSTANT(0.016943317729199998)*g[31]+CONSTANT(0.245532000541000000)*g[35];
+    y[8] += tf*g[15]+tg*f[15];
+    y[15] += tf*g[8]+tg*f[8];
+    t = f[8]*g[15]+f[15]*g[8];
+    y[13] += CONSTANT(-0.094031597259499999)*t;
+    y[3] += CONSTANT(0.226179013155000000)*t;
+    y[31] += CONSTANT(0.016943317729199998)*t;
+    y[35] += CONSTANT(0.245532000541000000)*t;
+
+    // [8,22]: 6,20,24,
+    tf = CONSTANT(0.156078347226000000)*f[6]+CONSTANT(-0.190364615029000010)*f[20]+CONSTANT(-0.075080816691500005)*f[24];
+    tg = CONSTANT(0.156078347226000000)*g[6]+CONSTANT(-0.190364615029000010)*g[20]+CONSTANT(-0.075080816691500005)*g[24];
+    y[8] += tf*g[22]+tg*f[22];
+    y[22] += tf*g[8]+tg*f[8];
+    t = f[8]*g[22]+f[22]*g[8];
+    y[6] += CONSTANT(0.156078347226000000)*t;
+    y[20] += CONSTANT(-0.190364615029000010)*t;
+    y[24] += CONSTANT(-0.075080816691500005)*t;
+
+    // [8,26]: 10,28,
+    tf = CONSTANT(0.190188269806999990)*f[10]+CONSTANT(-0.097043558542400002)*f[28];
+    tg = CONSTANT(0.190188269806999990)*g[10]+CONSTANT(-0.097043558542400002)*g[28];
+    y[8] += tf*g[26]+tg*f[26];
+    y[26] += tf*g[8]+tg*f[8];
+    t = f[8]*g[26]+f[26]*g[8];
+    y[10] += CONSTANT(0.190188269806999990)*t;
+    y[28] += CONSTANT(-0.097043558542400002)*t;
+
+    // [8,27]: 25,11,29,
+    tf = CONSTANT(-0.062641347680800000)*f[25]+CONSTANT(0.141757966609000000)*f[11]+CONSTANT(-0.121034582550000010)*f[29];
+    tg = CONSTANT(-0.062641347680800000)*g[25]+CONSTANT(0.141757966609000000)*g[11]+CONSTANT(-0.121034582550000010)*g[29];
+    y[8] += tf*g[27]+tg*f[27];
+    y[27] += tf*g[8]+tg*f[8];
+    t = f[8]*g[27]+f[27]*g[8];
+    y[25] += CONSTANT(-0.062641347680800000)*t;
+    y[11] += CONSTANT(0.141757966609000000)*t;
+    y[29] += CONSTANT(-0.121034582550000010)*t;
+
+    // [8,32]: 30,12,34,
+    tf = CONSTANT(-0.191372478254000000)*f[30]+CONSTANT(0.141757966609000000)*f[12]+CONSTANT(-0.097043558538899996)*f[34];
+    tg = CONSTANT(-0.191372478254000000)*g[30]+CONSTANT(0.141757966609000000)*g[12]+CONSTANT(-0.097043558538899996)*g[34];
+    y[8] += tf*g[32]+tg*f[32];
+    y[32] += tf*g[8]+tg*f[8];
+    t = f[8]*g[32]+f[32]*g[8];
+    y[30] += CONSTANT(-0.191372478254000000)*t;
+    y[12] += CONSTANT(0.141757966609000000)*t;
+    y[34] += CONSTANT(-0.097043558538899996)*t;
+
+    // [8,33]: 13,31,35,
+    tf = CONSTANT(0.141757966609000000)*f[13]+CONSTANT(-0.121034582550000010)*f[31]+CONSTANT(-0.062641347680800000)*f[35];
+    tg = CONSTANT(0.141757966609000000)*g[13]+CONSTANT(-0.121034582550000010)*g[31]+CONSTANT(-0.062641347680800000)*g[35];
+    y[8] += tf*g[33]+tg*f[33];
+    y[33] += tf*g[8]+tg*f[8];
+    t = f[8]*g[33]+f[33]*g[8];
+    y[13] += CONSTANT(0.141757966609000000)*t;
+    y[31] += CONSTANT(-0.121034582550000010)*t;
+    y[35] += CONSTANT(-0.062641347680800000)*t;
+
+    // [9,9]: 6,0,20,
+    tf = CONSTANT(-0.210261043508000010)*f[6]+CONSTANT(0.282094791766999970)*f[0]+CONSTANT(0.076934943209800002)*f[20];
+    tg = CONSTANT(-0.210261043508000010)*g[6]+CONSTANT(0.282094791766999970)*g[0]+CONSTANT(0.076934943209800002)*g[20];
+    y[9] += tf*g[9]+tg*f[9];
+    t = f[9]*g[9];
+    y[6] += CONSTANT(-0.210261043508000010)*t;
+    y[0] += CONSTANT(0.282094791766999970)*t;
+    y[20] += CONSTANT(0.076934943209800002)*t;
+
+    // [9,17]: 2,12,30,
+    tf = CONSTANT(0.162867503964999990)*f[2]+CONSTANT(-0.203550726872999990)*f[12]+CONSTANT(0.098140130728100003)*f[30];
+    tg = CONSTANT(0.162867503964999990)*g[2]+CONSTANT(-0.203550726872999990)*g[12]+CONSTANT(0.098140130728100003)*g[30];
+    y[9] += tf*g[17]+tg*f[17];
+    y[17] += tf*g[9]+tg*f[9];
+    t = f[9]*g[17]+f[17]*g[9];
+    y[2] += CONSTANT(0.162867503964999990)*t;
+    y[12] += CONSTANT(-0.203550726872999990)*t;
+    y[30] += CONSTANT(0.098140130728100003)*t;
+
+    // [9,18]: 3,13,31,35,
+    tf = CONSTANT(-0.043528171377799997)*f[3]+CONSTANT(0.133255230519000010)*f[13]+CONSTANT(-0.101584686310000010)*f[31]+CONSTANT(0.098140130731999994)*f[35];
+    tg = CONSTANT(-0.043528171377799997)*g[3]+CONSTANT(0.133255230519000010)*g[13]+CONSTANT(-0.101584686310000010)*g[31]+CONSTANT(0.098140130731999994)*g[35];
+    y[9] += tf*g[18]+tg*f[18];
+    y[18] += tf*g[9]+tg*f[9];
+    t = f[9]*g[18]+f[18]*g[9];
+    y[3] += CONSTANT(-0.043528171377799997)*t;
+    y[13] += CONSTANT(0.133255230519000010)*t;
+    y[31] += CONSTANT(-0.101584686310000010)*t;
+    y[35] += CONSTANT(0.098140130731999994)*t;
+
+    // [9,19]: 14,32,34,
+    tf = CONSTANT(-0.099322584600699995)*f[14]+CONSTANT(0.126698363970000010)*f[32]+CONSTANT(0.131668802180999990)*f[34];
+    tg = CONSTANT(-0.099322584600699995)*g[14]+CONSTANT(0.126698363970000010)*g[32]+CONSTANT(0.131668802180999990)*g[34];
+    y[9] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[9]+tg*f[9];
+    t = f[9]*g[19]+f[19]*g[9];
+    y[14] += CONSTANT(-0.099322584600699995)*t;
+    y[32] += CONSTANT(0.126698363970000010)*t;
+    y[34] += CONSTANT(0.131668802180999990)*t;
+
+    // [9,22]: 1,11,25,29,
+    tf = CONSTANT(-0.043528171378199997)*f[1]+CONSTANT(0.133255230518000010)*f[11]+CONSTANT(-0.098140130732499997)*f[25]+CONSTANT(-0.101584686311000000)*f[29];
+    tg = CONSTANT(-0.043528171378199997)*g[1]+CONSTANT(0.133255230518000010)*g[11]+CONSTANT(-0.098140130732499997)*g[25]+CONSTANT(-0.101584686311000000)*g[29];
+    y[9] += tf*g[22]+tg*f[22];
+    y[22] += tf*g[9]+tg*f[9];
+    t = f[9]*g[22]+f[22]*g[9];
+    y[1] += CONSTANT(-0.043528171378199997)*t;
+    y[11] += CONSTANT(0.133255230518000010)*t;
+    y[25] += CONSTANT(-0.098140130732499997)*t;
+    y[29] += CONSTANT(-0.101584686311000000)*t;
+
+    // [9,27]: 6,20,
+    tf = CONSTANT(0.126792179874999990)*f[6]+CONSTANT(-0.196280261464999990)*f[20];
+    tg = CONSTANT(0.126792179874999990)*g[6]+CONSTANT(-0.196280261464999990)*g[20];
+    y[9] += tf*g[27]+tg*f[27];
+    y[27] += tf*g[9]+tg*f[9];
+    t = f[9]*g[27]+f[27]*g[9];
+    y[6] += CONSTANT(0.126792179874999990)*t;
+    y[20] += CONSTANT(-0.196280261464999990)*t;
+
+    // [10,10]: 0,20,24,
+    tf = CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.179514867494000000)*f[20]+CONSTANT(-0.151717754049000010)*f[24];
+    tg = CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.179514867494000000)*g[20]+CONSTANT(-0.151717754049000010)*g[24];
+    y[10] += tf*g[10]+tg*f[10];
+    t = f[10]*g[10];
+    y[0] += CONSTANT(0.282094791771999980)*t;
+    y[20] += CONSTANT(-0.179514867494000000)*t;
+    y[24] += CONSTANT(-0.151717754049000010)*t;
+
+    // [10,16]: 14,32,
+    tf = CONSTANT(0.151717754044999990)*f[14]+CONSTANT(-0.077413979111300005)*f[32];
+    tg = CONSTANT(0.151717754044999990)*g[14]+CONSTANT(-0.077413979111300005)*g[32];
+    y[10] += tf*g[16]+tg*f[16];
+    y[16] += tf*g[10]+tg*f[10];
+    t = f[10]*g[16]+f[16]*g[10];
+    y[14] += CONSTANT(0.151717754044999990)*t;
+    y[32] += CONSTANT(-0.077413979111300005)*t;
+
+    // [10,17]: 13,3,31,35,
+    tf = CONSTANT(0.067850242288900006)*f[13]+CONSTANT(0.199471140200000010)*f[3]+CONSTANT(-0.113793659091000000)*f[31]+CONSTANT(-0.149911525925999990)*f[35];
+    tg = CONSTANT(0.067850242288900006)*g[13]+CONSTANT(0.199471140200000010)*g[3]+CONSTANT(-0.113793659091000000)*g[31]+CONSTANT(-0.149911525925999990)*g[35];
+    y[10] += tf*g[17]+tg*f[17];
+    y[17] += tf*g[10]+tg*f[10];
+    t = f[10]*g[17]+f[17]*g[10];
+    y[13] += CONSTANT(0.067850242288900006)*t;
+    y[3] += CONSTANT(0.199471140200000010)*t;
+    y[31] += CONSTANT(-0.113793659091000000)*t;
+    y[35] += CONSTANT(-0.149911525925999990)*t;
+
+    // [10,18]: 12,2,30,34,
+    tf = CONSTANT(-0.044418410173299998)*f[12]+CONSTANT(0.213243618621000000)*f[2]+CONSTANT(-0.171327458205000000)*f[30]+CONSTANT(-0.101358691177000000)*f[34];
+    tg = CONSTANT(-0.044418410173299998)*g[12]+CONSTANT(0.213243618621000000)*g[2]+CONSTANT(-0.171327458205000000)*g[30]+CONSTANT(-0.101358691177000000)*g[34];
+    y[10] += tf*g[18]+tg*f[18];
+    y[18] += tf*g[10]+tg*f[10];
+    t = f[10]*g[18]+f[18]*g[10];
+    y[12] += CONSTANT(-0.044418410173299998)*t;
+    y[2] += CONSTANT(0.213243618621000000)*t;
+    y[30] += CONSTANT(-0.171327458205000000)*t;
+    y[34] += CONSTANT(-0.101358691177000000)*t;
+
+    // [10,19]: 3,15,13,31,33,
+    tf = CONSTANT(-0.075393004386799994)*f[3]+CONSTANT(0.099322584599600000)*f[15]+CONSTANT(0.102579924281000000)*f[13]+CONSTANT(0.097749909976500002)*f[31]+CONSTANT(-0.025339672794100002)*f[33];
+    tg = CONSTANT(-0.075393004386799994)*g[3]+CONSTANT(0.099322584599600000)*g[15]+CONSTANT(0.102579924281000000)*g[13]+CONSTANT(0.097749909976500002)*g[31]+CONSTANT(-0.025339672794100002)*g[33];
+    y[10] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[10]+tg*f[10];
+    t = f[10]*g[19]+f[19]*g[10];
+    y[3] += CONSTANT(-0.075393004386799994)*t;
+    y[15] += CONSTANT(0.099322584599600000)*t;
+    y[13] += CONSTANT(0.102579924281000000)*t;
+    y[31] += CONSTANT(0.097749909976500002)*t;
+    y[33] += CONSTANT(-0.025339672794100002)*t;
+
+    // [10,21]: 11,1,9,27,29,
+    tf = CONSTANT(0.102579924281000000)*f[11]+CONSTANT(-0.075393004386799994)*f[1]+CONSTANT(-0.099322584599600000)*f[9]+CONSTANT(0.025339672794100002)*f[27]+CONSTANT(0.097749909976500002)*f[29];
+    tg = CONSTANT(0.102579924281000000)*g[11]+CONSTANT(-0.075393004386799994)*g[1]+CONSTANT(-0.099322584599600000)*g[9]+CONSTANT(0.025339672794100002)*g[27]+CONSTANT(0.097749909976500002)*g[29];
+    y[10] += tf*g[21]+tg*f[21];
+    y[21] += tf*g[10]+tg*f[10];
+    t = f[10]*g[21]+f[21]*g[10];
+    y[11] += CONSTANT(0.102579924281000000)*t;
+    y[1] += CONSTANT(-0.075393004386799994)*t;
+    y[9] += CONSTANT(-0.099322584599600000)*t;
+    y[27] += CONSTANT(0.025339672794100002)*t;
+    y[29] += CONSTANT(0.097749909976500002)*t;
+
+    // [10,23]: 11,1,25,29,
+    tf = CONSTANT(-0.067850242288900006)*f[11]+CONSTANT(-0.199471140200000010)*f[1]+CONSTANT(0.149911525925999990)*f[25]+CONSTANT(0.113793659091000000)*f[29];
+    tg = CONSTANT(-0.067850242288900006)*g[11]+CONSTANT(-0.199471140200000010)*g[1]+CONSTANT(0.149911525925999990)*g[25]+CONSTANT(0.113793659091000000)*g[29];
+    y[10] += tf*g[23]+tg*f[23];
+    y[23] += tf*g[10]+tg*f[10];
+    t = f[10]*g[23]+f[23]*g[10];
+    y[11] += CONSTANT(-0.067850242288900006)*t;
+    y[1] += CONSTANT(-0.199471140200000010)*t;
+    y[25] += CONSTANT(0.149911525925999990)*t;
+    y[29] += CONSTANT(0.113793659091000000)*t;
+
+    // [10,28]: 6,20,24,
+    tf = CONSTANT(0.190188269814000000)*f[6]+CONSTANT(-0.065426753820500005)*f[20]+CONSTANT(0.077413979109600004)*f[24];
+    tg = CONSTANT(0.190188269814000000)*g[6]+CONSTANT(-0.065426753820500005)*g[20]+CONSTANT(0.077413979109600004)*g[24];
+    y[10] += tf*g[28]+tg*f[28];
+    y[28] += tf*g[10]+tg*f[10];
+    t = f[10]*g[28]+f[28]*g[10];
+    y[6] += CONSTANT(0.190188269814000000)*t;
+    y[20] += CONSTANT(-0.065426753820500005)*t;
+    y[24] += CONSTANT(0.077413979109600004)*t;
+
+    // [11,11]: 0,6,8,20,22,
+    tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.126156626101000010)*f[6]+CONSTANT(-0.145673124078999990)*f[8]+CONSTANT(0.025644981070299999)*f[20]+CONSTANT(-0.114687841910000000)*f[22];
+    tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.126156626101000010)*g[6]+CONSTANT(-0.145673124078999990)*g[8]+CONSTANT(0.025644981070299999)*g[20]+CONSTANT(-0.114687841910000000)*g[22];
+    y[11] += tf*g[11]+tg*f[11];
+    t = f[11]*g[11];
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[6] += CONSTANT(0.126156626101000010)*t;
+    y[8] += CONSTANT(-0.145673124078999990)*t;
+    y[20] += CONSTANT(0.025644981070299999)*t;
+    y[22] += CONSTANT(-0.114687841910000000)*t;
+
+    // [11,16]: 15,33,35,
+    tf = CONSTANT(-0.117520066953000000)*f[15]+CONSTANT(0.119929220739999990)*f[33]+CONSTANT(0.134084945035999990)*f[35];
+    tg = CONSTANT(-0.117520066953000000)*g[15]+CONSTANT(0.119929220739999990)*g[33]+CONSTANT(0.134084945035999990)*g[35];
+    y[11] += tf*g[16]+tg*f[16];
+    y[16] += tf*g[11]+tg*f[11];
+    t = f[11]*g[16]+f[16]*g[11];
+    y[15] += CONSTANT(-0.117520066953000000)*t;
+    y[33] += CONSTANT(0.119929220739999990)*t;
+    y[35] += CONSTANT(0.134084945035999990)*t;
+
+    // [11,18]: 3,13,15,31,33,
+    tf = CONSTANT(0.168583882834000000)*f[3]+CONSTANT(0.114687841909000000)*f[13]+CONSTANT(-0.133255230519000010)*f[15]+CONSTANT(0.075189952564900006)*f[31]+CONSTANT(-0.101990215611000000)*f[33];
+    tg = CONSTANT(0.168583882834000000)*g[3]+CONSTANT(0.114687841909000000)*g[13]+CONSTANT(-0.133255230519000010)*g[15]+CONSTANT(0.075189952564900006)*g[31]+CONSTANT(-0.101990215611000000)*g[33];
+    y[11] += tf*g[18]+tg*f[18];
+    y[18] += tf*g[11]+tg*f[11];
+    t = f[11]*g[18]+f[18]*g[11];
+    y[3] += CONSTANT(0.168583882834000000)*t;
+    y[13] += CONSTANT(0.114687841909000000)*t;
+    y[15] += CONSTANT(-0.133255230519000010)*t;
+    y[31] += CONSTANT(0.075189952564900006)*t;
+    y[33] += CONSTANT(-0.101990215611000000)*t;
+
+    // [11,19]: 2,14,12,30,32,
+    tf = CONSTANT(0.238413613504000000)*f[2]+CONSTANT(-0.102579924282000000)*f[14]+CONSTANT(0.099322584599300004)*f[12]+CONSTANT(0.009577496073830001)*f[30]+CONSTANT(-0.104682806112000000)*f[32];
+    tg = CONSTANT(0.238413613504000000)*g[2]+CONSTANT(-0.102579924282000000)*g[14]+CONSTANT(0.099322584599300004)*g[12]+CONSTANT(0.009577496073830001)*g[30]+CONSTANT(-0.104682806112000000)*g[32];
+    y[11] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[11]+tg*f[11];
+    t = f[11]*g[19]+f[19]*g[11];
+    y[2] += CONSTANT(0.238413613504000000)*t;
+    y[14] += CONSTANT(-0.102579924282000000)*t;
+    y[12] += CONSTANT(0.099322584599300004)*t;
+    y[30] += CONSTANT(0.009577496073830001)*t;
+    y[32] += CONSTANT(-0.104682806112000000)*t;
+
+    // [11,24]: 9,25,27,
+    tf = CONSTANT(0.117520066950999990)*f[9]+CONSTANT(-0.134084945037000000)*f[25]+CONSTANT(-0.119929220742000010)*f[27];
+    tg = CONSTANT(0.117520066950999990)*g[9]+CONSTANT(-0.134084945037000000)*g[25]+CONSTANT(-0.119929220742000010)*g[27];
+    y[11] += tf*g[24]+tg*f[24];
+    y[24] += tf*g[11]+tg*f[11];
+    t = f[11]*g[24]+f[24]*g[11];
+    y[9] += CONSTANT(0.117520066950999990)*t;
+    y[25] += CONSTANT(-0.134084945037000000)*t;
+    y[27] += CONSTANT(-0.119929220742000010)*t;
+
+    // [11,29]: 6,20,22,8,
+    tf = CONSTANT(0.227318461243000010)*f[6]+CONSTANT(0.086019920779800002)*f[20]+CONSTANT(-0.075189952565200002)*f[22]+CONSTANT(0.065621187395299999)*f[8];
+    tg = CONSTANT(0.227318461243000010)*g[6]+CONSTANT(0.086019920779800002)*g[20]+CONSTANT(-0.075189952565200002)*g[22]+CONSTANT(0.065621187395299999)*g[8];
+    y[11] += tf*g[29]+tg*f[29];
+    y[29] += tf*g[11]+tg*f[11];
+    t = f[11]*g[29]+f[29]*g[11];
+    y[6] += CONSTANT(0.227318461243000010)*t;
+    y[20] += CONSTANT(0.086019920779800002)*t;
+    y[22] += CONSTANT(-0.075189952565200002)*t;
+    y[8] += CONSTANT(0.065621187395299999)*t;
+
+    // [12,12]: 0,6,20,
+    tf = CONSTANT(0.282094799871999980)*f[0]+CONSTANT(0.168208852954000010)*f[6]+CONSTANT(0.153869910786000010)*f[20];
+    tg = CONSTANT(0.282094799871999980)*g[0]+CONSTANT(0.168208852954000010)*g[6]+CONSTANT(0.153869910786000010)*g[20];
+    y[12] += tf*g[12]+tg*f[12];
+    t = f[12]*g[12];
+    y[0] += CONSTANT(0.282094799871999980)*t;
+    y[6] += CONSTANT(0.168208852954000010)*t;
+    y[20] += CONSTANT(0.153869910786000010)*t;
+
+    // [12,30]: 20,6,
+    tf = CONSTANT(0.148373961712999990)*f[20]+CONSTANT(0.239614719999000000)*f[6];
+    tg = CONSTANT(0.148373961712999990)*g[20]+CONSTANT(0.239614719999000000)*g[6];
+    y[12] += tf*g[30]+tg*f[30];
+    y[30] += tf*g[12]+tg*f[12];
+    t = f[12]*g[30]+f[30]*g[12];
+    y[20] += CONSTANT(0.148373961712999990)*t;
+    y[6] += CONSTANT(0.239614719999000000)*t;
+
+    // [13,13]: 0,8,6,20,22,
+    tf = CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.145673124078999990)*f[8]+CONSTANT(0.126156626101000010)*f[6]+CONSTANT(0.025644981070299999)*f[20]+CONSTANT(0.114687841910000000)*f[22];
+    tg = CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.145673124078999990)*g[8]+CONSTANT(0.126156626101000010)*g[6]+CONSTANT(0.025644981070299999)*g[20]+CONSTANT(0.114687841910000000)*g[22];
+    y[13] += tf*g[13]+tg*f[13];
+    t = f[13]*g[13];
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[8] += CONSTANT(0.145673124078999990)*t;
+    y[6] += CONSTANT(0.126156626101000010)*t;
+    y[20] += CONSTANT(0.025644981070299999)*t;
+    y[22] += CONSTANT(0.114687841910000000)*t;
+
+    // [13,16]: 9,25,27,
+    tf = CONSTANT(-0.117520066953000000)*f[9]+CONSTANT(-0.134084945035999990)*f[25]+CONSTANT(0.119929220739999990)*f[27];
+    tg = CONSTANT(-0.117520066953000000)*g[9]+CONSTANT(-0.134084945035999990)*g[25]+CONSTANT(0.119929220739999990)*g[27];
+    y[13] += tf*g[16]+tg*f[16];
+    y[16] += tf*g[13]+tg*f[13];
+    t = f[13]*g[16]+f[16]*g[13];
+    y[9] += CONSTANT(-0.117520066953000000)*t;
+    y[25] += CONSTANT(-0.134084945035999990)*t;
+    y[27] += CONSTANT(0.119929220739999990)*t;
+
+    // [13,21]: 2,12,14,30,32,
+    tf = CONSTANT(0.238413613504000000)*f[2]+CONSTANT(0.099322584599300004)*f[12]+CONSTANT(0.102579924282000000)*f[14]+CONSTANT(0.009577496073830001)*f[30]+CONSTANT(0.104682806112000000)*f[32];
+    tg = CONSTANT(0.238413613504000000)*g[2]+CONSTANT(0.099322584599300004)*g[12]+CONSTANT(0.102579924282000000)*g[14]+CONSTANT(0.009577496073830001)*g[30]+CONSTANT(0.104682806112000000)*g[32];
+    y[13] += tf*g[21]+tg*f[21];
+    y[21] += tf*g[13]+tg*f[13];
+    t = f[13]*g[21]+f[21]*g[13];
+    y[2] += CONSTANT(0.238413613504000000)*t;
+    y[12] += CONSTANT(0.099322584599300004)*t;
+    y[14] += CONSTANT(0.102579924282000000)*t;
+    y[30] += CONSTANT(0.009577496073830001)*t;
+    y[32] += CONSTANT(0.104682806112000000)*t;
+
+    // [13,24]: 15,33,35,
+    tf = CONSTANT(-0.117520066950999990)*f[15]+CONSTANT(0.119929220742000010)*f[33]+CONSTANT(-0.134084945037000000)*f[35];
+    tg = CONSTANT(-0.117520066950999990)*g[15]+CONSTANT(0.119929220742000010)*g[33]+CONSTANT(-0.134084945037000000)*g[35];
+    y[13] += tf*g[24]+tg*f[24];
+    y[24] += tf*g[13]+tg*f[13];
+    t = f[13]*g[24]+f[24]*g[13];
+    y[15] += CONSTANT(-0.117520066950999990)*t;
+    y[33] += CONSTANT(0.119929220742000010)*t;
+    y[35] += CONSTANT(-0.134084945037000000)*t;
+
+    // [13,31]: 6,22,20,8,
+    tf = CONSTANT(0.227318461243000010)*f[6]+CONSTANT(0.075189952565200002)*f[22]+CONSTANT(0.086019920779800002)*f[20]+CONSTANT(-0.065621187395299999)*f[8];
+    tg = CONSTANT(0.227318461243000010)*g[6]+CONSTANT(0.075189952565200002)*g[22]+CONSTANT(0.086019920779800002)*g[20]+CONSTANT(-0.065621187395299999)*g[8];
+    y[13] += tf*g[31]+tg*f[31];
+    y[31] += tf*g[13]+tg*f[13];
+    t = f[13]*g[31]+f[31]*g[13];
+    y[6] += CONSTANT(0.227318461243000010)*t;
+    y[22] += CONSTANT(0.075189952565200002)*t;
+    y[20] += CONSTANT(0.086019920779800002)*t;
+    y[8] += CONSTANT(-0.065621187395299999)*t;
+
+    // [14,14]: 0,20,24,
+    tf = CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.179514867494000000)*f[20]+CONSTANT(0.151717754049000010)*f[24];
+    tg = CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.179514867494000000)*g[20]+CONSTANT(0.151717754049000010)*g[24];
+    y[14] += tf*g[14]+tg*f[14];
+    t = f[14]*g[14];
+    y[0] += CONSTANT(0.282094791771999980)*t;
+    y[20] += CONSTANT(-0.179514867494000000)*t;
+    y[24] += CONSTANT(0.151717754049000010)*t;
+
+    // [14,17]: 11,1,25,29,
+    tf = CONSTANT(0.067850242288500007)*f[11]+CONSTANT(0.199471140196999990)*f[1]+CONSTANT(0.149911525925999990)*f[25]+CONSTANT(-0.113793659092000000)*f[29];
+    tg = CONSTANT(0.067850242288500007)*g[11]+CONSTANT(0.199471140196999990)*g[1]+CONSTANT(0.149911525925999990)*g[25]+CONSTANT(-0.113793659092000000)*g[29];
+    y[14] += tf*g[17]+tg*f[17];
+    y[17] += tf*g[14]+tg*f[14];
+    t = f[14]*g[17]+f[17]*g[14];
+    y[11] += CONSTANT(0.067850242288500007)*t;
+    y[1] += CONSTANT(0.199471140196999990)*t;
+    y[25] += CONSTANT(0.149911525925999990)*t;
+    y[29] += CONSTANT(-0.113793659092000000)*t;
+
+    // [14,22]: 12,2,30,34,
+    tf = CONSTANT(-0.044418410173299998)*f[12]+CONSTANT(0.213243618621000000)*f[2]+CONSTANT(-0.171327458205000000)*f[30]+CONSTANT(0.101358691177000000)*f[34];
+    tg = CONSTANT(-0.044418410173299998)*g[12]+CONSTANT(0.213243618621000000)*g[2]+CONSTANT(-0.171327458205000000)*g[30]+CONSTANT(0.101358691177000000)*g[34];
+    y[14] += tf*g[22]+tg*f[22];
+    y[22] += tf*g[14]+tg*f[14];
+    t = f[14]*g[22]+f[22]*g[14];
+    y[12] += CONSTANT(-0.044418410173299998)*t;
+    y[2] += CONSTANT(0.213243618621000000)*t;
+    y[30] += CONSTANT(-0.171327458205000000)*t;
+    y[34] += CONSTANT(0.101358691177000000)*t;
+
+    // [14,23]: 13,3,31,35,
+    tf = CONSTANT(0.067850242288500007)*f[13]+CONSTANT(0.199471140196999990)*f[3]+CONSTANT(-0.113793659092000000)*f[31]+CONSTANT(0.149911525925999990)*f[35];
+    tg = CONSTANT(0.067850242288500007)*g[13]+CONSTANT(0.199471140196999990)*g[3]+CONSTANT(-0.113793659092000000)*g[31]+CONSTANT(0.149911525925999990)*g[35];
+    y[14] += tf*g[23]+tg*f[23];
+    y[23] += tf*g[14]+tg*f[14];
+    t = f[14]*g[23]+f[23]*g[14];
+    y[13] += CONSTANT(0.067850242288500007)*t;
+    y[3] += CONSTANT(0.199471140196999990)*t;
+    y[31] += CONSTANT(-0.113793659092000000)*t;
+    y[35] += CONSTANT(0.149911525925999990)*t;
+
+    // [14,32]: 20,6,24,
+    tf = CONSTANT(-0.065426753820500005)*f[20]+CONSTANT(0.190188269814000000)*f[6]+CONSTANT(-0.077413979109600004)*f[24];
+    tg = CONSTANT(-0.065426753820500005)*g[20]+CONSTANT(0.190188269814000000)*g[6]+CONSTANT(-0.077413979109600004)*g[24];
+    y[14] += tf*g[32]+tg*f[32];
+    y[32] += tf*g[14]+tg*f[14];
+    t = f[14]*g[32]+f[32]*g[14];
+    y[20] += CONSTANT(-0.065426753820500005)*t;
+    y[6] += CONSTANT(0.190188269814000000)*t;
+    y[24] += CONSTANT(-0.077413979109600004)*t;
+
+    // [15,15]: 0,6,20,
+    tf = CONSTANT(0.282094791766999970)*f[0]+CONSTANT(-0.210261043508000010)*f[6]+CONSTANT(0.076934943209800002)*f[20];
+    tg = CONSTANT(0.282094791766999970)*g[0]+CONSTANT(-0.210261043508000010)*g[6]+CONSTANT(0.076934943209800002)*g[20];
+    y[15] += tf*g[15]+tg*f[15];
+    t = f[15]*g[15];
+    y[0] += CONSTANT(0.282094791766999970)*t;
+    y[6] += CONSTANT(-0.210261043508000010)*t;
+    y[20] += CONSTANT(0.076934943209800002)*t;
+
+    // [15,21]: 14,32,34,
+    tf = CONSTANT(-0.099322584600699995)*f[14]+CONSTANT(0.126698363970000010)*f[32]+CONSTANT(-0.131668802180999990)*f[34];
+    tg = CONSTANT(-0.099322584600699995)*g[14]+CONSTANT(0.126698363970000010)*g[32]+CONSTANT(-0.131668802180999990)*g[34];
+    y[15] += tf*g[21]+tg*f[21];
+    y[21] += tf*g[15]+tg*f[15];
+    t = f[15]*g[21]+f[21]*g[15];
+    y[14] += CONSTANT(-0.099322584600699995)*t;
+    y[32] += CONSTANT(0.126698363970000010)*t;
+    y[34] += CONSTANT(-0.131668802180999990)*t;
+
+    // [15,22]: 13,3,31,35,
+    tf = CONSTANT(0.133255230518000010)*f[13]+CONSTANT(-0.043528171378199997)*f[3]+CONSTANT(-0.101584686311000000)*f[31]+CONSTANT(-0.098140130732499997)*f[35];
+    tg = CONSTANT(0.133255230518000010)*g[13]+CONSTANT(-0.043528171378199997)*g[3]+CONSTANT(-0.101584686311000000)*g[31]+CONSTANT(-0.098140130732499997)*g[35];
+    y[15] += tf*g[22]+tg*f[22];
+    y[22] += tf*g[15]+tg*f[15];
+    t = f[15]*g[22]+f[22]*g[15];
+    y[13] += CONSTANT(0.133255230518000010)*t;
+    y[3] += CONSTANT(-0.043528171378199997)*t;
+    y[31] += CONSTANT(-0.101584686311000000)*t;
+    y[35] += CONSTANT(-0.098140130732499997)*t;
+
+    // [15,23]: 12,2,30,
+    tf = CONSTANT(-0.203550726872999990)*f[12]+CONSTANT(0.162867503964999990)*f[2]+CONSTANT(0.098140130728100003)*f[30];
+    tg = CONSTANT(-0.203550726872999990)*g[12]+CONSTANT(0.162867503964999990)*g[2]+CONSTANT(0.098140130728100003)*g[30];
+    y[15] += tf*g[23]+tg*f[23];
+    y[23] += tf*g[15]+tg*f[15];
+    t = f[15]*g[23]+f[23]*g[15];
+    y[12] += CONSTANT(-0.203550726872999990)*t;
+    y[2] += CONSTANT(0.162867503964999990)*t;
+    y[30] += CONSTANT(0.098140130728100003)*t;
+
+    // [15,33]: 6,20,
+    tf = CONSTANT(0.126792179874999990)*f[6]+CONSTANT(-0.196280261464999990)*f[20];
+    tg = CONSTANT(0.126792179874999990)*g[6]+CONSTANT(-0.196280261464999990)*g[20];
+    y[15] += tf*g[33]+tg*f[33];
+    y[33] += tf*g[15]+tg*f[15];
+    t = f[15]*g[33]+f[33]*g[15];
+    y[6] += CONSTANT(0.126792179874999990)*t;
+    y[20] += CONSTANT(-0.196280261464999990)*t;
+
+    // [16,16]: 0,6,20,
+    tf = CONSTANT(0.282094791763999990)*f[0]+CONSTANT(-0.229375683829000000)*f[6]+CONSTANT(0.106525305981000000)*f[20];
+    tg = CONSTANT(0.282094791763999990)*g[0]+CONSTANT(-0.229375683829000000)*g[6]+CONSTANT(0.106525305981000000)*g[20];
+    y[16] += tf*g[16]+tg*f[16];
+    t = f[16]*g[16];
+    y[0] += CONSTANT(0.282094791763999990)*t;
+    y[6] += CONSTANT(-0.229375683829000000)*t;
+    y[20] += CONSTANT(0.106525305981000000)*t;
+
+    // [16,18]: 8,22,
+    tf = CONSTANT(-0.075080816693699995)*f[8]+CONSTANT(0.135045473380000000)*f[22];
+    tg = CONSTANT(-0.075080816693699995)*g[8]+CONSTANT(0.135045473380000000)*g[22];
+    y[16] += tf*g[18]+tg*f[18];
+    y[18] += tf*g[16]+tg*f[16];
+    t = f[16]*g[18]+f[18]*g[16];
+    y[8] += CONSTANT(-0.075080816693699995)*t;
+    y[22] += CONSTANT(0.135045473380000000)*t;
+
+    // [16,23]: 19,5,
+    tf = CONSTANT(-0.119098912754999990)*f[19]+CONSTANT(0.140463346187999990)*f[5];
+    tg = CONSTANT(-0.119098912754999990)*g[19]+CONSTANT(0.140463346187999990)*g[5];
+    y[16] += tf*g[23]+tg*f[23];
+    y[23] += tf*g[16]+tg*f[16];
+    t = f[16]*g[23]+f[23]*g[16];
+    y[19] += CONSTANT(-0.119098912754999990)*t;
+    y[5] += CONSTANT(0.140463346187999990)*t;
+
+    // [16,26]: 12,2,30,
+    tf = CONSTANT(-0.207723503645000000)*f[12]+CONSTANT(0.147319200325000010)*f[2]+CONSTANT(0.130197596199999990)*f[30];
+    tg = CONSTANT(-0.207723503645000000)*g[12]+CONSTANT(0.147319200325000010)*g[2]+CONSTANT(0.130197596199999990)*g[30];
+    y[16] += tf*g[26]+tg*f[26];
+    y[26] += tf*g[16]+tg*f[16];
+    t = f[16]*g[26]+f[26]*g[16];
+    y[12] += CONSTANT(-0.207723503645000000)*t;
+    y[2] += CONSTANT(0.147319200325000010)*t;
+    y[30] += CONSTANT(0.130197596199999990)*t;
+
+    // [16,28]: 14,32,
+    tf = CONSTANT(-0.077413979111300005)*f[14]+CONSTANT(0.128376561115000010)*f[32];
+    tg = CONSTANT(-0.077413979111300005)*g[14]+CONSTANT(0.128376561115000010)*g[32];
+    y[16] += tf*g[28]+tg*f[28];
+    y[28] += tf*g[16]+tg*f[16];
+    t = f[16]*g[28]+f[28]*g[16];
+    y[14] += CONSTANT(-0.077413979111300005)*t;
+    y[32] += CONSTANT(0.128376561115000010)*t;
+
+    // [16,29]: 15,33,35,
+    tf = CONSTANT(0.035835708931099997)*f[15]+CONSTANT(-0.118853600623999990)*f[33]+CONSTANT(-0.053152946071899999)*f[35];
+    tg = CONSTANT(0.035835708931099997)*g[15]+CONSTANT(-0.118853600623999990)*g[33]+CONSTANT(-0.053152946071899999)*g[35];
+    y[16] += tf*g[29]+tg*f[29];
+    y[29] += tf*g[16]+tg*f[16];
+    t = f[16]*g[29]+f[29]*g[16];
+    y[15] += CONSTANT(0.035835708931099997)*t;
+    y[33] += CONSTANT(-0.118853600623999990)*t;
+    y[35] += CONSTANT(-0.053152946071899999)*t;
+
+    // [16,31]: 27,9,25,
+    tf = CONSTANT(-0.118853600623999990)*f[27]+CONSTANT(0.035835708931099997)*f[9]+CONSTANT(0.053152946071899999)*f[25];
+    tg = CONSTANT(-0.118853600623999990)*g[27]+CONSTANT(0.035835708931099997)*g[9]+CONSTANT(0.053152946071899999)*g[25];
+    y[16] += tf*g[31]+tg*f[31];
+    y[31] += tf*g[16]+tg*f[16];
+    t = f[16]*g[31]+f[31]*g[16];
+    y[27] += CONSTANT(-0.118853600623999990)*t;
+    y[9] += CONSTANT(0.035835708931099997)*t;
+    y[25] += CONSTANT(0.053152946071899999)*t;
+
+    // [17,17]: 0,6,20,
+    tf = CONSTANT(0.282094791768999990)*f[0]+CONSTANT(-0.057343920955899998)*f[6]+CONSTANT(-0.159787958979000000)*f[20];
+    tg = CONSTANT(0.282094791768999990)*g[0]+CONSTANT(-0.057343920955899998)*g[6]+CONSTANT(-0.159787958979000000)*g[20];
+    y[17] += tf*g[17]+tg*f[17];
+    t = f[17]*g[17];
+    y[0] += CONSTANT(0.282094791768999990)*t;
+    y[6] += CONSTANT(-0.057343920955899998)*t;
+    y[20] += CONSTANT(-0.159787958979000000)*t;
+
+    // [17,19]: 8,22,24,
+    tf = CONSTANT(-0.112621225039000000)*f[8]+CONSTANT(0.045015157794100001)*f[22]+CONSTANT(0.119098912753000000)*f[24];
+    tg = CONSTANT(-0.112621225039000000)*g[8]+CONSTANT(0.045015157794100001)*g[22]+CONSTANT(0.119098912753000000)*g[24];
+    y[17] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[17]+tg*f[17];
+    t = f[17]*g[19]+f[19]*g[17];
+    y[8] += CONSTANT(-0.112621225039000000)*t;
+    y[22] += CONSTANT(0.045015157794100001)*t;
+    y[24] += CONSTANT(0.119098912753000000)*t;
+
+    // [17,21]: 16,4,18,
+    tf = CONSTANT(-0.119098912754999990)*f[16]+CONSTANT(-0.112621225039000000)*f[4]+CONSTANT(0.045015157794399997)*f[18];
+    tg = CONSTANT(-0.119098912754999990)*g[16]+CONSTANT(-0.112621225039000000)*g[4]+CONSTANT(0.045015157794399997)*g[18];
+    y[17] += tf*g[21]+tg*f[21];
+    y[21] += tf*g[17]+tg*f[17];
+    t = f[17]*g[21]+f[21]*g[17];
+    y[16] += CONSTANT(-0.119098912754999990)*t;
+    y[4] += CONSTANT(-0.112621225039000000)*t;
+    y[18] += CONSTANT(0.045015157794399997)*t;
+
+    // [17,26]: 3,13,31,
+    tf = CONSTANT(0.208340811096000000)*f[3]+CONSTANT(0.029982305185199998)*f[13]+CONSTANT(-0.118853600623999990)*f[31];
+    tg = CONSTANT(0.208340811096000000)*g[3]+CONSTANT(0.029982305185199998)*g[13]+CONSTANT(-0.118853600623999990)*g[31];
+    y[17] += tf*g[26]+tg*f[26];
+    y[26] += tf*g[17]+tg*f[17];
+    t = f[17]*g[26]+f[26]*g[17];
+    y[3] += CONSTANT(0.208340811096000000)*t;
+    y[13] += CONSTANT(0.029982305185199998)*t;
+    y[31] += CONSTANT(-0.118853600623999990)*t;
+
+    // [17,27]: 12,2,30,
+    tf = CONSTANT(-0.103861751821000010)*f[12]+CONSTANT(0.196425600433000000)*f[2]+CONSTANT(-0.130197596204999990)*f[30];
+    tg = CONSTANT(-0.103861751821000010)*g[12]+CONSTANT(0.196425600433000000)*g[2]+CONSTANT(-0.130197596204999990)*g[30];
+    y[17] += tf*g[27]+tg*f[27];
+    y[27] += tf*g[17]+tg*f[17];
+    t = f[17]*g[27]+f[27]*g[17];
+    y[12] += CONSTANT(-0.103861751821000010)*t;
+    y[2] += CONSTANT(0.196425600433000000)*t;
+    y[30] += CONSTANT(-0.130197596204999990)*t;
+
+    // [17,28]: 13,3,31,35,
+    tf = CONSTANT(0.121172043789000000)*f[13]+CONSTANT(-0.060142811686500000)*f[3]+CONSTANT(0.034310079156700000)*f[31]+CONSTANT(0.099440056652200001)*f[35];
+    tg = CONSTANT(0.121172043789000000)*g[13]+CONSTANT(-0.060142811686500000)*g[3]+CONSTANT(0.034310079156700000)*g[31]+CONSTANT(0.099440056652200001)*g[35];
+    y[17] += tf*g[28]+tg*f[28];
+    y[28] += tf*g[17]+tg*f[17];
+    t = f[17]*g[28]+f[28]*g[17];
+    y[13] += CONSTANT(0.121172043789000000)*t;
+    y[3] += CONSTANT(-0.060142811686500000)*t;
+    y[31] += CONSTANT(0.034310079156700000)*t;
+    y[35] += CONSTANT(0.099440056652200001)*t;
+
+    // [17,32]: 11,1,25,29,
+    tf = CONSTANT(0.121172043788000010)*f[11]+CONSTANT(-0.060142811686900000)*f[1]+CONSTANT(-0.099440056652700004)*f[25]+CONSTANT(0.034310079156599997)*f[29];
+    tg = CONSTANT(0.121172043788000010)*g[11]+CONSTANT(-0.060142811686900000)*g[1]+CONSTANT(-0.099440056652700004)*g[25]+CONSTANT(0.034310079156599997)*g[29];
+    y[17] += tf*g[32]+tg*f[32];
+    y[32] += tf*g[17]+tg*f[17];
+    t = f[17]*g[32]+f[32]*g[17];
+    y[11] += CONSTANT(0.121172043788000010)*t;
+    y[1] += CONSTANT(-0.060142811686900000)*t;
+    y[25] += CONSTANT(-0.099440056652700004)*t;
+    y[29] += CONSTANT(0.034310079156599997)*t;
+
+    // [17,34]: 29,11,1,
+    tf = CONSTANT(0.118853600623000000)*f[29]+CONSTANT(-0.029982305185400002)*f[11]+CONSTANT(-0.208340811100000000)*f[1];
+    tg = CONSTANT(0.118853600623000000)*g[29]+CONSTANT(-0.029982305185400002)*g[11]+CONSTANT(-0.208340811100000000)*g[1];
+    y[17] += tf*g[34]+tg*f[34];
+    y[34] += tf*g[17]+tg*f[17];
+    t = f[17]*g[34]+f[34]*g[17];
+    y[29] += CONSTANT(0.118853600623000000)*t;
+    y[11] += CONSTANT(-0.029982305185400002)*t;
+    y[1] += CONSTANT(-0.208340811100000000)*t;
+
+    // [18,18]: 6,0,20,24,
+    tf = CONSTANT(0.065535909662600006)*f[6]+CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.083698454702400005)*f[20]+CONSTANT(-0.135045473384000000)*f[24];
+    tg = CONSTANT(0.065535909662600006)*g[6]+CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.083698454702400005)*g[20]+CONSTANT(-0.135045473384000000)*g[24];
+    y[18] += tf*g[18]+tg*f[18];
+    t = f[18]*g[18];
+    y[6] += CONSTANT(0.065535909662600006)*t;
+    y[0] += CONSTANT(0.282094791771999980)*t;
+    y[20] += CONSTANT(-0.083698454702400005)*t;
+    y[24] += CONSTANT(-0.135045473384000000)*t;
+
+    // [18,19]: 7,21,23,
+    tf = CONSTANT(0.090297865407399994)*f[7]+CONSTANT(0.102084782359000000)*f[21]+CONSTANT(-0.045015157794399997)*f[23];
+    tg = CONSTANT(0.090297865407399994)*g[7]+CONSTANT(0.102084782359000000)*g[21]+CONSTANT(-0.045015157794399997)*g[23];
+    y[18] += tf*g[19]+tg*f[19];
+    y[19] += tf*g[18]+tg*f[18];
+    t = f[18]*g[19]+f[19]*g[18];
+    y[7] += CONSTANT(0.090297865407399994)*t;
+    y[21] += CONSTANT(0.102084782359000000)*t;
+    y[23] += CONSTANT(-0.045015157794399997)*t;
+
+    // [18,25]: 15,33,
+    tf = CONSTANT(-0.098140130731999994)*f[15]+CONSTANT(0.130197596202000000)*f[33];
+    tg = CONSTANT(-0.098140130731999994)*g[15]+CONSTANT(0.130197596202000000)*g[33];
+    y[18] += tf*g[25]+tg*f[25];
+    y[25] += tf*g[18]+tg*f[18];
+    t = f[18]*g[25]+f[25]*g[18];
+    y[15] += CONSTANT(-0.098140130731999994)*t;
+    y[33] += CONSTANT(0.130197596202000000)*t;
+
+    // [18,26]: 14,32,
+    tf = CONSTANT(0.101358691174000000)*f[14]+CONSTANT(0.084042186965900004)*f[32];
+    tg = CONSTANT(0.101358691174000000)*g[14]+CONSTANT(0.084042186965900004)*g[32];
+    y[18] += tf*g[26]+tg*f[26];
+    y[26] += tf*g[18]+tg*f[18];
+    t = f[18]*g[26]+f[26]*g[18];
+    y[14] += CONSTANT(0.101358691174000000)*t;
+    y[32] += CONSTANT(0.084042186965900004)*t;
+
+    // [18,27]: 13,3,35,
+    tf = CONSTANT(0.101990215611000000)*f[13]+CONSTANT(0.183739324705999990)*f[3]+CONSTANT(-0.130197596202000000)*f[35];
+    tg = CONSTANT(0.101990215611000000)*g[13]+CONSTANT(0.183739324705999990)*g[3]+CONSTANT(-0.130197596202000000)*g[35];
+    y[18] += tf*g[27]+tg*f[27];
+    y[27] += tf*g[18]+tg*f[18];
+    t = f[18]*g[27]+f[27]*g[18];
+    y[13] += CONSTANT(0.101990215611000000)*t;
+    y[3] += CONSTANT(0.183739324705999990)*t;
+    y[35] += CONSTANT(-0.130197596202000000)*t;
+
+    // [18,28]: 2,12,30,34,
+    tf = CONSTANT(0.225033795606000010)*f[2]+CONSTANT(0.022664492358099999)*f[12]+CONSTANT(-0.099440056651100006)*f[30]+CONSTANT(-0.084042186968800003)*f[34];
+    tg = CONSTANT(0.225033795606000010)*g[2]+CONSTANT(0.022664492358099999)*g[12]+CONSTANT(-0.099440056651100006)*g[30]+CONSTANT(-0.084042186968800003)*g[34];
+    y[18] += tf*g[28]+tg*f[28];
+    y[28] += tf*g[18]+tg*f[18];
+    t = f[18]*g[28]+f[28]*g[18];
+    y[2] += CONSTANT(0.225033795606000010)*t;
+    y[12] += CONSTANT(0.022664492358099999)*t;
+    y[30] += CONSTANT(-0.099440056651100006)*t;
+    y[34] += CONSTANT(-0.084042186968800003)*t;
+
+    // [18,29]: 3,13,15,31,
+    tf = CONSTANT(-0.085054779966799998)*f[3]+CONSTANT(0.075189952564900006)*f[13]+CONSTANT(0.101584686310000010)*f[15]+CONSTANT(0.097043558538999999)*f[31];
+    tg = CONSTANT(-0.085054779966799998)*g[3]+CONSTANT(0.075189952564900006)*g[13]+CONSTANT(0.101584686310000010)*g[15]+CONSTANT(0.097043558538999999)*g[31];
+    y[18] += tf*g[29]+tg*f[29];
+    y[29] += tf*g[18]+tg*f[18];
+    t = f[18]*g[29]+f[29]*g[18];
+    y[3] += CONSTANT(-0.085054779966799998)*t;
+    y[13] += CONSTANT(0.075189952564900006)*t;
+    y[15] += CONSTANT(0.101584686310000010)*t;
+    y[31] += CONSTANT(0.097043558538999999)*t;
+
+    // [19,19]: 6,8,0,20,22,
+    tf = CONSTANT(0.139263808033999990)*f[6]+CONSTANT(-0.141889406570999990)*f[8]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.068480553847200004)*f[20]+CONSTANT(-0.102084782360000000)*f[22];
+    tg = CONSTANT(0.139263808033999990)*g[6]+CONSTANT(-0.141889406570999990)*g[8]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.068480553847200004)*g[20]+CONSTANT(-0.102084782360000000)*g[22];
+    y[19] += tf*g[19]+tg*f[19];
+    t = f[19]*g[19];
+    y[6] += CONSTANT(0.139263808033999990)*t;
+    y[8] += CONSTANT(-0.141889406570999990)*t;
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[20] += CONSTANT(0.068480553847200004)*t;
+    y[22] += CONSTANT(-0.102084782360000000)*t;
+
+    // [19,25]: 34,
+    tf = CONSTANT(-0.130197596205999990)*f[34];
+    tg = CONSTANT(-0.130197596205999990)*g[34];
+    y[19] += tf*g[25]+tg*f[25];
+    y[25] += tf*g[19]+tg*f[19];
+    t = f[19]*g[25]+f[25]*g[19];
+    y[34] += CONSTANT(-0.130197596205999990)*t;
+
+    // [19,26]: 15,35,
+    tf = CONSTANT(-0.131668802182000000)*f[15]+CONSTANT(0.130197596204999990)*f[35];
+    tg = CONSTANT(-0.131668802182000000)*g[15]+CONSTANT(0.130197596204999990)*g[35];
+    y[19] += tf*g[26]+tg*f[26];
+    y[26] += tf*g[19]+tg*f[19];
+    t = f[19]*g[26]+f[26]*g[19];
+    y[15] += CONSTANT(-0.131668802182000000)*t;
+    y[35] += CONSTANT(0.130197596204999990)*t;
+
+    // [19,27]: 14,32,
+    tf = CONSTANT(0.025339672793899998)*f[14]+CONSTANT(0.084042186967699994)*f[32];
+    tg = CONSTANT(0.025339672793899998)*g[14]+CONSTANT(0.084042186967699994)*g[32];
+    y[19] += tf*g[27]+tg*f[27];
+    y[27] += tf*g[19]+tg*f[19];
+    t = f[19]*g[27]+f[27]*g[19];
+    y[14] += CONSTANT(0.025339672793899998)*t;
+    y[32] += CONSTANT(0.084042186967699994)*t;
+
+    // [19,28]: 13,3,15,31,33,
+    tf = CONSTANT(0.104682806111000000)*f[13]+CONSTANT(0.159122922869999990)*f[3]+CONSTANT(-0.126698363970000010)*f[15]+CONSTANT(0.090775936911399999)*f[31]+CONSTANT(-0.084042186968400004)*f[33];
+    tg = CONSTANT(0.104682806111000000)*g[13]+CONSTANT(0.159122922869999990)*g[3]+CONSTANT(-0.126698363970000010)*g[15]+CONSTANT(0.090775936911399999)*g[31]+CONSTANT(-0.084042186968400004)*g[33];
+    y[19] += tf*g[28]+tg*f[28];
+    y[28] += tf*g[19]+tg*f[19];
+    t = f[19]*g[28]+f[28]*g[19];
+    y[13] += CONSTANT(0.104682806111000000)*t;
+    y[3] += CONSTANT(0.159122922869999990)*t;
+    y[15] += CONSTANT(-0.126698363970000010)*t;
+    y[31] += CONSTANT(0.090775936911399999)*t;
+    y[33] += CONSTANT(-0.084042186968400004)*t;
+
+    // [19,29]: 12,14,2,30,32,
+    tf = CONSTANT(0.115089467124000010)*f[12]+CONSTANT(-0.097749909977199997)*f[14]+CONSTANT(0.240571246744999990)*f[2]+CONSTANT(0.053152946072499999)*f[30]+CONSTANT(-0.090775936912099994)*f[32];
+    tg = CONSTANT(0.115089467124000010)*g[12]+CONSTANT(-0.097749909977199997)*g[14]+CONSTANT(0.240571246744999990)*g[2]+CONSTANT(0.053152946072499999)*g[30]+CONSTANT(-0.090775936912099994)*g[32];
+    y[19] += tf*g[29]+tg*f[29];
+    y[29] += tf*g[19]+tg*f[19];
+    t = f[19]*g[29]+f[29]*g[19];
+    y[12] += CONSTANT(0.115089467124000010)*t;
+    y[14] += CONSTANT(-0.097749909977199997)*t;
+    y[2] += CONSTANT(0.240571246744999990)*t;
+    y[30] += CONSTANT(0.053152946072499999)*t;
+    y[32] += CONSTANT(-0.090775936912099994)*t;
+
+    // [20,20]: 6,0,20,
+    tf = CONSTANT(0.163839797503000010)*f[6]+CONSTANT(0.282094802232000010)*f[0];
+    tg = CONSTANT(0.163839797503000010)*g[6]+CONSTANT(0.282094802232000010)*g[0];
+    y[20] += tf*g[20]+tg*f[20];
+    t = f[20]*g[20];
+    y[6] += CONSTANT(0.163839797503000010)*t;
+    y[0] += CONSTANT(0.282094802232000010)*t;
+    y[20] += CONSTANT(0.136961139005999990)*t;
+
+    // [21,21]: 6,20,0,8,22,
+    tf = CONSTANT(0.139263808033999990)*f[6]+CONSTANT(0.068480553847200004)*f[20]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(0.141889406570999990)*f[8]+CONSTANT(0.102084782360000000)*f[22];
+    tg = CONSTANT(0.139263808033999990)*g[6]+CONSTANT(0.068480553847200004)*g[20]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(0.141889406570999990)*g[8]+CONSTANT(0.102084782360000000)*g[22];
+    y[21] += tf*g[21]+tg*f[21];
+    t = f[21]*g[21];
+    y[6] += CONSTANT(0.139263808033999990)*t;
+    y[20] += CONSTANT(0.068480553847200004)*t;
+    y[0] += CONSTANT(0.282094791773999990)*t;
+    y[8] += CONSTANT(0.141889406570999990)*t;
+    y[22] += CONSTANT(0.102084782360000000)*t;
+
+    // [21,23]: 8,22,24,
+    tf = CONSTANT(-0.112621225039000000)*f[8]+CONSTANT(0.045015157794100001)*f[22]+CONSTANT(-0.119098912753000000)*f[24];
+    tg = CONSTANT(-0.112621225039000000)*g[8]+CONSTANT(0.045015157794100001)*g[22]+CONSTANT(-0.119098912753000000)*g[24];
+    y[21] += tf*g[23]+tg*f[23];
+    y[23] += tf*g[21]+tg*f[21];
+    t = f[21]*g[23]+f[23]*g[21];
+    y[8] += CONSTANT(-0.112621225039000000)*t;
+    y[22] += CONSTANT(0.045015157794100001)*t;
+    y[24] += CONSTANT(-0.119098912753000000)*t;
+
+    // [21,26]: 9,25,
+    tf = CONSTANT(-0.131668802182000000)*f[9]+CONSTANT(-0.130197596204999990)*f[25];
+    tg = CONSTANT(-0.131668802182000000)*g[9]+CONSTANT(-0.130197596204999990)*g[25];
+    y[21] += tf*g[26]+tg*f[26];
+    y[26] += tf*g[21]+tg*f[21];
+    t = f[21]*g[26]+f[26]*g[21];
+    y[9] += CONSTANT(-0.131668802182000000)*t;
+    y[25] += CONSTANT(-0.130197596204999990)*t;
+
+    // [21,28]: 27,1,11,9,29,
+    tf = CONSTANT(0.084042186968400004)*f[27]+CONSTANT(0.159122922869999990)*f[1]+CONSTANT(0.104682806111000000)*f[11]+CONSTANT(0.126698363970000010)*f[9]+CONSTANT(0.090775936911399999)*f[29];
+    tg = CONSTANT(0.084042186968400004)*g[27]+CONSTANT(0.159122922869999990)*g[1]+CONSTANT(0.104682806111000000)*g[11]+CONSTANT(0.126698363970000010)*g[9]+CONSTANT(0.090775936911399999)*g[29];
+    y[21] += tf*g[28]+tg*f[28];
+    y[28] += tf*g[21]+tg*f[21];
+    t = f[21]*g[28]+f[28]*g[21];
+    y[27] += CONSTANT(0.084042186968400004)*t;
+    y[1] += CONSTANT(0.159122922869999990)*t;
+    y[11] += CONSTANT(0.104682806111000000)*t;
+    y[9] += CONSTANT(0.126698363970000010)*t;
+    y[29] += CONSTANT(0.090775936911399999)*t;
+
+    // [21,31]: 14,2,30,12,32,
+    tf = CONSTANT(0.097749909977199997)*f[14]+CONSTANT(0.240571246744999990)*f[2]+CONSTANT(0.053152946072499999)*f[30]+CONSTANT(0.115089467124000010)*f[12]+CONSTANT(0.090775936912099994)*f[32];
+    tg = CONSTANT(0.097749909977199997)*g[14]+CONSTANT(0.240571246744999990)*g[2]+CONSTANT(0.053152946072499999)*g[30]+CONSTANT(0.115089467124000010)*g[12]+CONSTANT(0.090775936912099994)*g[32];
+    y[21] += tf*g[31]+tg*f[31];
+    y[31] += tf*g[21]+tg*f[21];
+    t = f[21]*g[31]+f[31]*g[21];
+    y[14] += CONSTANT(0.097749909977199997)*t;
+    y[2] += CONSTANT(0.240571246744999990)*t;
+    y[30] += CONSTANT(0.053152946072499999)*t;
+    y[12] += CONSTANT(0.115089467124000010)*t;
+    y[32] += CONSTANT(0.090775936912099994)*t;
+
+    // [21,33]: 32,14,
+    tf = CONSTANT(0.084042186967699994)*f[32]+CONSTANT(0.025339672793899998)*f[14];
+    tg = CONSTANT(0.084042186967699994)*g[32]+CONSTANT(0.025339672793899998)*g[14];
+    y[21] += tf*g[33]+tg*f[33];
+    y[33] += tf*g[21]+tg*f[21];
+    t = f[21]*g[33]+f[33]*g[21];
+    y[32] += CONSTANT(0.084042186967699994)*t;
+    y[14] += CONSTANT(0.025339672793899998)*t;
+
+    // [21,34]: 35,
+    tf = CONSTANT(-0.130197596205999990)*f[35];
+    tg = CONSTANT(-0.130197596205999990)*g[35];
+    y[21] += tf*g[34]+tg*f[34];
+    y[34] += tf*g[21]+tg*f[21];
+    t = f[21]*g[34]+f[34]*g[21];
+    y[35] += CONSTANT(-0.130197596205999990)*t;
+
+    // [22,22]: 6,20,0,24,
+    tf = CONSTANT(0.065535909662600006)*f[6]+CONSTANT(-0.083698454702400005)*f[20]+CONSTANT(0.282094791771999980)*f[0]+CONSTANT(0.135045473384000000)*f[24];
+    tg = CONSTANT(0.065535909662600006)*g[6]+CONSTANT(-0.083698454702400005)*g[20]+CONSTANT(0.282094791771999980)*g[0]+CONSTANT(0.135045473384000000)*g[24];
+    y[22] += tf*g[22]+tg*f[22];
+    t = f[22]*g[22];
+    y[6] += CONSTANT(0.065535909662600006)*t;
+    y[20] += CONSTANT(-0.083698454702400005)*t;
+    y[0] += CONSTANT(0.282094791771999980)*t;
+    y[24] += CONSTANT(0.135045473384000000)*t;
+
+    // [22,26]: 10,28,
CONSTANT(0.101358691174000000)*f[10]+CONSTANT(0.084042186965900004)*f[28]; + tg = CONSTANT(0.101358691174000000)*g[10]+CONSTANT(0.084042186965900004)*g[28]; + y[22] += tf*g[26]+tg*f[26]; + y[26] += tf*g[22]+tg*f[22]; + t = f[22]*g[26]+f[26]*g[22]; + y[10] += CONSTANT(0.101358691174000000)*t; + y[28] += CONSTANT(0.084042186965900004)*t; + + // [22,27]: 1,11,25, + tf = CONSTANT(0.183739324704000010)*f[1]+CONSTANT(0.101990215611000000)*f[11]+CONSTANT(0.130197596200999990)*f[25]; + tg = CONSTANT(0.183739324704000010)*g[1]+CONSTANT(0.101990215611000000)*g[11]+CONSTANT(0.130197596200999990)*g[25]; + y[22] += tf*g[27]+tg*f[27]; + y[27] += tf*g[22]+tg*f[22]; + t = f[22]*g[27]+f[27]*g[22]; + y[1] += CONSTANT(0.183739324704000010)*t; + y[11] += CONSTANT(0.101990215611000000)*t; + y[25] += CONSTANT(0.130197596200999990)*t; + + // [22,32]: 2,30,12,34, + tf = CONSTANT(0.225033795606000010)*f[2]+CONSTANT(-0.099440056651100006)*f[30]+CONSTANT(0.022664492358099999)*f[12]+CONSTANT(0.084042186968800003)*f[34]; + tg = CONSTANT(0.225033795606000010)*g[2]+CONSTANT(-0.099440056651100006)*g[30]+CONSTANT(0.022664492358099999)*g[12]+CONSTANT(0.084042186968800003)*g[34]; + y[22] += tf*g[32]+tg*f[32]; + y[32] += tf*g[22]+tg*f[22]; + t = f[22]*g[32]+f[32]*g[22]; + y[2] += CONSTANT(0.225033795606000010)*t; + y[30] += CONSTANT(-0.099440056651100006)*t; + y[12] += CONSTANT(0.022664492358099999)*t; + y[34] += CONSTANT(0.084042186968800003)*t; + + // [22,33]: 3,13,35, + tf = CONSTANT(0.183739324704000010)*f[3]+CONSTANT(0.101990215611000000)*f[13]+CONSTANT(0.130197596200999990)*f[35]; + tg = CONSTANT(0.183739324704000010)*g[3]+CONSTANT(0.101990215611000000)*g[13]+CONSTANT(0.130197596200999990)*g[35]; + y[22] += tf*g[33]+tg*f[33]; + y[33] += tf*g[22]+tg*f[22]; + t = f[22]*g[33]+f[33]*g[22]; + y[3] += CONSTANT(0.183739324704000010)*t; + y[13] += CONSTANT(0.101990215611000000)*t; + y[35] += CONSTANT(0.130197596200999990)*t; + + // [23,23]: 6,20,0, + tf = CONSTANT(-0.057343920955899998)*f[6]+CONSTANT(-0.159787958979000000)*f[20]+CONSTANT(0.282094791768999990)*f[0]; + tg = CONSTANT(-0.057343920955899998)*g[6]+CONSTANT(-0.159787958979000000)*g[20]+CONSTANT(0.282094791768999990)*g[0]; + y[23] += tf*g[23]+tg*f[23]; + t = f[23]*g[23]; + y[6] += CONSTANT(-0.057343920955899998)*t; + y[20] += CONSTANT(-0.159787958979000000)*t; + y[0] += CONSTANT(0.282094791768999990)*t; + + // [23,26]: 1,11,29, + tf = CONSTANT(0.208340811096000000)*f[1]+CONSTANT(0.029982305185199998)*f[11]+CONSTANT(-0.118853600623999990)*f[29]; + tg = CONSTANT(0.208340811096000000)*g[1]+CONSTANT(0.029982305185199998)*g[11]+CONSTANT(-0.118853600623999990)*g[29]; + y[23] += tf*g[26]+tg*f[26]; + y[26] += tf*g[23]+tg*f[23]; + t = f[23]*g[26]+f[26]*g[23]; + y[1] += CONSTANT(0.208340811096000000)*t; + y[11] += CONSTANT(0.029982305185199998)*t; + y[29] += CONSTANT(-0.118853600623999990)*t; + + // [23,28]: 25,11,1,29, + tf = CONSTANT(-0.099440056652200001)*f[25]+CONSTANT(-0.121172043789000000)*f[11]+CONSTANT(0.060142811686500000)*f[1]+CONSTANT(-0.034310079156700000)*f[29]; + tg = CONSTANT(-0.099440056652200001)*g[25]+CONSTANT(-0.121172043789000000)*g[11]+CONSTANT(0.060142811686500000)*g[1]+CONSTANT(-0.034310079156700000)*g[29]; + y[23] += tf*g[28]+tg*f[28]; + y[28] += tf*g[23]+tg*f[23]; + t = f[23]*g[28]+f[28]*g[23]; + y[25] += CONSTANT(-0.099440056652200001)*t; + y[11] += CONSTANT(-0.121172043789000000)*t; + y[1] += CONSTANT(0.060142811686500000)*t; + y[29] += CONSTANT(-0.034310079156700000)*t; + + // [23,32]: 31,13,3,35, + tf = 
CONSTANT(0.034310079156599997)*f[31]+CONSTANT(0.121172043788000010)*f[13]+CONSTANT(-0.060142811686900000)*f[3]+CONSTANT(-0.099440056652700004)*f[35]; + tg = CONSTANT(0.034310079156599997)*g[31]+CONSTANT(0.121172043788000010)*g[13]+CONSTANT(-0.060142811686900000)*g[3]+CONSTANT(-0.099440056652700004)*g[35]; + y[23] += tf*g[32]+tg*f[32]; + y[32] += tf*g[23]+tg*f[23]; + t = f[23]*g[32]+f[32]*g[23]; + y[31] += CONSTANT(0.034310079156599997)*t; + y[13] += CONSTANT(0.121172043788000010)*t; + y[3] += CONSTANT(-0.060142811686900000)*t; + y[35] += CONSTANT(-0.099440056652700004)*t; + + // [23,33]: 2,30,12, + tf = CONSTANT(0.196425600433000000)*f[2]+CONSTANT(-0.130197596204999990)*f[30]+CONSTANT(-0.103861751821000010)*f[12]; + tg = CONSTANT(0.196425600433000000)*g[2]+CONSTANT(-0.130197596204999990)*g[30]+CONSTANT(-0.103861751821000010)*g[12]; + y[23] += tf*g[33]+tg*f[33]; + y[33] += tf*g[23]+tg*f[23]; + t = f[23]*g[33]+f[33]*g[23]; + y[2] += CONSTANT(0.196425600433000000)*t; + y[30] += CONSTANT(-0.130197596204999990)*t; + y[12] += CONSTANT(-0.103861751821000010)*t; + + // [23,34]: 3,13,31, + tf = CONSTANT(0.208340811100000000)*f[3]+CONSTANT(0.029982305185400002)*f[13]+CONSTANT(-0.118853600623000000)*f[31]; + tg = CONSTANT(0.208340811100000000)*g[3]+CONSTANT(0.029982305185400002)*g[13]+CONSTANT(-0.118853600623000000)*g[31]; + y[23] += tf*g[34]+tg*f[34]; + y[34] += tf*g[23]+tg*f[23]; + t = f[23]*g[34]+f[34]*g[23]; + y[3] += CONSTANT(0.208340811100000000)*t; + y[13] += CONSTANT(0.029982305185400002)*t; + y[31] += CONSTANT(-0.118853600623000000)*t; + + // [24,24]: 6,0,20, + tf = CONSTANT(-0.229375683829000000)*f[6]+CONSTANT(0.282094791763999990)*f[0]+CONSTANT(0.106525305981000000)*f[20]; + tg = CONSTANT(-0.229375683829000000)*g[6]+CONSTANT(0.282094791763999990)*g[0]+CONSTANT(0.106525305981000000)*g[20]; + y[24] += tf*g[24]+tg*f[24]; + t = f[24]*g[24]; + y[6] += CONSTANT(-0.229375683829000000)*t; + y[0] += CONSTANT(0.282094791763999990)*t; + y[20] += CONSTANT(0.106525305981000000)*t; + + // [24,29]: 9,27,25, + tf = CONSTANT(-0.035835708931400000)*f[9]+CONSTANT(0.118853600623000000)*f[27]+CONSTANT(0.053152946071199997)*f[25]; + tg = CONSTANT(-0.035835708931400000)*g[9]+CONSTANT(0.118853600623000000)*g[27]+CONSTANT(0.053152946071199997)*g[25]; + y[24] += tf*g[29]+tg*f[29]; + y[29] += tf*g[24]+tg*f[24]; + t = f[24]*g[29]+f[29]*g[24]; + y[9] += CONSTANT(-0.035835708931400000)*t; + y[27] += CONSTANT(0.118853600623000000)*t; + y[25] += CONSTANT(0.053152946071199997)*t; + + // [24,31]: 15,33,35, + tf = CONSTANT(0.035835708931400000)*f[15]+CONSTANT(-0.118853600623000000)*f[33]+CONSTANT(0.053152946071199997)*f[35]; + tg = CONSTANT(0.035835708931400000)*g[15]+CONSTANT(-0.118853600623000000)*g[33]+CONSTANT(0.053152946071199997)*g[35]; + y[24] += tf*g[31]+tg*f[31]; + y[31] += tf*g[24]+tg*f[24]; + t = f[24]*g[31]+f[31]*g[24]; + y[15] += CONSTANT(0.035835708931400000)*t; + y[33] += CONSTANT(-0.118853600623000000)*t; + y[35] += CONSTANT(0.053152946071199997)*t; + + // [24,34]: 12,30,2, + tf = CONSTANT(-0.207723503645000000)*f[12]+CONSTANT(0.130197596199999990)*f[30]+CONSTANT(0.147319200325000010)*f[2]; + tg = CONSTANT(-0.207723503645000000)*g[12]+CONSTANT(0.130197596199999990)*g[30]+CONSTANT(0.147319200325000010)*g[2]; + y[24] += tf*g[34]+tg*f[34]; + y[34] += tf*g[24]+tg*f[24]; + t = f[24]*g[34]+f[34]*g[24]; + y[12] += CONSTANT(-0.207723503645000000)*t; + y[30] += CONSTANT(0.130197596199999990)*t; + y[2] += CONSTANT(0.147319200325000010)*t; + + // [25,25]: 0,6,20, + tf = 
CONSTANT(0.282094791761999970)*f[0]+CONSTANT(-0.242608896358999990)*f[6]+CONSTANT(0.130197596198000000)*f[20]; + tg = CONSTANT(0.282094791761999970)*g[0]+CONSTANT(-0.242608896358999990)*g[6]+CONSTANT(0.130197596198000000)*g[20]; + y[25] += tf*g[25]+tg*f[25]; + t = f[25]*g[25]; + y[0] += CONSTANT(0.282094791761999970)*t; + y[6] += CONSTANT(-0.242608896358999990)*t; + y[20] += CONSTANT(0.130197596198000000)*t; + + // [26,26]: 6,20,0, + tf = CONSTANT(-0.097043558542400002)*f[6]+CONSTANT(-0.130197596207000000)*f[20]+CONSTANT(0.282094791766000000)*f[0]; + tg = CONSTANT(-0.097043558542400002)*g[6]+CONSTANT(-0.130197596207000000)*g[20]+CONSTANT(0.282094791766000000)*g[0]; + y[26] += tf*g[26]+tg*f[26]; + t = f[26]*g[26]; + y[6] += CONSTANT(-0.097043558542400002)*t; + y[20] += CONSTANT(-0.130197596207000000)*t; + y[0] += CONSTANT(0.282094791766000000)*t; + + // [27,27]: 0,20,6, + tf = CONSTANT(0.282094791770000020)*f[0]+CONSTANT(-0.130197596204999990)*f[20]+CONSTANT(0.016173926423100001)*f[6]; + tg = CONSTANT(0.282094791770000020)*g[0]+CONSTANT(-0.130197596204999990)*g[20]+CONSTANT(0.016173926423100001)*g[6]; + y[27] += tf*g[27]+tg*f[27]; + t = f[27]*g[27]; + y[0] += CONSTANT(0.282094791770000020)*t; + y[20] += CONSTANT(-0.130197596204999990)*t; + y[6] += CONSTANT(0.016173926423100001)*t; + + // [28,28]: 6,0,20,24, + tf = CONSTANT(0.097043558538800007)*f[6]+CONSTANT(0.282094791771999980)*f[0]+CONSTANT(-0.021699599367299999)*f[20]+CONSTANT(-0.128376561118000000)*f[24]; + tg = CONSTANT(0.097043558538800007)*g[6]+CONSTANT(0.282094791771999980)*g[0]+CONSTANT(-0.021699599367299999)*g[20]+CONSTANT(-0.128376561118000000)*g[24]; + y[28] += tf*g[28]+tg*f[28]; + t = f[28]*g[28]; + y[6] += CONSTANT(0.097043558538800007)*t; + y[0] += CONSTANT(0.282094791771999980)*t; + y[20] += CONSTANT(-0.021699599367299999)*t; + y[24] += CONSTANT(-0.128376561118000000)*t; + + // [29,29]: 20,6,0,22,8, + tf = CONSTANT(0.086798397468799998)*f[20]+CONSTANT(0.145565337808999990)*f[6]+CONSTANT(0.282094791773999990)*f[0]+CONSTANT(-0.097043558539500002)*f[22]+CONSTANT(-0.140070311615000000)*f[8]; + tg = CONSTANT(0.086798397468799998)*g[20]+CONSTANT(0.145565337808999990)*g[6]+CONSTANT(0.282094791773999990)*g[0]+CONSTANT(-0.097043558539500002)*g[22]+CONSTANT(-0.140070311615000000)*g[8]; + y[29] += tf*g[29]+tg*f[29]; + t = f[29]*g[29]; + y[20] += CONSTANT(0.086798397468799998)*t; + y[6] += CONSTANT(0.145565337808999990)*t; + y[0] += CONSTANT(0.282094791773999990)*t; + y[22] += CONSTANT(-0.097043558539500002)*t; + y[8] += CONSTANT(-0.140070311615000000)*t; + + // [30,30]: 0,20,6, + tf = CONSTANT(0.282094804531000000)*f[0]+CONSTANT(0.130197634486000000)*f[20]+CONSTANT(0.161739292769000010)*f[6]; + tg = CONSTANT(0.282094804531000000)*g[0]+CONSTANT(0.130197634486000000)*g[20]+CONSTANT(0.161739292769000010)*g[6]; + y[30] += tf*g[30]+tg*f[30]; + t = f[30]*g[30]; + y[0] += CONSTANT(0.282094804531000000)*t; + y[20] += CONSTANT(0.130197634486000000)*t; + y[6] += CONSTANT(0.161739292769000010)*t; + + // [31,31]: 6,8,20,22,0, + tf = CONSTANT(0.145565337808999990)*f[6]+CONSTANT(0.140070311615000000)*f[8]+CONSTANT(0.086798397468799998)*f[20]+CONSTANT(0.097043558539500002)*f[22]+CONSTANT(0.282094791773999990)*f[0]; + tg = CONSTANT(0.145565337808999990)*g[6]+CONSTANT(0.140070311615000000)*g[8]+CONSTANT(0.086798397468799998)*g[20]+CONSTANT(0.097043558539500002)*g[22]+CONSTANT(0.282094791773999990)*g[0]; + y[31] += tf*g[31]+tg*f[31]; + t = f[31]*g[31]; + y[6] += CONSTANT(0.145565337808999990)*t; + y[8] += CONSTANT(0.140070311615000000)*t; + y[20] += 
CONSTANT(0.086798397468799998)*t; + y[22] += CONSTANT(0.097043558539500002)*t; + y[0] += CONSTANT(0.282094791773999990)*t; + + // [32,32]: 0,24,20,6, + tf = CONSTANT(0.282094791771999980)*f[0]+CONSTANT(0.128376561118000000)*f[24]+CONSTANT(-0.021699599367299999)*f[20]+CONSTANT(0.097043558538800007)*f[6]; + tg = CONSTANT(0.282094791771999980)*g[0]+CONSTANT(0.128376561118000000)*g[24]+CONSTANT(-0.021699599367299999)*g[20]+CONSTANT(0.097043558538800007)*g[6]; + y[32] += tf*g[32]+tg*f[32]; + t = f[32]*g[32]; + y[0] += CONSTANT(0.282094791771999980)*t; + y[24] += CONSTANT(0.128376561118000000)*t; + y[20] += CONSTANT(-0.021699599367299999)*t; + y[6] += CONSTANT(0.097043558538800007)*t; + + // [33,33]: 6,20,0, + tf = CONSTANT(0.016173926423100001)*f[6]+CONSTANT(-0.130197596204999990)*f[20]+CONSTANT(0.282094791770000020)*f[0]; + tg = CONSTANT(0.016173926423100001)*g[6]+CONSTANT(-0.130197596204999990)*g[20]+CONSTANT(0.282094791770000020)*g[0]; + y[33] += tf*g[33]+tg*f[33]; + t = f[33]*g[33]; + y[6] += CONSTANT(0.016173926423100001)*t; + y[20] += CONSTANT(-0.130197596204999990)*t; + y[0] += CONSTANT(0.282094791770000020)*t; + + // [34,34]: 20,6,0, + tf = CONSTANT(-0.130197596207000000)*f[20]+CONSTANT(-0.097043558542400002)*f[6]+CONSTANT(0.282094791766000000)*f[0]; + tg = CONSTANT(-0.130197596207000000)*g[20]+CONSTANT(-0.097043558542400002)*g[6]+CONSTANT(0.282094791766000000)*g[0]; + y[34] += tf*g[34]+tg*f[34]; + t = f[34]*g[34]; + y[20] += CONSTANT(-0.130197596207000000)*t; + y[6] += CONSTANT(-0.097043558542400002)*t; + y[0] += CONSTANT(0.282094791766000000)*t; + + // [35,35]: 6,0,20, + tf = CONSTANT(-0.242608896358999990)*f[6]+CONSTANT(0.282094791761999970)*f[0]+CONSTANT(0.130197596198000000)*f[20]; + tg = CONSTANT(-0.242608896358999990)*g[6]+CONSTANT(0.282094791761999970)*g[0]+CONSTANT(0.130197596198000000)*g[20]; + y[35] += tf*g[35]+tg*f[35]; + t = f[35]*g[35]; + y[6] += CONSTANT(-0.242608896358999990)*t; + y[0] += CONSTANT(0.282094791761999970)*t; + y[20] += CONSTANT(0.130197596198000000)*t; + + // multiply count=2527 + + return y; +} + + +//------------------------------------------------------------------------------------- +// Evaluates a directional light and returns spectral SH data. The output +// vector is computed so that if the intensity of R/G/B is unit the resulting +// exit radiance of a point directly under the light on a diffuse object with +// an albedo of 1 would be 1.0. This will compute 3 spectral samples, resultR +// has to be specified, while resultG and resultB are optional. +// +// http://msdn.microsoft.com/en-us/library/windows/desktop/bb204988.aspx +//------------------------------------------------------------------------------------- +bool XM_CALLCONV XMSHEvalDirectionalLight( _In_ size_t order, + _In_ FXMVECTOR dir, + _In_ FXMVECTOR color, + _Out_writes_(order*order) float *resultR, + _Out_writes_opt_(order*order) float *resultG, + _Out_writes_opt_(order*order) float *resultB ) +{ + if ( !resultR ) + return false; + + if ( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ) + return false; + + XMFLOAT3A clr; + XMStoreFloat3A( &clr, color ); + + float fTmp[ XM_SH_MAXORDER * XM_SH_MAXORDER ]; + + XMSHEvalDirection(fTmp,order,dir); // evaluate the BF in this direction... 
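+
+    // fTmp now holds the SH basis functions evaluated along dir. What follows
+    // is a per-band rescale: fNorm = XM_PI / CosWtInt(order) normalizes by the
+    // integral of the cosine-weighted transfer function for the retained bands,
+    // which is what makes a unit-intensity light produce an exit radiance of
+    // 1.0 on a unit-albedo diffuse surface, as documented above.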
+
+    // now compute "normalization" and scale vector for each valid spectral band
+    const float fNorm = XM_PI / CosWtInt(order);
+
+    const size_t numcoeff = order*order;
+
+    const float fRScale = fNorm * clr.x;
+
+    for( size_t i=0; i < numcoeff; ++i)
+    {
+        resultR[i] = fTmp[i] * fRScale;
+    }
+
+    if (resultG)
+    {
+        const float fGScale = fNorm * clr.y;
+
+        for( size_t i=0; i < numcoeff; ++i)
+        {
+            resultG[i] = fTmp[i] * fGScale;
+        }
+    }
+
+    if (resultB)
+    {
+        const float fBScale = fNorm * clr.z;
+
+        for( size_t i=0; i < numcoeff; ++i)
+        {
+            resultB[i] = fTmp[i]*fBScale;
+        }
+    }
+
+    return true;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Evaluates a spherical light and returns spectral SH data. There is no
+// normalization of the intensity of the light like there is for directional
+// lights, so care has to be taken when specifying the intensities. This will
+// compute 3 spectral samples; resultR has to be specified, while resultG and
+// resultB are optional.
+//
+// http://msdn.microsoft.com/en-us/library/windows/desktop/bb205451.aspx
+//-------------------------------------------------------------------------------------
+bool XM_CALLCONV XMSHEvalSphericalLight( _In_ size_t order,
+                                         _In_ FXMVECTOR pos,
+                                         _In_ float radius,
+                                         _In_ FXMVECTOR color,
+                                         _Out_writes_(order*order) float *resultR,
+                                         _Out_writes_opt_(order*order) float *resultG,
+                                         _Out_writes_opt_(order*order) float *resultB )
+{
+    if ( !resultR )
+        return false;
+
+    if ( radius < 0.f )
+        return false;
+
+    const float fDist = XMVectorGetX( XMVector3Length( pos ) );
+
+    // WARNING: fDist should not be < radius - otherwise the light contains the origin
+
+    //const float fSinConeAngle = (fDist <= radius) ? 0.99999f : radius/fDist;
+    const float fConeAngle = (fDist <= radius) ? (XM_PIDIV2) : asinf(radius/fDist);
+
+    XMVECTOR dir = XMVector3Normalize( pos );
+
+    float fTmpDir[ XM_SH_MAXORDER* XM_SH_MAXORDER ]; // rotation "vector"
+    float fTmpL0[ XM_SH_MAXORDER ];
+
+    //
+    // Sphere at distance fDist: the cone angle is determined by looking at the
+    // right triangle with one side (the hypotenuse) being the vector from the
+    // origin to the center of the sphere, another side from the origin to
+    // a point on the sphere whose normal is perpendicular to the given side (this
+    // is one of the points on the cone that is defined by the projection of the sphere
+    // through the origin - we want to find the angle of this cone) and the final
+    // side being from the center of the sphere to the point of tangency (the two
+    // sides connected to this are at a right angle by construction.)
+    // From trig we know that sin(theta) = ||opposite||/||hypotenuse||, where
+    // ||opposite|| = Radius, ||hypotenuse|| = fDist
+    // theta is the angle of the cone that subtends the sphere from the origin
+    //
+
+    // no default normalization is done for this case; you have to be careful how
+    // you represent the coefficients...
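+
+    // Worked example of the geometry above: a sphere of radius 1 whose center
+    // sits at fDist = 2 gives sin(theta) = radius/fDist = 0.5, so fConeAngle =
+    // asinf(0.5f), about 0.5236 radians (30 degrees) - i.e. the sphere subtends
+    // a cone with a 60-degree full apex angle as seen from the origin.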
+
+    const float fNewNorm = 1.0f;///(fSinConeAngle*fSinConeAngle);
+
+    ComputeCapInt(order,fConeAngle,fTmpL0);
+
+    XMFLOAT3A vd;
+    XMStoreFloat3( &vd, dir );
+
+    const float fX = vd.x;
+    const float fY = vd.y;
+    const float fZ = vd.z;
+
+    switch (order)
+    {
+    case 2:
+        sh_eval_basis_1(fX,fY,fZ,fTmpDir);
+        break;
+
+    case 3:
+        sh_eval_basis_2(fX,fY,fZ,fTmpDir);
+        break;
+
+    case 4:
+        sh_eval_basis_3(fX,fY,fZ,fTmpDir);
+        break;
+
+    case 5:
+        sh_eval_basis_4(fX,fY,fZ,fTmpDir);
+        break;
+
+    case 6:
+        sh_eval_basis_5(fX,fY,fZ,fTmpDir);
+        break;
+
+    default:
+        assert( order < XM_SH_MINORDER || order > XM_SH_MAXORDER );
+        return false;
+    }
+
+    XMFLOAT3A clr;
+    XMStoreFloat3A( &clr, color );
+
+    for( size_t i=0; i < order; ++i)
+    {
+        const size_t cNumCoefs = 2*i + 1;
+        const size_t cStart = i*i;
+        const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i];
+        for( size_t j=0; j < cNumCoefs; ++j) resultR[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.x;
+    }
+
+    if ( resultG )
+    {
+        for( size_t i=0; i < order; ++i)
+        {
+            const size_t cNumCoefs = 2*i + 1;
+            const size_t cStart = i*i;
+            const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i];
+            for( size_t j=0; j < cNumCoefs; ++j) resultG[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.y;
+        }
+    }
+
+    if ( resultB )
+    {
+        for( size_t i=0; i < order; ++i)
+        {
+            const size_t cNumCoefs = 2*i + 1;
+            const size_t cStart = i*i;
+            const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i];
+            for( size_t j=0; j < cNumCoefs; ++j) resultB[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.z;
+        }
+    }
+
+    return true;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Evaluates a cone of constant intensity (the radius parameter is the cone's
+// half-angle in radians) and returns spectral SH data. resultR has to be
+// specified, while resultG and resultB are optional.
+//-------------------------------------------------------------------------------------
+bool XM_CALLCONV XMSHEvalConeLight( _In_ size_t order,
+                                    _In_ FXMVECTOR dir,
+                                    _In_ float radius,
+                                    _In_ FXMVECTOR color,
+                                    _Out_writes_(order*order) float *resultR,
+                                    _Out_writes_opt_(order*order) float *resultG,
+                                    _Out_writes_opt_(order*order) float *resultB )
+{
+    if ( !resultR )
+        return false;
+
+    if ( order < XM_SH_MINORDER || order > XM_SH_MAXORDER )
+        return false;
+
+    if ( radius < 0.f || radius > (XM_PI*1.00001f) )
+        return false;
+
+    if (radius < 0.0001f)
+    {
+        // turn it into a pure directional light...
+        return XMSHEvalDirectionalLight(order, dir,color,resultR,resultG,resultB);
+    }
+    else
+    {
+        float fTmpL0[ XM_SH_MAXORDER ];
+        float fTmpDir[ XM_SH_MAXORDER * XM_SH_MAXORDER ];
+
+        const float fConeAngle = radius;
+        const float fAngCheck = (fConeAngle > XM_PIDIV2) ? (XM_PIDIV2) : fConeAngle;
+
+        const float fNewNorm = 1.0f/(sinf(fAngCheck)*sinf(fAngCheck));
+
+        ComputeCapInt(order,fConeAngle,fTmpL0);
+
+        XMFLOAT3A vd;
+        XMStoreFloat3( &vd, dir );
+
+        const float fX = vd.x;
+        const float fY = vd.y;
+        const float fZ = vd.z;
+
+        switch (order)
+        {
+        case 2:
+            sh_eval_basis_1(fX,fY,fZ,fTmpDir);
+            break;
+
+        case 3:
+            sh_eval_basis_2(fX,fY,fZ,fTmpDir);
+            break;
+
+        case 4:
+            sh_eval_basis_3(fX,fY,fZ,fTmpDir);
+            break;
+
+        case 5:
+            sh_eval_basis_4(fX,fY,fZ,fTmpDir);
+            break;
+
+        case 6:
+            sh_eval_basis_5(fX,fY,fZ,fTmpDir);
+            break;
+
+        default:
+            assert( order < XM_SH_MINORDER || order > XM_SH_MAXORDER );
+            return false;
+        }
+
+        XMFLOAT3A clr;
+        XMStoreFloat3A( &clr, color );
+
+        for( size_t i=0; i < order; ++i)
+        {
+            const size_t cNumCoefs = 2*i + 1;
+            const size_t cStart = i*i;
+            const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i];
+            for( size_t j=0; j < cNumCoefs; ++j) resultR[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.x;
+        }
+
+        if ( resultG )
+        {
+            for( size_t i=0; i < order; ++i)
+            {
+                const size_t cNumCoefs = 2*i + 1;
+                const size_t cStart = i*i;
+                const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i];
+                for( size_t j=0; j < cNumCoefs; ++j) resultG[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.y;
+            }
+        }
+
+        if ( resultB )
+        {
+            for( size_t i=0; i < order; ++i)
+            {
+                const size_t cNumCoefs = 2*i + 1;
+                const size_t cStart = i*i;
+                const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i];
+                for( size_t j=0; j < cNumCoefs; ++j) resultB[cStart+j] = fTmpDir[cStart+j]*fValUse*clr.z;
+            }
+        }
+    }
+
+    return true;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Evaluates a hemisphere light and returns spectral SH data: the sphere of
+// incoming directions is a linear blend from topColor along +dir to
+// bottomColor along -dir. resultR has to be specified, while resultG and
+// resultB are optional.
+//-------------------------------------------------------------------------------------
+bool XM_CALLCONV XMSHEvalHemisphereLight( _In_ size_t order,
+                                          _In_ FXMVECTOR dir,
+                                          _In_ FXMVECTOR topColor,
+                                          _In_ FXMVECTOR bottomColor,
+                                          _Out_writes_(order*order) float *resultR,
+                                          _Out_writes_opt_(order*order) float *resultG,
+                                          _Out_writes_opt_(order*order) float *resultB )
+{
+    if ( !resultR )
+        return false;
+
+    if ( order < XM_SH_MINORDER || order > XM_SH_MAXORDER )
+        return false;
+
+    // separate R/G/B colors...
+
+    float fTmpDir[ XM_SH_MAXORDER * XM_SH_MAXORDER ]; // rotation "vector"
+    float fTmpL0[ XM_SH_MAXORDER ];
+
+    const float fNewNorm = 3.0f/2.0f; // normalizes things for 1 sky color, 0 ground color...
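+
+    // The hemisphere light only populates the first two SH bands: fTmpL0[0]
+    // (band 0) gets the average of the sky and ground colors, fTmpL0[1]
+    // (band 1, the linear band along dir) gets the sky-minus-average delta,
+    // and every higher-order coefficient is written as zero below. The 3/2
+    // factor above restores the documented unit-response convention for a
+    // sky color of 1 and a ground color of 0.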
+
+    XMFLOAT3A vd;
+    XMStoreFloat3( &vd, dir );
+
+    const float fX = vd.x;
+    const float fY = vd.y;
+    const float fZ = vd.z;
+
+    sh_eval_basis_1(fX,fY,fZ,fTmpDir);
+
+    XMFLOAT3A clrTop;
+    XMStoreFloat3A( &clrTop, topColor );
+
+    XMFLOAT3A clrBottom;
+    XMStoreFloat3A( &clrBottom, bottomColor );
+
+    float fA = clrTop.x;
+    float fAvrg = (clrTop.x + clrBottom.x)*0.5f;
+
+    fTmpL0[0] = fAvrg*2.0f*SHEvalHemisphereLight_fSqrtPi;
+    fTmpL0[1] = (fA - fAvrg)*2.0f*SHEvalHemisphereLight_fSqrtPi3;
+
+    size_t i = 0;
+    for( ; i<2; ++i)
+    {
+        _Analysis_assume_(i < order);
+        const size_t cNumCoefs = 2*i + 1;
+        const size_t cStart = i*i;
+        const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i];
+        for( size_t j=0; j < cNumCoefs; ++j) resultR[cStart+j] = fTmpDir[cStart+j]*fValUse;
+    }
+
+    for( ; i < order; ++i)
+    {
+        const size_t cNumCoefs = 2*i + 1;
+        const size_t cStart = i*i;
+        for( size_t j=0; j < cNumCoefs; ++j) resultR[cStart+j] = 0.0f;
+    }
+
+    if ( resultG )
+    {
+        fA = clrTop.y;
+        fAvrg = (clrTop.y + clrBottom.y)*0.5f;
+
+        fTmpL0[0] = fAvrg*2.0f*SHEvalHemisphereLight_fSqrtPi;
+        fTmpL0[1] = (fA - fAvrg)*2.0f*SHEvalHemisphereLight_fSqrtPi3;
+
+        for( i=0; i<2; ++i)
+        {
+            _Analysis_assume_(i < order);
+            const size_t cNumCoefs = 2*i + 1;
+            const size_t cStart = i*i;
+            const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i];
+            for( size_t j=0; j < cNumCoefs; ++j) resultG[cStart+j] = fTmpDir[cStart+j]*fValUse;
+        }
+
+        for( ; i < order; ++i)
+        {
+            const size_t cNumCoefs = 2*i + 1;
+            const size_t cStart = i*i;
+            for( size_t j=0; j < cNumCoefs; ++j) resultG[cStart+j] = 0.0f;
+        }
+    }
+
+    if ( resultB )
+    {
+        fA = clrTop.z;
+        fAvrg = (clrTop.z + clrBottom.z)*0.5f;
+
+        fTmpL0[0] = fAvrg*2.0f*SHEvalHemisphereLight_fSqrtPi;
+        fTmpL0[1] = (fA - fAvrg)*2.0f*SHEvalHemisphereLight_fSqrtPi3;
+
+        for( i=0; i<2; ++i)
+        {
+            _Analysis_assume_(i < order);
+            const size_t cNumCoefs = 2*i + 1;
+            const size_t cStart = i*i;
+            const float fValUse = fTmpL0[i]*fNewNorm*fExtraNormFac[i];
+            for( size_t j=0; j < cNumCoefs; ++j) resultB[cStart+j] = fTmpDir[cStart+j]*fValUse;
+        }
+
+        for( ; i < order; ++i)
+        {
+            const size_t cNumCoefs = 2*i + 1;
+            const size_t cStart = i*i;
+            for( size_t j=0; j < cNumCoefs; ++j) resultB[cStart+j] = 0.0f;
+        }
+    }
+
+    return true;
+}
+
+}; // namespace DirectX
diff --git a/SHMath/DirectXSH.h b/SHMath/DirectXSH.h
--- a/SHMath/DirectXSH.h
+++ b/SHMath/DirectXSH.h
-//-------------------------------------------------------------------------------------
-// DirectXSH.h -- C++ Spherical Harmonics Math Library
-//
-// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
-// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
-// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
-// PARTICULAR PURPOSE.
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-// http://go.microsoft.com/fwlink/p/?LinkId=262885
-//-------------------------------------------------------------------------------------
-
-#ifdef _MSC_VER
-#pragma once
-#endif
-
-#include <DirectXMath.h>
-
-#include
-
-struct ID3D11DeviceContext;
-struct ID3D11Texture2D;
-
-namespace DirectX
-{
-#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
-#define XM_CALLCONV __fastcall
-typedef const DirectX::XMVECTOR& HXMVECTOR;
-typedef const DirectX::XMMATRIX& FXMMATRIX;
-#endif
-
-const size_t XM_SH_MINORDER = 2;
-const size_t XM_SH_MAXORDER = 6;
-
-float* XM_CALLCONV XMSHEvalDirection( _Out_writes_(order*order) float *result, _In_ size_t order, _In_ FXMVECTOR dir );
-
-float* XM_CALLCONV XMSHRotate( _Out_writes_(order*order) float *result, _In_ size_t order, _In_ FXMMATRIX rotMatrix, _In_reads_(order*order) const float *input );
-
-float* XMSHRotateZ( _Out_writes_(order*order) float *result, _In_ size_t order, _In_ float angle, _In_reads_(order*order) const float *input );
-
-float* XMSHAdd( _Out_writes_(order*order) float *result, _In_ size_t order, _In_reads_(order*order) const float *inputA, _In_reads_(order*order) const float *inputB );
-
-float* XMSHScale( _Out_writes_(order*order) float *result, _In_ size_t order, _In_reads_(order*order) const float *input, _In_ float scale );
-
-float XMSHDot( _In_ size_t order, _In_reads_(order*order) const float *inputA, _In_reads_(order*order) const float *inputB );
-
-float* XMSHMultiply( _Out_writes_(order*order) float *result, _In_ size_t order, _In_reads_(order*order) const float *inputF, _In_reads_(order*order) const float *inputG );
-
-float* XMSHMultiply2( _Out_writes_(4) float *result, _In_reads_(4) const float *inputF, _In_reads_(4) const float *inputG );
-
-float* XMSHMultiply3( _Out_writes_(9) float *result, _In_reads_(9) const float *inputF, _In_reads_(9) const float *inputG );
-
-float* XMSHMultiply4( _Out_writes_(16) float *result, _In_reads_(16) const float *inputF, _In_reads_(16) const float *inputG );
-
-float* XMSHMultiply5( _Out_writes_(25) float *result, _In_reads_(25) const float *inputF, _In_reads_(25) const float *inputG );
-
-float* XMSHMultiply6( _Out_writes_(36) float *result, _In_reads_(36) const float *inputF, _In_reads_(36) const float *inputG );
-
-bool XM_CALLCONV XMSHEvalDirectionalLight( _In_ size_t order, _In_ FXMVECTOR dir, _In_ FXMVECTOR color,
-    _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB );
-
-bool XM_CALLCONV XMSHEvalSphericalLight( _In_ size_t order, _In_ FXMVECTOR pos, _In_ float radius, _In_ FXMVECTOR color,
-    _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB );
-
-bool XM_CALLCONV XMSHEvalConeLight( _In_ size_t order, _In_ FXMVECTOR dir, _In_ float radius, _In_ FXMVECTOR color,
-    _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB );
-
-bool XM_CALLCONV 
XMSHEvalHemisphereLight( _In_ size_t order, _In_ FXMVECTOR dir, _In_ FXMVECTOR topColor, _In_ FXMVECTOR bottomColor, - _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB ); - -HRESULT SHProjectCubeMap( _In_ ID3D11DeviceContext *context, _In_ size_t order, _In_ ID3D11Texture2D *cubeMap, - _Out_writes_opt_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB ); - -}; // namespace DirectX +//------------------------------------------------------------------------------------- +// DirectXSH.h -- C++ Spherical Harmonics Math Library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// http://go.microsoft.com/fwlink/p/?LinkId=262885 +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma once +#endif + +#define DIRECTX_SHMATH_VERSION 102 + +#include + +#include + +struct ID3D11DeviceContext; +struct ID3D11Texture2D; + +namespace DirectX +{ +#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV) +#define XM_CALLCONV __fastcall +typedef const DirectX::XMVECTOR& HXMVECTOR; +typedef const DirectX::XMMATRIX& FXMMATRIX; +#endif + +const size_t XM_SH_MINORDER = 2; +const size_t XM_SH_MAXORDER = 6; + +float* XM_CALLCONV XMSHEvalDirection( _Out_writes_(order*order) float *result, _In_ size_t order, _In_ FXMVECTOR dir ); + +float* XM_CALLCONV XMSHRotate( _Out_writes_(order*order) float *result, _In_ size_t order, _In_ FXMMATRIX rotMatrix, _In_reads_(order*order) const float *input ); + +float* XMSHRotateZ( _Out_writes_(order*order) float *result, _In_ size_t order, _In_ float angle, _In_reads_(order*order) const float *input ); + +float* XMSHAdd( _Out_writes_(order*order) float *result, _In_ size_t order, _In_reads_(order*order) const float *inputA, _In_reads_(order*order) const float *inputB ); + +float* XMSHScale( _Out_writes_(order*order) float *result, _In_ size_t order, _In_reads_(order*order) const float *input, _In_ float scale ); + +float XMSHDot( _In_ size_t order, _In_reads_(order*order) const float *inputA, _In_reads_(order*order) const float *inputB ); + +float* XMSHMultiply( _Out_writes_(order*order) float *result, _In_ size_t order, _In_reads_(order*order) const float *inputF, _In_reads_(order*order) const float *inputG ); + +float* XMSHMultiply2( _Out_writes_(4) float *result, _In_reads_(4) const float *inputF, _In_reads_(4) const float *inputG ); + +float* XMSHMultiply3( _Out_writes_(9) float *result, _In_reads_(9) const float *inputF, _In_reads_(9) const float *inputG ); + +float* XMSHMultiply4( _Out_writes_(16) float *result, _In_reads_(16) const float *inputF, _In_reads_(16) const float *inputG ); + +float* XMSHMultiply5( _Out_writes_(25) float *result, _In_reads_(25) const float *inputF, _In_reads_(25) const float *inputG ); + +float* XMSHMultiply6( _Out_writes_(36) float *result, _In_reads_(36) const float *inputF, _In_reads_(36) const float *inputG ); + +bool XM_CALLCONV XMSHEvalDirectionalLight( _In_ size_t order, _In_ FXMVECTOR dir, _In_ FXMVECTOR color, + _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB ); + +bool XM_CALLCONV 
XMSHEvalSphericalLight( _In_ size_t order, _In_ FXMVECTOR pos, _In_ float radius, _In_ FXMVECTOR color, + _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB ); + +bool XM_CALLCONV XMSHEvalConeLight( _In_ size_t order, _In_ FXMVECTOR dir, _In_ float radius, _In_ FXMVECTOR color, + _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB ); + +bool XM_CALLCONV XMSHEvalHemisphereLight( _In_ size_t order, _In_ FXMVECTOR dir, _In_ FXMVECTOR topColor, _In_ FXMVECTOR bottomColor, + _Out_writes_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB ); + +HRESULT SHProjectCubeMap( _In_ ID3D11DeviceContext *context, _In_ size_t order, _In_ ID3D11Texture2D *cubeMap, + _Out_writes_opt_(order*order) float *resultR, _Out_writes_opt_(order*order) float *resultG, _Out_writes_opt_(order*order) float *resultB ); + +}; // namespace DirectX diff --git a/SHMath/DirectXSHD3D11.cpp b/SHMath/DirectXSHD3D11.cpp index 9556526..73e24ad 100644 --- a/SHMath/DirectXSHD3D11.cpp +++ b/SHMath/DirectXSHD3D11.cpp @@ -1,390 +1,390 @@ -//------------------------------------------------------------------------------------- -// DirectXSHD3D11.cpp -- C++ Spherical Harmonics Math Library -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// http://go.microsoft.com/fwlink/p/?LinkId=262885 -//------------------------------------------------------------------------------------- - -#include "DirectXSH.h" - -#include - -#include - -#include -#include -#include - -namespace -{ -struct aligned_deleter { void operator()(void* p) { _aligned_free(p); } }; - -typedef std::unique_ptr ScopedAlignedArrayXMVECTOR; - -template class ScopedObject -{ -public: - explicit ScopedObject( T *p = 0 ) : _pointer(p) {} - ~ScopedObject() - { - if ( _pointer ) - { - _pointer->Release(); - _pointer = nullptr; - } - } - - bool IsNull() const { return (!_pointer); } - - T& operator*() { return *_pointer; } - T* operator->() { return _pointer; } - T** operator&() { return &_pointer; } - - void Reset(T *p = 0) { if ( _pointer ) { _pointer->Release(); } _pointer = p; } - - T* Get() const { return _pointer; } - -private: - ScopedObject(const ScopedObject&); - ScopedObject& operator=(const ScopedObject&); - - T* _pointer; -}; - -//------------------------------------------------------------------------------------- -// This code is lifted from DirectXTex http://directxtex.codeplex.com/ -// If you need additional DXGI format support, see DirectXTexConvert.cpp -//------------------------------------------------------------------------------------- -#define LOAD_SCANLINE( type, func )\ - if ( size >= sizeof(type) )\ - {\ - const type * __restrict sPtr = reinterpret_cast(pSource);\ - for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\ - {\ - if ( dPtr >= ePtr ) break;\ - *(dPtr++) = func( sPtr++ );\ - }\ - return true;\ - }\ - return false; - -#define LOAD_SCANLINE3( type, func, defvec )\ - if ( size >= sizeof(type) )\ - {\ - const type * __restrict sPtr = reinterpret_cast(pSource);\ - for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += 
sizeof(type) )\ - {\ - XMVECTOR v = func( sPtr++ );\ - if ( dPtr >= ePtr ) break;\ - *(dPtr++) = XMVectorSelect( defvec, v, g_XMSelect1110 );\ - }\ - return true;\ - }\ - return false; - -#define LOAD_SCANLINE2( type, func, defvec )\ - if ( size >= sizeof(type) )\ - {\ - const type * __restrict sPtr = reinterpret_cast(pSource);\ - for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\ - {\ - XMVECTOR v = func( sPtr++ );\ - if ( dPtr >= ePtr ) break;\ - *(dPtr++) = XMVectorSelect( defvec, v, g_XMSelect1100 );\ - }\ - return true;\ - }\ - return false; - -#pragma warning(push) -#pragma warning(disable : 6101) -_Success_(return) -static bool _LoadScanline( _Out_writes_(count) DirectX::XMVECTOR* pDestination, _In_ size_t count, - _In_reads_bytes_(size) LPCVOID pSource, _In_ size_t size, _In_ DXGI_FORMAT format ) -{ - assert( pDestination && count > 0 && (((uintptr_t)pDestination & 0xF) == 0) ); - assert( pSource && size > 0 ); - - using namespace DirectX; - using namespace DirectX::PackedVector; - - XMVECTOR* __restrict dPtr = pDestination; - if ( !dPtr ) - return false; - - const XMVECTOR* ePtr = pDestination + count; - - switch( format ) - { - case DXGI_FORMAT_R32G32B32A32_FLOAT: - { - size_t msize = (size > (sizeof(XMVECTOR)*count)) ? (sizeof(XMVECTOR)*count) : size; - memcpy_s( dPtr, sizeof(XMVECTOR)*count, pSource, msize ); - } - return true; - - case DXGI_FORMAT_R32G32B32_FLOAT: - LOAD_SCANLINE3( XMFLOAT3, XMLoadFloat3, g_XMIdentityR3 ) - - case DXGI_FORMAT_R16G16B16A16_FLOAT: - LOAD_SCANLINE( XMHALF4, XMLoadHalf4 ) - - case DXGI_FORMAT_R32G32_FLOAT: - LOAD_SCANLINE2( XMFLOAT2, XMLoadFloat2, g_XMIdentityR3 ) - - case DXGI_FORMAT_R11G11B10_FLOAT: - LOAD_SCANLINE3( XMFLOAT3PK, XMLoadFloat3PK, g_XMIdentityR3 ); - - case DXGI_FORMAT_R16G16_FLOAT: - LOAD_SCANLINE2( XMHALF2, XMLoadHalf2, g_XMIdentityR3 ) - - case DXGI_FORMAT_R32_FLOAT: - if ( size >= sizeof(float) ) - { - const float* __restrict sPtr = reinterpret_cast(pSource); - for( size_t icount = 0; icount < size; icount += sizeof(float) ) - { - XMVECTOR v = XMLoadFloat( sPtr++ ); - if ( dPtr >= ePtr ) break; - *(dPtr++) = XMVectorSelect( g_XMIdentityR3, v, g_XMSelect1000 ); - } - return true; - } - return false; - - case DXGI_FORMAT_R16_FLOAT: - if ( size >= sizeof(HALF) ) - { - const HALF * __restrict sPtr = reinterpret_cast(pSource); - for( size_t icount = 0; icount < size; icount += sizeof(HALF) ) - { - if ( dPtr >= ePtr ) break; - *(dPtr++) = XMVectorSet( XMConvertHalfToFloat(*sPtr++), 0.f, 0.f, 1.f ); - } - return true; - } - return false; - - default: - return false; - } -} -#pragma warning(pop) - -}; // namespace anonymous - -namespace DirectX -{ - -//------------------------------------------------------------------------------------- -// Projects a function represented in a cube map into spherical harmonics. 
-// -// http://msdn.microsoft.com/en-us/library/windows/desktop/ff476300.aspx -//------------------------------------------------------------------------------------- -HRESULT SHProjectCubeMap( _In_ ID3D11DeviceContext *context, - _In_ size_t order, - _In_ ID3D11Texture2D *cubeMap, - _Out_writes_opt_(order*order) float *resultR, - _Out_writes_opt_(order*order) float *resultG, - _Out_writes_opt_(order*order) float* resultB ) -{ - if ( !context || !cubeMap ) - return E_INVALIDARG; - - if ( order < XM_SH_MINORDER || order > XM_SH_MAXORDER ) - return E_INVALIDARG; - - D3D11_TEXTURE2D_DESC desc; - cubeMap->GetDesc( &desc ); - - if ( (desc.ArraySize != 6) - || (desc.Width != desc.Height) - || (desc.SampleDesc.Count > 1) ) - return E_FAIL; - - switch( desc.Format ) - { - case DXGI_FORMAT_R32G32B32A32_FLOAT: - case DXGI_FORMAT_R32G32B32_FLOAT: - case DXGI_FORMAT_R16G16B16A16_FLOAT: - case DXGI_FORMAT_R32G32_FLOAT: - case DXGI_FORMAT_R11G11B10_FLOAT: - case DXGI_FORMAT_R16G16_FLOAT: - case DXGI_FORMAT_R32_FLOAT: - case DXGI_FORMAT_R16_FLOAT: - // See _LoadScanline to support more pixel formats - break; - - default: - return E_FAIL; - } - - //--- Create a staging resource copy (if needed) to be able to read data - ID3D11Texture2D* texture = nullptr; - - ScopedObject staging; - if ( !(desc.CPUAccessFlags & D3D11_CPU_ACCESS_READ) ) - { - D3D11_TEXTURE2D_DESC sdesc = desc; - sdesc.BindFlags = 0; - sdesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; - sdesc.Usage = D3D11_USAGE_STAGING; - - ScopedObject device; - context->GetDevice( &device ); - assert( !device.IsNull() ); - - HRESULT hr = device->CreateTexture2D( &sdesc, nullptr, &staging ); - if ( FAILED(hr) ) - return hr; - - context->CopyResource( staging.Get(), cubeMap ); - - texture = staging.Get(); - } - else - texture = cubeMap; - - assert( texture != 0 ); - - //--- Setup for SH projection - ScopedAlignedArrayXMVECTOR scanline( reinterpret_cast( _aligned_malloc( sizeof(XMVECTOR)*desc.Width, 16 ) ) ); - if ( !scanline ) - return E_OUTOFMEMORY; - - assert( desc.Width > 0 ); - float fSize = static_cast( desc.Width ); - float fPicSize = 1.0f / fSize; - - // index from [0,W-1], f(0) maps to -1 + 1/W, f(W-1) maps to 1 - 1/w - // linear function x*S +B, 1st constraint means B is (-1+1/W), plug into - // second and solve for S: S = 2*(1-1/W)/(W-1). The old code that did - // this was incorrect - but only for computing the differential solid - // angle, where the final value was 1.0 instead of 1-1/w... - - float fB = -1.0f + 1.0f/fSize; - float fS = ( desc.Width > 1 ) ? 
(2.0f*(1.0f-1.0f/fSize)/(fSize-1.0f)) : 0.f; - - // clear out accumulation variables - float fWt = 0.0f; - - if ( resultR ) - memset( resultR, 0, sizeof(float)*order*order ); - if ( resultG ) - memset( resultG, 0, sizeof(float)*order*order ); - if ( resultB ) - memset( resultB, 0, sizeof(float)*order*order ); - - float shBuff[XM_SH_MAXORDER*XM_SH_MAXORDER]; - float shBuffB[XM_SH_MAXORDER*XM_SH_MAXORDER]; - - //--- Process each face of the cubemap - for (UINT face=0; face < 6; ++face ) - { - UINT dindex = D3D11CalcSubresource( 0, face, desc.MipLevels ); - - D3D11_MAPPED_SUBRESOURCE mapped; - HRESULT hr = context->Map( texture, dindex, D3D11_MAP_READ, 0, &mapped ); - if ( FAILED(hr) ) - return hr; - - const uint8_t *pSrc = reinterpret_cast(mapped.pData); - for( UINT y=0; y < desc.Height; ++y ) - { - XMVECTOR* ptr = scanline.get(); - if ( !_LoadScanline( ptr, desc.Width, pSrc, mapped.RowPitch, desc.Format ) ) - { - context->Unmap( texture, dindex ); - return E_FAIL; - } - - const float fV = y*fS + fB; - - XMVECTOR* pixel = ptr; - for( UINT x=0; x < desc.Width; ++x, ++pixel ) - { - const float fU = x*fS + fB; - - float ix, iy, iz; - switch( face ) - { - case 0: // Positive X - iz = 1.0f - (2.0f * (float)x + 1.0f) * fPicSize; - iy = 1.0f - (2.0f * (float)y + 1.0f) * fPicSize; - ix = 1.0f; - break; - - case 1: // Negative X - iz = -1.0f + (2.0f * (float)x + 1.0f) * fPicSize; - iy = 1.0f - (2.0f * (float)y + 1.0f) * fPicSize; - ix = -1; - break; - - case 2: // Positive Y - iz = -1.0f + (2.0f * (float)y + 1.0f) * fPicSize; - iy = 1.0f; - ix = -1.0f + (2.0f * (float)x + 1.0f) * fPicSize; - break; - - case 3: // Negative Y - iz = 1.0f - (2.0f * (float)y + 1.0f) * fPicSize; - iy = -1.0f; - ix = -1.0f + (2.0f * (float)x + 1.0f) * fPicSize; - break; - - case 4: // Positive Z - iz = 1.0f; - iy = 1.0f - (2.0f * (float)y + 1.0f) * fPicSize; - ix = -1.0f + (2.0f * (float)x + 1.0f) * fPicSize; - break; - - case 5: // Negative Z - iz = -1.0f; - iy = 1.0f - (2.0f * (float)y + 1.0f) * fPicSize; - ix = 1.0f - (2.0f * (float)x + 1.0f) * fPicSize; - break; - - default: - ix = iy = iz = 0.f; - assert(false); - break; - } - - XMVECTOR dir = XMVectorSet( ix, iy, iz, 0 ); - dir = XMVector3Normalize( dir ); - - const float fDiffSolid = 4.0f/((1.0f + fU*fU + fV*fV)*sqrtf(1.0f + fU*fU+fV*fV)); - fWt += fDiffSolid; - - XMSHEvalDirection(shBuff,order,dir); - - XMFLOAT3A clr; - XMStoreFloat3A( &clr, *pixel ); - - if ( resultR ) XMSHAdd(resultR,order,resultR, XMSHScale(shBuffB,order,shBuff,clr.x*fDiffSolid) ); - if ( resultG ) XMSHAdd(resultG,order,resultG, XMSHScale(shBuffB,order,shBuff,clr.y*fDiffSolid) ); - if ( resultB ) XMSHAdd(resultB,order,resultB, XMSHScale(shBuffB,order,shBuff,clr.z*fDiffSolid) ); - } - - pSrc += mapped.RowPitch; - } - - context->Unmap( texture, dindex ); - } - - const float fNormProj = (4.0f*XM_PI)/fWt; - - if ( resultR ) XMSHScale(resultR,order,resultR,fNormProj); - if ( resultG ) XMSHScale(resultG,order,resultG,fNormProj); - if ( resultB ) XMSHScale(resultB,order,resultB,fNormProj); - - return S_OK; -} - -}; // namespace DirectX +//------------------------------------------------------------------------------------- +// DirectXSHD3D11.cpp -- C++ Spherical Harmonics Math Library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//
+// http://go.microsoft.com/fwlink/p/?LinkId=262885
+//-------------------------------------------------------------------------------------
+
+#include "DirectXSH.h"
+
+#include
+
+#include
+
+#include
+#include
+#include
+
+namespace
+{
+struct aligned_deleter { void operator()(void* p) { _aligned_free(p); } };
+
+typedef std::unique_ptr<DirectX::XMVECTOR[], aligned_deleter> ScopedAlignedArrayXMVECTOR;
+
+template<typename T> class ScopedObject
+{
+public:
+    explicit ScopedObject( T *p = 0 ) : _pointer(p) {}
+    ~ScopedObject()
+    {
+        if ( _pointer )
+        {
+            _pointer->Release();
+            _pointer = nullptr;
+        }
+    }
+
+    bool IsNull() const { return (!_pointer); }
+
+    T& operator*() { return *_pointer; }
+    T* operator->() { return _pointer; }
+    T** operator&() { return &_pointer; }
+
+    void Reset(T *p = 0) { if ( _pointer ) { _pointer->Release(); } _pointer = p; }
+
+    T* Get() const { return _pointer; }
+
+private:
+    ScopedObject(const ScopedObject&);
+    ScopedObject& operator=(const ScopedObject&);
+
+    T* _pointer;
+};
+
+//-------------------------------------------------------------------------------------
+// This code is lifted from DirectXTex http://directxtex.codeplex.com/
+// If you need additional DXGI format support, see DirectXTexConvert.cpp
+//-------------------------------------------------------------------------------------
+#define LOAD_SCANLINE( type, func )\
+    if ( size >= sizeof(type) )\
+    {\
+        const type * __restrict sPtr = reinterpret_cast<const type*>(pSource);\
+        for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\
+        {\
+            if ( dPtr >= ePtr ) break;\
+            *(dPtr++) = func( sPtr++ );\
+        }\
+        return true;\
+    }\
+    return false;
+
+#define LOAD_SCANLINE3( type, func, defvec )\
+    if ( size >= sizeof(type) )\
+    {\
+        const type * __restrict sPtr = reinterpret_cast<const type*>(pSource);\
+        for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\
+        {\
+            XMVECTOR v = func( sPtr++ );\
+            if ( dPtr >= ePtr ) break;\
+            *(dPtr++) = XMVectorSelect( defvec, v, g_XMSelect1110 );\
+        }\
+        return true;\
+    }\
+    return false;
+
+#define LOAD_SCANLINE2( type, func, defvec )\
+    if ( size >= sizeof(type) )\
+    {\
+        const type * __restrict sPtr = reinterpret_cast<const type*>(pSource);\
+        for( size_t icount = 0; icount < ( size - sizeof(type) + 1 ); icount += sizeof(type) )\
+        {\
+            XMVECTOR v = func( sPtr++ );\
+            if ( dPtr >= ePtr ) break;\
+            *(dPtr++) = XMVectorSelect( defvec, v, g_XMSelect1100 );\
+        }\
+        return true;\
+    }\
+    return false;
+
+#pragma warning(push)
+#pragma warning(disable : 6101)
+_Success_(return)
+static bool _LoadScanline( _Out_writes_(count) DirectX::XMVECTOR* pDestination, _In_ size_t count,
+                           _In_reads_bytes_(size) LPCVOID pSource, _In_ size_t size, _In_ DXGI_FORMAT format )
+{
+    assert( pDestination && count > 0 && (((uintptr_t)pDestination & 0xF) == 0) );
+    assert( pSource && size > 0 );
+
+    using namespace DirectX;
+    using namespace DirectX::PackedVector;
+
+    XMVECTOR* __restrict dPtr = pDestination;
+    if ( !dPtr )
+        return false;
+
+    const XMVECTOR* ePtr = pDestination + count;
+
+    switch( format )
+    {
+    case DXGI_FORMAT_R32G32B32A32_FLOAT:
+        {
+            size_t msize = (size > (sizeof(XMVECTOR)*count)) ? (sizeof(XMVECTOR)*count) : size;
+            memcpy_s( dPtr, sizeof(XMVECTOR)*count, pSource, msize );
+        }
+        return true;
+
+    case DXGI_FORMAT_R32G32B32_FLOAT:
+        LOAD_SCANLINE3( XMFLOAT3, XMLoadFloat3, g_XMIdentityR3 )
+
+    case DXGI_FORMAT_R16G16B16A16_FLOAT:
+        LOAD_SCANLINE( XMHALF4, XMLoadHalf4 )
+
+    case DXGI_FORMAT_R32G32_FLOAT:
+        LOAD_SCANLINE2( XMFLOAT2, XMLoadFloat2, g_XMIdentityR3 )
+
+    case DXGI_FORMAT_R11G11B10_FLOAT:
+        LOAD_SCANLINE3( XMFLOAT3PK, XMLoadFloat3PK, g_XMIdentityR3 );
+
+    case DXGI_FORMAT_R16G16_FLOAT:
+        LOAD_SCANLINE2( XMHALF2, XMLoadHalf2, g_XMIdentityR3 )
+
+    case DXGI_FORMAT_R32_FLOAT:
+        if ( size >= sizeof(float) )
+        {
+            const float* __restrict sPtr = reinterpret_cast<const float*>(pSource);
+            for( size_t icount = 0; icount < size; icount += sizeof(float) )
+            {
+                XMVECTOR v = XMLoadFloat( sPtr++ );
+                if ( dPtr >= ePtr ) break;
+                *(dPtr++) = XMVectorSelect( g_XMIdentityR3, v, g_XMSelect1000 );
+            }
+            return true;
+        }
+        return false;
+
+    case DXGI_FORMAT_R16_FLOAT:
+        if ( size >= sizeof(HALF) )
+        {
+            const HALF * __restrict sPtr = reinterpret_cast<const HALF*>(pSource);
+            for( size_t icount = 0; icount < size; icount += sizeof(HALF) )
+            {
+                if ( dPtr >= ePtr ) break;
+                *(dPtr++) = XMVectorSet( XMConvertHalfToFloat(*sPtr++), 0.f, 0.f, 1.f );
+            }
+            return true;
+        }
+        return false;
+
+    default:
+        return false;
+    }
+}
+#pragma warning(pop)
+
+}; // namespace anonymous
+
+namespace DirectX
+{
+
+//-------------------------------------------------------------------------------------
+// Projects a function represented in a cube map into spherical harmonics.
+//
+// http://msdn.microsoft.com/en-us/library/windows/desktop/ff476300.aspx
+//-------------------------------------------------------------------------------------
+HRESULT SHProjectCubeMap( _In_ ID3D11DeviceContext *context,
+                          _In_ size_t order,
+                          _In_ ID3D11Texture2D *cubeMap,
+                          _Out_writes_opt_(order*order) float *resultR,
+                          _Out_writes_opt_(order*order) float *resultG,
+                          _Out_writes_opt_(order*order) float* resultB )
+{
+    if ( !context || !cubeMap )
+        return E_INVALIDARG;
+
+    if ( order < XM_SH_MINORDER || order > XM_SH_MAXORDER )
+        return E_INVALIDARG;
+
+    D3D11_TEXTURE2D_DESC desc;
+    cubeMap->GetDesc( &desc );
+
+    if ( (desc.ArraySize != 6)
+         || (desc.Width != desc.Height)
+         || (desc.SampleDesc.Count > 1) )
+        return E_FAIL;
+
+    switch( desc.Format )
+    {
+    case DXGI_FORMAT_R32G32B32A32_FLOAT:
+    case DXGI_FORMAT_R32G32B32_FLOAT:
+    case DXGI_FORMAT_R16G16B16A16_FLOAT:
+    case DXGI_FORMAT_R32G32_FLOAT:
+    case DXGI_FORMAT_R11G11B10_FLOAT:
+    case DXGI_FORMAT_R16G16_FLOAT:
+    case DXGI_FORMAT_R32_FLOAT:
+    case DXGI_FORMAT_R16_FLOAT:
+        // See _LoadScanline to support more pixel formats
+        break;
+
+    default:
+        return E_FAIL;
+    }
+
+    //--- Create a staging resource copy (if needed) to be able to read data
+    ID3D11Texture2D* texture = nullptr;
+
+    ScopedObject<ID3D11Texture2D> staging;
+    if ( !(desc.CPUAccessFlags & D3D11_CPU_ACCESS_READ) )
+    {
+        D3D11_TEXTURE2D_DESC sdesc = desc;
+        sdesc.BindFlags = 0;
+        sdesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
+        sdesc.Usage = D3D11_USAGE_STAGING;
+
+        ScopedObject<ID3D11Device> device;
+        context->GetDevice( &device );
+        assert( !device.IsNull() );
+
+        HRESULT hr = device->CreateTexture2D( &sdesc, nullptr, &staging );
+        if ( FAILED(hr) )
+            return hr;
+
+        context->CopyResource( staging.Get(), cubeMap );
+
+        texture = staging.Get();
+    }
+    else
+        texture = cubeMap;
+
+    assert( texture != 0 );
+
+    //--- Setup for SH projection
+    ScopedAlignedArrayXMVECTOR scanline( reinterpret_cast<XMVECTOR*>( _aligned_malloc( sizeof(XMVECTOR)*desc.Width, 16 ) ) );
+    if ( !scanline )
+        return E_OUTOFMEMORY;
+
+    assert( desc.Width > 0 );
+    float fSize = static_cast<float>( desc.Width );
+    float fPicSize = 1.0f / fSize;
+
+    // index from [0,W-1], f(0) maps to -1 + 1/W, f(W-1) maps to 1 - 1/W
+    // linear function x*S + B, 1st constraint means B is (-1+1/W); plug into the
+    // second and solve for S: S = 2*(1-1/W)/(W-1). The old code that did
+    // this was incorrect - but only for computing the differential solid
+    // angle, where the final value was 1.0 instead of 1-1/W...
+
+    float fB = -1.0f + 1.0f/fSize;
+    float fS = ( desc.Width > 1 ) ? (2.0f*(1.0f-1.0f/fSize)/(fSize-1.0f)) : 0.f;
+
+    // clear out accumulation variables
+    float fWt = 0.0f;
+
+    if ( resultR )
+        memset( resultR, 0, sizeof(float)*order*order );
+    if ( resultG )
+        memset( resultG, 0, sizeof(float)*order*order );
+    if ( resultB )
+        memset( resultB, 0, sizeof(float)*order*order );
+
+    float shBuff[XM_SH_MAXORDER*XM_SH_MAXORDER];
+    float shBuffB[XM_SH_MAXORDER*XM_SH_MAXORDER];
+
+    //--- Process each face of the cubemap
+    for (UINT face=0; face < 6; ++face )
+    {
+        UINT dindex = D3D11CalcSubresource( 0, face, desc.MipLevels );
+
+        D3D11_MAPPED_SUBRESOURCE mapped;
+        HRESULT hr = context->Map( texture, dindex, D3D11_MAP_READ, 0, &mapped );
+        if ( FAILED(hr) )
+            return hr;
+
+        const uint8_t *pSrc = reinterpret_cast<const uint8_t*>(mapped.pData);
+        for( UINT y=0; y < desc.Height; ++y )
+        {
+            XMVECTOR* ptr = scanline.get();
+            if ( !_LoadScanline( ptr, desc.Width, pSrc, mapped.RowPitch, desc.Format ) )
+            {
+                context->Unmap( texture, dindex );
+                return E_FAIL;
+            }
+
+            const float fV = y*fS + fB;
+
+            XMVECTOR* pixel = ptr;
+            for( UINT x=0; x < desc.Width; ++x, ++pixel )
+            {
+                const float fU = x*fS + fB;
+
+                float ix, iy, iz;
+                switch( face )
+                {
+                case 0: // Positive X
+                    iz = 1.0f - (2.0f * (float)x + 1.0f) * fPicSize;
+                    iy = 1.0f - (2.0f * (float)y + 1.0f) * fPicSize;
+                    ix = 1.0f;
+                    break;
+
+                case 1: // Negative X
+                    iz = -1.0f + (2.0f * (float)x + 1.0f) * fPicSize;
+                    iy = 1.0f - (2.0f * (float)y + 1.0f) * fPicSize;
+                    ix = -1.0f;
+                    break;
+
+                case 2: // Positive Y
+                    iz = -1.0f + (2.0f * (float)y + 1.0f) * fPicSize;
+                    iy = 1.0f;
+                    ix = -1.0f + (2.0f * (float)x + 1.0f) * fPicSize;
+                    break;
+
+                case 3: // Negative Y
+                    iz = 1.0f - (2.0f * (float)y + 1.0f) * fPicSize;
+                    iy = -1.0f;
+                    ix = -1.0f + (2.0f * (float)x + 1.0f) * fPicSize;
+                    break;
+
+                case 4: // Positive Z
+                    iz = 1.0f;
+                    iy = 1.0f - (2.0f * (float)y + 1.0f) * fPicSize;
+                    ix = -1.0f + (2.0f * (float)x + 1.0f) * fPicSize;
+                    break;
+
+                case 5: // Negative Z
+                    iz = -1.0f;
+                    iy = 1.0f - (2.0f * (float)y + 1.0f) * fPicSize;
+                    ix = 1.0f - (2.0f * (float)x + 1.0f) * fPicSize;
+                    break;
+
+                default:
+                    ix = iy = iz = 0.f;
+                    assert(false);
+                    break;
+                }
+
+                XMVECTOR dir = XMVectorSet( ix, iy, iz, 0 );
+                dir = XMVector3Normalize( dir );
+
+                const float fDiffSolid = 4.0f/((1.0f + fU*fU + fV*fV)*sqrtf(1.0f + fU*fU + fV*fV));
+                fWt += fDiffSolid;
+
+                XMSHEvalDirection(shBuff,order,dir);
+
+                XMFLOAT3A clr;
+                XMStoreFloat3A( &clr, *pixel );
+
+                if ( resultR ) XMSHAdd(resultR,order,resultR, XMSHScale(shBuffB,order,shBuff,clr.x*fDiffSolid) );
+                if ( resultG ) XMSHAdd(resultG,order,resultG, XMSHScale(shBuffB,order,shBuff,clr.y*fDiffSolid) );
+                if ( resultB ) XMSHAdd(resultB,order,resultB, XMSHScale(shBuffB,order,shBuff,clr.z*fDiffSolid) );
+            }
+
+            pSrc += mapped.RowPitch;
+        }
+
+        context->Unmap( texture, dindex );
+    }
+
+    const float fNormProj = (4.0f*XM_PI)/fWt;
+
+    if ( resultR ) XMSHScale(resultR,order,resultR,fNormProj);
+    if ( resultG ) XMSHScale(resultG,order,resultG,fNormProj);
+    if ( resultB ) 
XMSHScale(resultB,order,resultB,fNormProj); + + return S_OK; +} + +}; // namespace DirectX diff --git a/XDSP/XDSP.h b/XDSP/XDSP.h index 28dfe8b..802386f 100644 --- a/XDSP/XDSP.h +++ b/XDSP/XDSP.h @@ -1,811 +1,811 @@ -//-------------------------------------------------------------------------------------- -// File: XDSP.h -// -// DirectXMath based Digital Signal Processing (DSP) functions for audio, -// primarily Fast Fourier Transform (FFT) -// -// All buffer parameters must be 16-byte aligned -// -// All FFT functions support only single-precision floating-point audio -// -// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF -// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A -// PARTICULAR PURPOSE. -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// http://go.microsoft.com/fwlink/?LinkID=615557 -//-------------------------------------------------------------------------------------- - -#pragma once - -#include -#include - -#pragma warning(push) -#pragma warning(disable : 4005 4668) -#include -#pragma warning(pop) - -#pragma warning(push) -#pragma warning(disable: 4328 4640 6001 6262) - -namespace XDSP -{ - #if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV) - #define XM_CALLCONV __fastcall - typedef const DirectX::XMVECTOR& HXMVECTOR; - typedef const DirectX::XMMATRIX& FXMMATRIX; - #endif - - typedef DirectX::XMVECTOR XMVECTOR; - typedef DirectX::FXMVECTOR FXMVECTOR; - typedef DirectX::GXMVECTOR GXMVECTOR; - typedef DirectX::CXMVECTOR CXMVECTOR; - - inline bool ISPOWEROF2(size_t n) { return ( ((n)&((n)-1)) == 0 && (n) != 0 ); } - - // Parallel multiplication of four complex numbers, assuming real and imaginary values are stored in separate vectors. - __forceinline void XM_CALLCONV vmulComplex (_Out_ XMVECTOR& rResult, _Out_ XMVECTOR& iResult, - _In_ FXMVECTOR r1, _In_ FXMVECTOR i1, _In_ FXMVECTOR r2, _In_ GXMVECTOR i2) - { - using namespace DirectX; - // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1) - XMVECTOR vi1i2 = XMVectorMultiply(i1, i2); - XMVECTOR vr1r2 = XMVectorMultiply(r1, r2); - XMVECTOR vr1i2 = XMVectorMultiply(r1, i2); - XMVECTOR vr2i1 = XMVectorMultiply(r2, i1); - rResult = XMVectorSubtract(vr1r2, vi1i2); // real: (r1*r2 - i1*i2) - iResult = XMVectorAdd(vr1i2, vr2i1); // imaginary: (r1*i2 + r2*i1) - } - - __forceinline void XM_CALLCONV vmulComplex (_Inout_ XMVECTOR& r1, _Inout_ XMVECTOR& i1, _In_ FXMVECTOR r2, _In_ FXMVECTOR i2) - { - using namespace DirectX; - // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1) - XMVECTOR vi1i2 = XMVectorMultiply(i1, i2); - XMVECTOR vr1r2 = XMVectorMultiply(r1, r2); - XMVECTOR vr1i2 = XMVectorMultiply(r1, i2); - XMVECTOR vr2i1 = XMVectorMultiply(r2, i1); - r1 = XMVectorSubtract(vr1r2, vi1i2); // real: (r1*r2 - i1*i2) - i1 = XMVectorAdd(vr1i2, vr2i1); // imaginary: (r1*i2 + r2*i1) - } - - //---------------------------------------------------------------------------------- - // Radix-4 decimation-in-time FFT butterfly. - // This version assumes that all four elements of the butterfly are - // adjacent in a single vector. - // - // Compute the product of the complex input vector and the - // 4-element DFT matrix: - // | 1 1 1 1 | | (r1X,i1X) | - // | 1 -j -1 j | | (r1Y,i1Y) | - // | 1 -1 1 -1 | | (r1Z,i1Z) | - // | 1 j -1 -j | | (r1W,i1W) | - // - // This matrix can be decomposed into two simpler ones to reduce the - // number of additions needed. 
The decomposed matrices look like this: - // | 1 0 1 0 | | 1 0 1 0 | - // | 0 1 0 -j | | 1 0 -1 0 | - // | 1 0 -1 0 | | 0 1 0 1 | - // | 0 1 0 j | | 0 1 0 -1 | - // - // Combine as follows: - // | 1 0 1 0 | | (r1X,i1X) | | (r1X + r1Z, i1X + i1Z) | - // Temp = | 1 0 -1 0 | * | (r1Y,i1Y) | = | (r1X - r1Z, i1X - i1Z) | - // | 0 1 0 1 | | (r1Z,i1Z) | | (r1Y + r1W, i1Y + i1W) | - // | 0 1 0 -1 | | (r1W,i1W) | | (r1Y - r1W, i1Y - i1W) | - // - // | 1 0 1 0 | | (rTempX,iTempX) | | (rTempX + rTempZ, iTempX + iTempZ) | - // Result = | 0 1 0 -j | * | (rTempY,iTempY) | = | (rTempY + iTempW, iTempY - rTempW) | - // | 1 0 -1 0 | | (rTempZ,iTempZ) | | (rTempX - rTempZ, iTempX - iTempZ) | - // | 0 1 0 j | | (rTempW,iTempW) | | (rTempY - iTempW, iTempY + rTempW) | - //---------------------------------------------------------------------------------- - __forceinline void ButterflyDIT4_1 (_Inout_ XMVECTOR& r1, _Inout_ XMVECTOR& i1) - { - using namespace DirectX; - - // sign constants for radix-4 butterflies - const static XMVECTORF32 vDFT4SignBits1 = { 1.0f, -1.0f, 1.0f, -1.0f }; - const static XMVECTORF32 vDFT4SignBits2 = { 1.0f, 1.0f, -1.0f, -1.0f }; - const static XMVECTORF32 vDFT4SignBits3 = { 1.0f, -1.0f, -1.0f, 1.0f }; - - // calculating Temp - // [r1X| r1X|r1Y| r1Y] + [r1Z|-r1Z|r1W|-r1W] - // [i1X| i1X|i1Y| i1Y] + [i1Z|-i1Z|i1W|-i1W] - XMVECTOR r1L = XMVectorSwizzle<0,0,1,1>( r1 ); - XMVECTOR r1H = XMVectorSwizzle<2,2,3,3>( r1 ); - - XMVECTOR i1L = XMVectorSwizzle<0,0,1,1>( i1 ); - XMVECTOR i1H = XMVectorSwizzle<2,2,3,3>( i1 ); - - XMVECTOR rTemp = XMVectorMultiplyAdd( r1H, vDFT4SignBits1, r1L ); - XMVECTOR iTemp = XMVectorMultiplyAdd( i1H, vDFT4SignBits1, i1L ); - - // calculating Result - XMVECTOR rZrWiZiW = XMVectorPermute<2,3,6,7>(rTemp,iTemp); // [rTempZ|rTempW|iTempZ|iTempW] - XMVECTOR rZiWrZiW = XMVectorSwizzle<0,3,0,3>(rZrWiZiW); // [rTempZ|iTempW|rTempZ|iTempW] - XMVECTOR iZrWiZrW = XMVectorSwizzle<2,1,2,1>(rZrWiZiW); // [rTempZ|iTempW|rTempZ|iTempW] - - // [rTempX| rTempY| rTempX| rTempY] + [rTempZ| iTempW|-rTempZ|-iTempW] - // [iTempX| iTempY| iTempX| iTempY] + // [iTempZ|-rTempW|-iTempZ| rTempW] - XMVECTOR rTempL = XMVectorSwizzle<0,1,0,1>(rTemp); - XMVECTOR iTempL = XMVectorSwizzle<0,1,0,1>(iTemp); - - r1 = XMVectorMultiplyAdd( rZiWrZiW, vDFT4SignBits2, rTempL ); - i1 = XMVectorMultiplyAdd( iZrWiZrW, vDFT4SignBits3, iTempL ); - } - - //---------------------------------------------------------------------------------- - // Radix-4 decimation-in-time FFT butterfly. - // This version assumes that elements of the butterfly are - // in different vectors, so that each vector in the input - // contains elements from four different butterflies. - // The four separate butterflies are processed in parallel. - // - // The calculations here are the same as the ones in the single-vector - // radix-4 DFT, but instead of being done on a single vector (X,Y,Z,W) - // they are done in parallel on sixteen independent complex values. 
- // There is no interdependence between the vector elements: - // | 1 0 1 0 | | (rIn0,iIn0) | | (rIn0 + rIn2, iIn0 + iIn2) | - // | 1 0 -1 0 | * | (rIn1,iIn1) | = Temp = | (rIn0 - rIn2, iIn0 - iIn2) | - // | 0 1 0 1 | | (rIn2,iIn2) | | (rIn1 + rIn3, iIn1 + iIn3) | - // | 0 1 0 -1 | | (rIn3,iIn3) | | (rIn1 - rIn3, iIn1 - iIn3) | - // - // | 1 0 1 0 | | (rTemp0,iTemp0) | | (rTemp0 + rTemp2, iTemp0 + iTemp2) | - // Result = | 0 1 0 -j | * | (rTemp1,iTemp1) | = | (rTemp1 + iTemp3, iTemp1 - rTemp3) | - // | 1 0 -1 0 | | (rTemp2,iTemp2) | | (rTemp0 - rTemp2, iTemp0 - iTemp2) | - // | 0 1 0 j | | (rTemp3,iTemp3) | | (rTemp1 - iTemp3, iTemp1 + rTemp3) | - //---------------------------------------------------------------------------------- - __forceinline void ButterflyDIT4_4 (_Inout_ XMVECTOR& r0, - _Inout_ XMVECTOR& r1, - _Inout_ XMVECTOR& r2, - _Inout_ XMVECTOR& r3, - _Inout_ XMVECTOR& i0, - _Inout_ XMVECTOR& i1, - _Inout_ XMVECTOR& i2, - _Inout_ XMVECTOR& i3, - _In_reads_(uStride*4) const XMVECTOR* __restrict pUnityTableReal, - _In_reads_(uStride*4) const XMVECTOR* __restrict pUnityTableImaginary, - _In_ size_t uStride, - _In_ const bool fLast) - { - using namespace DirectX; - - assert(pUnityTableReal); - assert(pUnityTableImaginary); - assert((uintptr_t)pUnityTableReal % 16 == 0); - assert((uintptr_t)pUnityTableImaginary % 16 == 0); - assert(ISPOWEROF2(uStride)); - - // calculating Temp - XMVECTOR rTemp0 = XMVectorAdd(r0, r2); - XMVECTOR iTemp0 = XMVectorAdd(i0, i2); - - XMVECTOR rTemp2 = XMVectorAdd(r1, r3); - XMVECTOR iTemp2 = XMVectorAdd(i1, i3); - - XMVECTOR rTemp1 = XMVectorSubtract(r0, r2); - XMVECTOR iTemp1 = XMVectorSubtract(i0, i2); - - XMVECTOR rTemp3 = XMVectorSubtract(r1, r3); - XMVECTOR iTemp3 = XMVectorSubtract(i1, i3); - - XMVECTOR rTemp4 = XMVectorAdd(rTemp0, rTemp2); - XMVECTOR iTemp4 = XMVectorAdd(iTemp0, iTemp2); - - XMVECTOR rTemp5 = XMVectorAdd(rTemp1, iTemp3); - XMVECTOR iTemp5 = XMVectorSubtract(iTemp1, rTemp3); - - XMVECTOR rTemp6 = XMVectorSubtract(rTemp0, rTemp2); - XMVECTOR iTemp6 = XMVectorSubtract(iTemp0, iTemp2); - - XMVECTOR rTemp7 = XMVectorSubtract(rTemp1, iTemp3); - XMVECTOR iTemp7 = XMVectorAdd(iTemp1, rTemp3); - - // calculating Result - // vmulComplex(rTemp0, iTemp0, rTemp0, iTemp0, pUnityTableReal[0], pUnityTableImaginary[0]); // first one is always trivial - vmulComplex(rTemp5, iTemp5, pUnityTableReal[uStride], pUnityTableImaginary[uStride]); - vmulComplex(rTemp6, iTemp6, pUnityTableReal[uStride*2], pUnityTableImaginary[uStride*2]); - vmulComplex(rTemp7, iTemp7, pUnityTableReal[uStride*3], pUnityTableImaginary[uStride*3]); - - if (fLast) - { - ButterflyDIT4_1(rTemp4, iTemp4); - ButterflyDIT4_1(rTemp5, iTemp5); - ButterflyDIT4_1(rTemp6, iTemp6); - ButterflyDIT4_1(rTemp7, iTemp7); - } - - r0 = rTemp4; i0 = iTemp4; - r1 = rTemp5; i1 = iTemp5; - r2 = rTemp6; i2 = iTemp6; - r3 = rTemp7; i3 = iTemp7; - } - - //================================================================================== - // F-U-N-C-T-I-O-N-S - //================================================================================== - - //---------------------------------------------------------------------------------- - // DESCRIPTION: - // 4-sample FFT. 
-    //
-    // PARAMETERS:
-    //  pReal - [inout] real components, must have at least uCount elements
-    //  pImaginary - [inout] imaginary components, must have at least uCount elements
-    //  uCount - [in] number of FFT iterations
-    //----------------------------------------------------------------------------------
-    __forceinline void FFT4 (_Inout_updates_(uCount) XMVECTOR* __restrict pReal,
-                             _Inout_updates_(uCount) XMVECTOR* __restrict pImaginary,
-                             _In_ const size_t uCount=1)
-    {
-        assert(pReal);
-        assert(pImaginary);
-        assert((uintptr_t)pReal % 16 == 0);
-        assert((uintptr_t)pImaginary % 16 == 0);
-        assert(ISPOWEROF2(uCount));
-
-        for (size_t uIndex=0; uIndex < uCount; ++uIndex)
-        {
-            ButterflyDIT4_1(pReal[uIndex], pImaginary[uIndex]);
-        }
-    }
-
-    //----------------------------------------------------------------------------------
-    // DESCRIPTION:
-    //  8-sample FFT.
-    //
-    // PARAMETERS:
-    //  pReal - [inout] real components, must have at least uCount*2 elements
-    //  pImaginary - [inout] imaginary components, must have at least uCount*2 elements
-    //  uCount - [in] number of FFT iterations
-    //----------------------------------------------------------------------------------
-    __forceinline void FFT8 (_Inout_updates_(uCount*2) XMVECTOR* __restrict pReal,
-                             _Inout_updates_(uCount*2) XMVECTOR* __restrict pImaginary,
-                             _In_ const size_t uCount=1)
-    {
-        using namespace DirectX;
-
-        assert(pReal);
-        assert(pImaginary);
-        assert((uintptr_t)pReal % 16 == 0);
-        assert((uintptr_t)pImaginary % 16 == 0);
-        assert(ISPOWEROF2(uCount));
-
-        static const XMVECTORF32 wr1 = {  1.0f,  0.70710677f,  0.0f, -0.70710677f };
-        static const XMVECTORF32 wi1 = {  0.0f, -0.70710677f, -1.0f, -0.70710677f };
-        static const XMVECTORF32 wr2 = { -1.0f, -0.70710677f,  0.0f,  0.70710677f };
-        static const XMVECTORF32 wi2 = {  0.0f,  0.70710677f,  1.0f,  0.70710677f };
-
-        for (size_t uIndex=0; uIndex < uCount; ++uIndex)
-        {
-            XMVECTOR* __restrict pR = pReal + uIndex*2;
-            XMVECTOR* __restrict pI = pImaginary + uIndex*2;
-
-            XMVECTOR oddsR  = XMVectorPermute<1,3,5,7>(pR[0], pR[1]);
-            XMVECTOR evensR = XMVectorPermute<0,2,4,6>(pR[0], pR[1]);
-            XMVECTOR oddsI  = XMVectorPermute<1,3,5,7>(pI[0], pI[1]);
-            XMVECTOR evensI = XMVectorPermute<0,2,4,6>(pI[0], pI[1]);
-            ButterflyDIT4_1(oddsR, oddsI);
-            ButterflyDIT4_1(evensR, evensI);
-
-            XMVECTOR r, i;
-            vmulComplex(r, i, oddsR, oddsI, wr1, wi1);
-            pR[0] = XMVectorAdd(evensR, r);
-            pI[0] = XMVectorAdd(evensI, i);
-
-            vmulComplex(r, i, oddsR, oddsI, wr2, wi2);
-            pR[1] = XMVectorAdd(evensR, r);
-            pI[1] = XMVectorAdd(evensI, i);
-        }
-    }
-
-    //----------------------------------------------------------------------------------
-    // DESCRIPTION:
-    //  16-sample FFT.
-    //
-    // PARAMETERS:
-    //  pReal - [inout] real components, must have at least uCount*4 elements
-    //  pImaginary - [inout] imaginary components, must have at least uCount*4 elements
-    //  uCount - [in] number of FFT iterations
-    //----------------------------------------------------------------------------------
-    __forceinline void FFT16 (_Inout_updates_(uCount*4) XMVECTOR* __restrict pReal,
-                              _Inout_updates_(uCount*4) XMVECTOR* __restrict pImaginary,
-                              _In_ const size_t uCount=1)
-    {
-        using namespace DirectX;
-
-        assert(pReal);
-        assert(pImaginary);
-        assert((uintptr_t)pReal % 16 == 0);
-        assert((uintptr_t)pImaginary % 16 == 0);
-        assert(ISPOWEROF2(uCount));
-
-        static const XMVECTORF32 aUnityTableReal[4]      = { {  1.0f,  1.0f,         1.0f,             1.0f         },
-                                                             {  1.0f,  0.92387950f,  0.70710677f,      0.38268343f  },
-                                                             {  1.0f,  0.70710677f, -4.3711388e-008f, -0.70710677f  },
-                                                             {  1.0f,  0.38268343f, -0.70710677f,     -0.92387950f  } };
-        static const XMVECTORF32 aUnityTableImaginary[4] = { { -0.0f, -0.0f,         -0.0f,            -0.0f        },
-                                                             { -0.0f, -0.38268343f,  -0.70710677f,     -0.92387950f },
-                                                             { -0.0f, -0.70710677f,  -1.0f,            -0.70710677f },
-                                                             { -0.0f, -0.92387950f,  -0.70710677f,      0.38268343f } };
-
-        for (size_t uIndex=0; uIndex < uCount; ++uIndex)
-        {
-            ButterflyDIT4_4(pReal[uIndex*4],
-                            pReal[uIndex*4 + 1],
-                            pReal[uIndex*4 + 2],
-                            pReal[uIndex*4 + 3],
-                            pImaginary[uIndex*4],
-                            pImaginary[uIndex*4 + 1],
-                            pImaginary[uIndex*4 + 2],
-                            pImaginary[uIndex*4 + 3],
-                            reinterpret_cast<const XMVECTOR*>(aUnityTableReal),
-                            reinterpret_cast<const XMVECTOR*>(aUnityTableImaginary),
-                            1, true);
-        }
-    }
-
-    //----------------------------------------------------------------------------------
-    // DESCRIPTION:
-    //  2^N-sample FFT.
-    //
-    // REMARKS:
-    //  For FFT lengths of 16 and below, call FFT16(), FFT8(), or FFT4() directly.
-    //
-    // PARAMETERS:
-    //  pReal - [inout] real components, must have at least (uLength*uCount)/4 elements
-    //  pImaginary - [inout] imaginary components, must have at least (uLength*uCount)/4 elements
-    //  pUnityTable - [in] unity table, must have at least uLength*uCount elements, see FFTInitializeUnityTable()
-    //  uLength - [in] FFT length in samples, must be a power of 2 > 16
-    //  uCount - [in] number of FFT iterations
-    //----------------------------------------------------------------------------------
-    inline void FFT (_Inout_updates_((uLength*uCount)/4) XMVECTOR* __restrict pReal,
-                     _Inout_updates_((uLength*uCount)/4) XMVECTOR* __restrict pImaginary,
-                     _In_reads_(uLength*uCount) const XMVECTOR* __restrict pUnityTable,
-                     _In_ const size_t uLength,
-                     _In_ const size_t uCount=1)
-    {
-        assert(pReal);
-        assert(pImaginary);
-        assert(pUnityTable);
-        assert((uintptr_t)pReal % 16 == 0);
-        assert((uintptr_t)pImaginary % 16 == 0);
-        assert((uintptr_t)pUnityTable % 16 == 0);
-        assert(uLength > 16);
-        _Analysis_assume_(uLength > 16);
-        assert(ISPOWEROF2(uLength));
-        assert(ISPOWEROF2(uCount));
-
-        const XMVECTOR* __restrict pUnityTableReal      = pUnityTable;
-        const XMVECTOR* __restrict pUnityTableImaginary = pUnityTable + (uLength>>2);
-        const size_t uTotal              = uCount * uLength;
-        const size_t uTotal_vectors      = uTotal >> 2;
-        const size_t uStage_vectors      = uLength >> 2;
-        const size_t uStage_vectors_mask = uStage_vectors - 1;
-        const size_t uStride             = uLength >> 4; // stride between butterfly elements
-        const size_t uStrideMask         = uStride - 1;
-        const size_t uStride2            = uStride * 2;
-        const size_t uStride3            = uStride * 3;
-        const size_t uStrideInvMask      = ~uStrideMask;
-
-        for (size_t uIndex=0; uIndex < (uTotal_vectors>>2); ++uIndex)
-        {
-            const size_t n = ((uIndex & uStrideInvMask) << 2) + (uIndex & uStrideMask);
-            ButterflyDIT4_4(pReal[n],
-                            pReal[n + uStride],
-                            pReal[n + uStride2],
-                            pReal[n + uStride3],
-                            pImaginary[n],
-                            pImaginary[n + uStride],
-                            pImaginary[n + uStride2],
-                            pImaginary[n + uStride3],
-                            pUnityTableReal + (n & uStage_vectors_mask),
-                            pUnityTableImaginary + (n & uStage_vectors_mask),
-                            uStride, false);
-        }
-
-        if (uLength > 16*4)
-        {
-            FFT(pReal, pImaginary, pUnityTable+(uLength>>1), uLength>>2, uCount*4);
-        }
-        else if (uLength == 16*4)
-        {
-            FFT16(pReal, pImaginary, uCount*4);
-        }
-        else if (uLength == 8*4)
-        {
-            FFT8(pReal, pImaginary, uCount*4);
-        }
-        else if (uLength == 4*4)
-        {
-            FFT4(pReal, pImaginary, uCount*4);
-        }
-    }
-
-    //----------------------------------------------------------------------------------
-    // DESCRIPTION:
-    //  Initializes the unity roots lookup table used by the FFT functions.
-    //  Once initialized, the table need not be initialized again unless a
-    //  different FFT length is desired.
-    //
-    // REMARKS:
-    //  The unity tables of FFT length 16 and below are hard coded into the
-    //  respective FFT functions and so need not be initialized.
-    //
-    // PARAMETERS:
-    //  pUnityTable - [out] unity table, receives the unity roots lookup table, must have at least uLength elements
-    //  uLength - [in] FFT length in frames, must be a power of 2 > 16
-    //----------------------------------------------------------------------------------
-    inline void FFTInitializeUnityTable (_Out_writes_(uLength) XMVECTOR* __restrict pUnityTable, _In_ size_t uLength)
-    {
-        assert(pUnityTable);
-        assert(uLength > 16);
-        _Analysis_assume_(uLength > 16);
-        assert(ISPOWEROF2(uLength));
-
-        float* __restrict pfUnityTable = reinterpret_cast<float* __restrict>(pUnityTable);
-
-        // initialize unity table for recursive FFT lengths: uLength, uLength/4, uLength/16, ... > 16
-        do
-        {
-            float flStep = 6.283185307f / uLength; // 2PI / FFT length
-            uLength >>= 2;
-
-            // pUnityTable[0 to uLength*4-1] contains real components for the current FFT length
-            // pUnityTable[uLength*4 to uLength*8-1] contains imaginary components for the current FFT length
-            for (size_t i=0; i<4; ++i)
-            {
-                for (size_t j=0; j<uLength; ++j)
-                {
-                    size_t uIndex = (i*uLength) + j;
-                    pfUnityTable[uIndex]             =  cosf(float(i)*float(j)*flStep); // real component
-                    pfUnityTable[uIndex + uLength*4] = -sinf(float(i)*float(j)*flStep); // imaginary component
-                }
-            }
-            pfUnityTable += uLength*8;
-        }
-        while (uLength > 16);
-    }
-
-    //----------------------------------------------------------------------------------
-    // DESCRIPTION:
-    //  The FFT functions generate output in bit-reversed order.
-    //  Use this function to re-arrange them into order of increasing frequency.
-    //
-    // PARAMETERS:
-    //  pOutput - [out] output buffer, receives samples in order of increasing frequency, cannot overlap pInput, must have at least (1<<uLog2Length)/4 elements
-    //  pInput - [in] input buffer, cannot overlap pOutput, must have at least (1<<uLog2Length)/4 elements
-    //  uLog2Length - [in] LOG (base 2) of FFT length in samples, must be >= 2
-    //----------------------------------------------------------------------------------
-    inline void FFTUnswizzle (_Out_writes_((1<<uLog2Length)/4) XMVECTOR* __restrict pOutput,
-                              _In_reads_((1<<uLog2Length)/4) const XMVECTOR* __restrict pInput,
-                              _In_ const size_t uLog2Length)
-    {
-        assert(pOutput);
-        assert(pInput);
-        assert(uLog2Length >= 2);
-        _Analysis_assume_(uLog2Length >= 2);
-
-        float* __restrict pfOutput      = (float* __restrict)pOutput;
-        const float* __restrict pfInput = (const float* __restrict)pInput;
-        const size_t uLength = size_t(1) << uLog2Length;
-
-        if ((uLog2Length & 0x1) == 0)
-        {
-            // even powers of two
-            for (size_t uIndex=0; uIndex < uLength; ++uIndex)
-            {
-                size_t n = uIndex;
-                n = ( (n & 0xcccccccc) >> 2 )  | ( (n & 0x33333333) << 2 );
-                n = ( (n & 0xf0f0f0f0) >> 4 )  | ( (n & 0x0f0f0f0f) << 4 );
-                n = ( (n & 0xff00ff00) >> 8 )  | ( (n & 0x00ff00ff) << 8 );
-                n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 );
-                n >>= (32 - uLog2Length);
-                pfOutput[n] = pfInput[uIndex];
-            }
-        }
-        else
-        {
-            // odd powers of two
-            for (size_t uIndex=0; uIndex < uLength; ++uIndex)
-            {
-                size_t n = (uIndex>>3);
-                n = ( (n & 0xcccccccc) >> 2 )  | ( (n & 0x33333333) << 2 );
-                n = ( (n & 0xf0f0f0f0) >> 4 )  | ( (n & 0x0f0f0f0f) << 4 );
-                n = ( (n & 0xff00ff00) >> 8 )  | ( (n & 0x00ff00ff) << 8 );
-                n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 );
-                n >>= (32 - (uLog2Length-3));
-                n |= ((uIndex & 0x7) << (uLog2Length - 3));
-                pfOutput[n] = pfInput[uIndex];
-            }
-        }
-    }
-
-    //----------------------------------------------------------------------------------
-    // DESCRIPTION:
-    //  Convert complex components to polar form.
-    //
-    // PARAMETERS:
-    //  pOutput - [out] output buffer, receives samples in polar form, must have at least uLength/4 elements
-    //  pInputReal - [in] input buffer (real components), must have at least uLength/4 elements
-    //  pInputImaginary - [in] input buffer (imaginary components), must have at least uLength/4 elements
-    //  uLength - [in] FFT length in samples, must be a power of 2 >= 4
-    //----------------------------------------------------------------------------------
-#pragma warning(suppress: 6101)
-    inline void FFTPolar (_Out_writes_(uLength/4) XMVECTOR* __restrict pOutput,
-                          _In_reads_(uLength/4) const XMVECTOR* __restrict pInputReal,
-                          _In_reads_(uLength/4) const XMVECTOR* __restrict pInputImaginary,
-                          _In_ const size_t uLength)
-    {
-        using namespace DirectX;
-
-        assert(pOutput);
-        assert(pInputReal);
-        assert(pInputImaginary);
-        assert(uLength >= 4);
-        _Analysis_assume_(uLength >= 4);
-        assert(ISPOWEROF2(uLength));
-
-        float flOneOverLength = 1.0f / uLength;
-
-        // result = sqrtf((real/uLength)^2 + (imaginary/uLength)^2) * 2
-        XMVECTOR vOneOverLength = XMVectorReplicate( flOneOverLength );
-
-        for (size_t uIndex=0; uIndex < (uLength>>2); ++uIndex)
-        {
-            XMVECTOR vReal      = XMVectorMultiply(pInputReal[uIndex], vOneOverLength);
-            XMVECTOR vImaginary = XMVectorMultiply(pInputImaginary[uIndex], vOneOverLength);
-            XMVECTOR vRR        = XMVectorMultiply(vReal, vReal);
-            XMVECTOR vII        = XMVectorMultiply(vImaginary, vImaginary);
-            XMVECTOR vRRplusII  = XMVectorAdd(vRR, vII);
-            XMVECTOR vTotal     = XMVectorSqrt(vRRplusII);
-            pOutput[uIndex]     = XMVectorAdd(vTotal, vTotal);
-        }
-    }
-
-    //----------------------------------------------------------------------------------
-    // DESCRIPTION:
-    //  Deinterleaves audio samples.
-    //
-    // REMARKS:
-    //  For example, audio of the form [LRLRLR] becomes [LLLRRR].
-    //
-    // PARAMETERS:
-    //  pOutput - [out] output buffer, receives samples in deinterleaved form, cannot overlap pInput, must have at least (uChannelCount*uFrameCount)/4 elements
-    //  pInput - [in] input buffer, cannot overlap pOutput, must have at least (uChannelCount*uFrameCount)/4 elements
-    //  uChannelCount - [in] number of channels, must be > 1
-    //  uFrameCount - [in] number of frames of valid data, must be > 0
-    //----------------------------------------------------------------------------------
-    inline void Deinterleave (_Out_writes_((uChannelCount*uFrameCount)/4) XMVECTOR* __restrict pOutput,
-                              _In_reads_((uChannelCount*uFrameCount)/4) const XMVECTOR* __restrict pInput,
-                              _In_ const size_t uChannelCount,
-                              _In_ const size_t uFrameCount)
-    {
-        assert(pOutput);
-        assert(pInput);
-        assert(uChannelCount > 1);
-        assert(uFrameCount > 0);
-
-        float* __restrict pfOutput      = reinterpret_cast<float* __restrict>(pOutput);
-        const float* __restrict pfInput = reinterpret_cast<const float* __restrict>(pInput);
-
-        for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-        {
-            for (size_t uFrame=0; uFrame < uFrameCount; ++uFrame)
-            {
-                pfOutput[uChannel * uFrameCount + uFrame] = pfInput[uFrame * uChannelCount + uChannel];
-            }
-        }
-    }
-
-    //----------------------------------------------------------------------------------
-    // DESCRIPTION:
-    //  Interleaves audio samples.
-    //
-    // REMARKS:
-    //  For example, audio of the form [LLLRRR] becomes [LRLRLR].
-    //
-    // PARAMETERS:
-    //  pOutput - [out] output buffer, receives samples in interleaved form, cannot overlap pInput, must have at least (uChannelCount*uFrameCount)/4 elements
-    //  pInput - [in] input buffer, cannot overlap pOutput, must have at least (uChannelCount*uFrameCount)/4 elements
-    //  uChannelCount - [in] number of channels, must be > 1
-    //  uFrameCount - [in] number of frames of valid data, must be > 0
-    //----------------------------------------------------------------------------------
-    inline void Interleave (_Out_writes_((uChannelCount*uFrameCount)/4) XMVECTOR* __restrict pOutput,
-                            _In_reads_((uChannelCount*uFrameCount)/4) const XMVECTOR* __restrict pInput,
-                            _In_ const size_t uChannelCount,
-                            _In_ const size_t uFrameCount)
-    {
-        assert(pOutput);
-        assert(pInput);
-        assert(uChannelCount > 1);
-        assert(uFrameCount > 0);
-
-        float* __restrict pfOutput      = reinterpret_cast<float* __restrict>(pOutput);
-        const float* __restrict pfInput = reinterpret_cast<const float* __restrict>(pInput);
-
-        for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-        {
-            for (size_t uFrame=0; uFrame < uFrameCount; ++uFrame)
-            {
-                pfOutput[uFrame * uChannelCount + uChannel] = pfInput[uChannel * uFrameCount + uFrame];
-            }
-        }
-    }
-
-    //----------------------------------------------------------------------------------
-    // DESCRIPTION:
-    //  This function applies a 2^N-sample FFT and unswizzles the result such
-    //  that the samples are in order of increasing frequency.
-    //  Audio is first deinterleaved if multichannel.
-    //
-    // PARAMETERS:
-    //  pReal - [inout] real components, must have at least (1<<uLog2Length)*uChannelCount/4 elements
-    //  pImaginary - [out] imaginary components, must have at least (1<<uLog2Length)*uChannelCount/4 elements
-    //  pUnityTable - [in] unity table, must have at least (1<<uLog2Length) elements, see FFTInitializeUnityTable()
-    //  uChannelCount - [in] number of channels, must be within [1, 6]
-    //  uLog2Length - [in] LOG (base 2) of FFT length in frames, must be within [2, 9]
-    //----------------------------------------------------------------------------------
-    inline void FFTInterleaved (_Inout_updates_(((1<<uLog2Length)*uChannelCount)/4) XMVECTOR* __restrict pReal,
-                                _Out_writes_(((1<<uLog2Length)*uChannelCount)/4) XMVECTOR* __restrict pImaginary,
-                                _In_reads_(1<<uLog2Length) const XMVECTOR* __restrict pUnityTable,
-                                _In_ const size_t uChannelCount,
-                                _In_ const size_t uLog2Length)
-    {
-        assert(pReal);
-        assert(pImaginary);
-        assert(pUnityTable);
-        assert((uintptr_t)pReal % 16 == 0);
-        assert((uintptr_t)pImaginary % 16 == 0);
-        assert((uintptr_t)pUnityTable % 16 == 0);
-        assert(uChannelCount > 0 && uChannelCount <= 6);
-        assert(uLog2Length >= 2 && uLog2Length <= 9);
-
-        XMVECTOR vRealTemp[768];
-        XMVECTOR vImaginaryTemp[768];
-        const size_t uLength = size_t(1) << uLog2Length;
-
-        if (uChannelCount > 1)
-        {
-            Deinterleave(vRealTemp, pReal, uChannelCount, uLength);
-        }
-        else
-        {
-            memcpy_s(vRealTemp, sizeof(vRealTemp), pReal, (uLength>>2)*sizeof(XMVECTOR));
-        }
-
-        memset( vImaginaryTemp, 0, (uChannelCount*(uLength>>2)) * sizeof(XMVECTOR) );
-
-        if (uLength > 16)
-        {
-            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-            {
-                FFT(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)], pUnityTable, uLength);
-            }
-        }
-        else if (uLength == 16)
-        {
-            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-            {
-                FFT16(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
-            }
-        }
-        else if (uLength == 8)
-        {
-            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-            {
-                FFT8(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
-            }
-        }
-        else if (uLength == 4)
-        {
-            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-            {
-                FFT4(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
-            }
-        }
-
-        for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-        {
-            FFTUnswizzle(&pReal[uChannel*(uLength>>2)], &vRealTemp[uChannel*(uLength>>2)], uLog2Length);
-            FFTUnswizzle(&pImaginary[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)], uLog2Length);
-        }
-    }
-
-    //----------------------------------------------------------------------------------
-    // DESCRIPTION:
-    //  This function applies a 2^N-sample inverse FFT.
-    //  Audio is interleaved if multichannel.
-    //
-    // PARAMETERS:
-    //  pReal - [inout] real components, must have at least (1<<uLog2Length)*uChannelCount/4 elements
-    //  pImaginary - [in] imaginary components, must have at least (1<<uLog2Length)*uChannelCount/4 elements
-    //  pUnityTable - [in] unity table, must have at least (1<<uLog2Length) elements, see FFTInitializeUnityTable()
-    //  uChannelCount - [in] number of channels, must be > 0
-    //  uLog2Length - [in] LOG (base 2) of FFT length in frames, must be within [2, 9]
-    //----------------------------------------------------------------------------------
-    inline void IFFTDeinterleaved (_Inout_updates_(((1<<uLog2Length)*uChannelCount)/4) XMVECTOR* __restrict pReal,
-                                   _In_reads_(((1<<uLog2Length)*uChannelCount)/4) const XMVECTOR* __restrict pImaginary,
-                                   _In_reads_(1<<uLog2Length) const XMVECTOR* __restrict pUnityTable,
-                                   _In_ const size_t uChannelCount,
-                                   _In_ const size_t uLog2Length)
-    {
-        using namespace DirectX;
-
-        assert(pReal);
-        assert(pImaginary);
-        assert(pUnityTable);
-        assert((uintptr_t)pReal % 16 == 0);
-        assert((uintptr_t)pImaginary % 16 == 0);
-        assert((uintptr_t)pUnityTable % 16 == 0);
-        assert(uChannelCount > 0 && uChannelCount <= 6);
-        _Analysis_assume_(uChannelCount > 0 && uChannelCount <= 6);
-        assert(uLog2Length >= 2 && uLog2Length <= 9);
-        _Analysis_assume_(uLog2Length >= 2 && uLog2Length <= 9);
-
-        XMVECTOR vRealTemp[768]      = { 0 };
-        XMVECTOR vImaginaryTemp[768] = { 0 };
-
-        const size_t uLength = size_t(1) << uLog2Length;
-
-        const XMVECTOR vRnp = XMVectorReplicate(1.0f/uLength);
-        const XMVECTOR vRnm = XMVectorReplicate(-1.0f/uLength);
-        for (size_t u=0; u < uChannelCount*(uLength>>2); u++)
-        {
-            vRealTemp[u]      = XMVectorMultiply(pReal[u], vRnp);
-            vImaginaryTemp[u] = XMVectorMultiply(pImaginary[u], vRnm);
-        }
-
-        if (uLength > 16)
-        {
-            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-            {
-                FFT(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)], pUnityTable, uLength);
-            }
-        }
-        else if (uLength == 16)
-        {
-            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-            {
-                FFT16(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
-            }
-        }
-        else if (uLength == 8)
-        {
-            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-            {
-                FFT8(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
-            }
-        }
-        else if (uLength == 4)
-        {
-            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-            {
-                FFT4(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
-            }
-        }
-
-        for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
-        {
-            FFTUnswizzle(&vImaginaryTemp[uChannel*(uLength>>2)], &vRealTemp[uChannel*(uLength>>2)], uLog2Length);
-        }
-
-        if (uChannelCount > 1)
-        {
-            Interleave(pReal, vImaginaryTemp, uChannelCount, uLength);
-        }
-        else
-        {
-            memcpy_s(pReal, uLength*uChannelCount*sizeof(float), vImaginaryTemp, (uLength>>2)*sizeof(XMVECTOR));
-        }
-    }
-
-}; // namespace XDSP
-
-#pragma warning(pop)
+//--------------------------------------------------------------------------------------
+// File: XDSP.h
+//
+// DirectXMath based Digital Signal Processing (DSP) functions for audio,
+// primarily Fast Fourier Transform (FFT)
+//
+// All buffer parameters must be 16-byte aligned
+//
+// All FFT functions support only single-precision floating-point audio
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+// http://go.microsoft.com/fwlink/?LinkID=615557
+//--------------------------------------------------------------------------------------
+
+#pragma once
+
+#include <DirectXMath.h>
+#include <math.h>
+
+#pragma warning(push)
+#pragma warning(disable : 4005 4668)
+#include <stdint.h>
+#pragma warning(pop)
+
+#pragma warning(push)
+#pragma warning(disable: 4328 4640 6001 6262)
+
+namespace XDSP
+{
+#if (DIRECTXMATH_VERSION < 305) && !defined(XM_CALLCONV)
+#define XM_CALLCONV __fastcall
+typedef const DirectX::XMVECTOR& HXMVECTOR;
+typedef const DirectX::XMMATRIX& FXMMATRIX;
+#endif
+
+    typedef DirectX::XMVECTOR  XMVECTOR;
+    typedef DirectX::FXMVECTOR FXMVECTOR;
+    typedef DirectX::GXMVECTOR GXMVECTOR;
+    typedef DirectX::CXMVECTOR CXMVECTOR;
+
+    inline bool ISPOWEROF2(size_t n) { return ( ((n)&((n)-1)) == 0 && (n) != 0 ); }
+
+    // Parallel multiplication of four complex numbers, assuming real and imaginary values are stored in separate vectors.
+    __forceinline void XM_CALLCONV vmulComplex (_Out_ XMVECTOR& rResult, _Out_ XMVECTOR& iResult,
+                                                _In_ FXMVECTOR r1, _In_ FXMVECTOR i1, _In_ FXMVECTOR r2, _In_ GXMVECTOR i2)
+    {
+        using namespace DirectX;
+        // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1)
+        XMVECTOR vi1i2 = XMVectorMultiply(i1, i2);
+        XMVECTOR vr1r2 = XMVectorMultiply(r1, r2);
+        XMVECTOR vr1i2 = XMVectorMultiply(r1, i2);
+        XMVECTOR vr2i1 = XMVectorMultiply(r2, i1);
+        rResult = XMVectorSubtract(vr1r2, vi1i2); // real: (r1*r2 - i1*i2)
+        iResult = XMVectorAdd(vr1i2, vr2i1);      // imaginary: (r1*i2 + r2*i1)
+    }
+
+    __forceinline void XM_CALLCONV vmulComplex (_Inout_ XMVECTOR& r1, _Inout_ XMVECTOR& i1, _In_ FXMVECTOR r2, _In_ FXMVECTOR i2)
+    {
+        using namespace DirectX;
+        // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1)
+        XMVECTOR vi1i2 = XMVectorMultiply(i1, i2);
+        XMVECTOR vr1r2 = XMVectorMultiply(r1, r2);
+        XMVECTOR vr1i2 = XMVectorMultiply(r1, i2);
+        XMVECTOR vr2i1 = XMVectorMultiply(r2, i1);
+        r1 = XMVectorSubtract(vr1r2, vi1i2); // real: (r1*r2 - i1*i2)
+        i1 = XMVectorAdd(vr1i2, vr2i1);      // imaginary: (r1*i2 + r2*i1)
+    }
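vmulComplex is a plain structure-of-arrays complex multiply. A minimal check of the identity (1+2i)(3+4i) = -5+10i across all four lanes, as a sketch assuming XDSP.h is on the include path (function and variable names are illustrative):

    #include <stdio.h>
    #include "XDSP.h"

    void TestVmulComplex()
    {
        using namespace DirectX;
        XMVECTOR r1 = XMVectorReplicate(1.0f), i1 = XMVectorReplicate(2.0f); // 1+2i in every lane
        XMVECTOR r2 = XMVectorReplicate(3.0f), i2 = XMVectorReplicate(4.0f); // 3+4i in every lane
        XMVECTOR rOut, iOut;
        XDSP::vmulComplex(rOut, iOut, r1, i1, r2, i2);
        // (1*3 - 2*4, 1*4 + 3*2) = (-5, 10)
        printf("%g %+gi\n", XMVectorGetX(rOut), XMVectorGetX(iOut)); // prints: -5 +10i
    }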
+    //----------------------------------------------------------------------------------
+    // Radix-4 decimation-in-time FFT butterfly.
+    // This version assumes that all four elements of the butterfly are
+    // adjacent in a single vector.
+    //
+    // Compute the product of the complex input vector and the
+    // 4-element DFT matrix:
+    //  | 1  1  1  1 |   | (r1X,i1X) |
+    //  | 1 -j -1  j |   | (r1Y,i1Y) |
+    //  | 1 -1  1 -1 |   | (r1Z,i1Z) |
+    //  | 1  j -1 -j |   | (r1W,i1W) |
+    //
+    // This matrix can be decomposed into two simpler ones to reduce the
+    // number of additions needed. The decomposed matrices look like this:
+    //  | 1  0  1  0 |   | 1  0  1  0 |
+    //  | 0  1  0 -j |   | 1  0 -1  0 |
+    //  | 1  0 -1  0 |   | 0  1  0  1 |
+    //  | 0  1  0  j |   | 0  1  0 -1 |
+    //
+    // Combine as follows:
+    //          | 1  0  1  0 |   | (r1X,i1X) |   | (r1X + r1Z, i1X + i1Z) |
+    //  Temp  = | 1  0 -1  0 | * | (r1Y,i1Y) | = | (r1X - r1Z, i1X - i1Z) |
+    //          | 0  1  0  1 |   | (r1Z,i1Z) |   | (r1Y + r1W, i1Y + i1W) |
+    //          | 0  1  0 -1 |   | (r1W,i1W) |   | (r1Y - r1W, i1Y - i1W) |
+    //
+    //            | 1  0  1  0 |   | (rTempX,iTempX) |   | (rTempX + rTempZ, iTempX + iTempZ) |
+    //  Result  = | 0  1  0 -j | * | (rTempY,iTempY) | = | (rTempY + iTempW, iTempY - rTempW) |
+    //            | 1  0 -1  0 |   | (rTempZ,iTempZ) |   | (rTempX - rTempZ, iTempX - iTempZ) |
+    //            | 0  1  0  j |   | (rTempW,iTempW) |   | (rTempY - iTempW, iTempY + rTempW) |
+    //----------------------------------------------------------------------------------
+    __forceinline void ButterflyDIT4_1 (_Inout_ XMVECTOR& r1, _Inout_ XMVECTOR& i1)
+    {
+        using namespace DirectX;
+
+        // sign constants for radix-4 butterflies
+        const static XMVECTORF32 vDFT4SignBits1 = {  1.0f, -1.0f,  1.0f, -1.0f };
+        const static XMVECTORF32 vDFT4SignBits2 = {  1.0f,  1.0f, -1.0f, -1.0f };
+        const static XMVECTORF32 vDFT4SignBits3 = {  1.0f, -1.0f, -1.0f,  1.0f };
+
+        // calculating Temp
+        // [r1X| r1X|r1Y| r1Y] + [r1Z|-r1Z|r1W|-r1W]
+        // [i1X| i1X|i1Y| i1Y] + [i1Z|-i1Z|i1W|-i1W]
+        XMVECTOR r1L = XMVectorSwizzle<0,0,1,1>( r1 );
+        XMVECTOR r1H = XMVectorSwizzle<2,2,3,3>( r1 );
+
+        XMVECTOR i1L = XMVectorSwizzle<0,0,1,1>( i1 );
+        XMVECTOR i1H = XMVectorSwizzle<2,2,3,3>( i1 );
+
+        XMVECTOR rTemp = XMVectorMultiplyAdd( r1H, vDFT4SignBits1, r1L );
+        XMVECTOR iTemp = XMVectorMultiplyAdd( i1H, vDFT4SignBits1, i1L );
+
+        // calculating Result
+        XMVECTOR rZrWiZiW = XMVectorPermute<2,3,6,7>(rTemp,iTemp);   // [rTempZ|rTempW|iTempZ|iTempW]
+        XMVECTOR rZiWrZiW = XMVectorSwizzle<0,3,0,3>(rZrWiZiW);      // [rTempZ|iTempW|rTempZ|iTempW]
+        XMVECTOR iZrWiZrW = XMVectorSwizzle<2,1,2,1>(rZrWiZiW);      // [iTempZ|rTempW|iTempZ|rTempW]
+
+        // [rTempX| rTempY| rTempX| rTempY] + [rTempZ| iTempW|-rTempZ|-iTempW]
+        // [iTempX| iTempY| iTempX| iTempY] + [iTempZ|-rTempW|-iTempZ| rTempW]
+        XMVECTOR rTempL = XMVectorSwizzle<0,1,0,1>(rTemp);
+        XMVECTOR iTempL = XMVectorSwizzle<0,1,0,1>(iTemp);
+
+        r1 = XMVectorMultiplyAdd( rZiWrZiW, vDFT4SignBits2, rTempL );
+        i1 = XMVectorMultiplyAdd( iZrWiZrW, vDFT4SignBits3, iTempL );
+    }
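Because ButterflyDIT4_1 is exactly a 4-point DFT, it can be sanity-checked against hand-computed values: for the real input [1, 2, 3, 4] the DFT is [10, -2+2j, -2, -2-2j], and a single butterfly produces those values in natural order. A sketch (illustrative names, assumes XDSP.h is included):

    #include <stdio.h>
    #include "XDSP.h"

    void TestButterfly4()
    {
        using namespace DirectX;
        XMVECTOR r = XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f); // real parts x0..x3
        XMVECTOR i = XMVectorZero();                      // imaginary parts
        XDSP::ButterflyDIT4_1(r, i);
        // Expect r = [10, -2, -2, -2] and i = [0, 2, 0, -2],
        // i.e. X = [10, -2+2j, -2, -2-2j]
        for (size_t k = 0; k < 4; ++k)
            printf("X%zu = %f %+fj\n", k, XMVectorGetByIndex(r, k), XMVectorGetByIndex(i, k));
    }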
+    //----------------------------------------------------------------------------------
+    // Radix-4 decimation-in-time FFT butterfly.
+    // This version assumes that elements of the butterfly are
+    // in different vectors, so that each vector in the input
+    // contains elements from four different butterflies.
+    // The four separate butterflies are processed in parallel.
+    //
+    // The calculations here are the same as the ones in the single-vector
+    // radix-4 DFT, but instead of being done on a single vector (X,Y,Z,W)
+    // they are done in parallel on sixteen independent complex values.
+    // There is no interdependence between the vector elements:
+    //  | 1  0  1  0 |   | (rIn0,iIn0) |          | (rIn0 + rIn2, iIn0 + iIn2) |
+    //  | 1  0 -1  0 | * | (rIn1,iIn1) | = Temp = | (rIn0 - rIn2, iIn0 - iIn2) |
+    //  | 0  1  0  1 |   | (rIn2,iIn2) |          | (rIn1 + rIn3, iIn1 + iIn3) |
+    //  | 0  1  0 -1 |   | (rIn3,iIn3) |          | (rIn1 - rIn3, iIn1 - iIn3) |
+    //
+    //            | 1  0  1  0 |   | (rTemp0,iTemp0) |   | (rTemp0 + rTemp2, iTemp0 + iTemp2) |
+    //  Result  = | 0  1  0 -j | * | (rTemp1,iTemp1) | = | (rTemp1 + iTemp3, iTemp1 - rTemp3) |
+    //            | 1  0 -1  0 |   | (rTemp2,iTemp2) |   | (rTemp0 - rTemp2, iTemp0 - iTemp2) |
+    //            | 0  1  0  j |   | (rTemp3,iTemp3) |   | (rTemp1 - iTemp3, iTemp1 + rTemp3) |
+    //----------------------------------------------------------------------------------
+    __forceinline void ButterflyDIT4_4 (_Inout_ XMVECTOR& r0,
+                                        _Inout_ XMVECTOR& r1,
+                                        _Inout_ XMVECTOR& r2,
+                                        _Inout_ XMVECTOR& r3,
+                                        _Inout_ XMVECTOR& i0,
+                                        _Inout_ XMVECTOR& i1,
+                                        _Inout_ XMVECTOR& i2,
+                                        _Inout_ XMVECTOR& i3,
+                                        _In_reads_(uStride*4) const XMVECTOR* __restrict pUnityTableReal,
+                                        _In_reads_(uStride*4) const XMVECTOR* __restrict pUnityTableImaginary,
+                                        _In_ size_t uStride,
+                                        _In_ const bool fLast)
+    {
+        using namespace DirectX;
+
+        assert(pUnityTableReal);
+        assert(pUnityTableImaginary);
+        assert((uintptr_t)pUnityTableReal % 16 == 0);
+        assert((uintptr_t)pUnityTableImaginary % 16 == 0);
+        assert(ISPOWEROF2(uStride));
+
+        // calculating Temp
+        XMVECTOR rTemp0 = XMVectorAdd(r0, r2);
+        XMVECTOR iTemp0 = XMVectorAdd(i0, i2);
+
+        XMVECTOR rTemp2 = XMVectorAdd(r1, r3);
+        XMVECTOR iTemp2 = XMVectorAdd(i1, i3);
+
+        XMVECTOR rTemp1 = XMVectorSubtract(r0, r2);
+        XMVECTOR iTemp1 = XMVectorSubtract(i0, i2);
+
+        XMVECTOR rTemp3 = XMVectorSubtract(r1, r3);
+        XMVECTOR iTemp3 = XMVectorSubtract(i1, i3);
+
+        XMVECTOR rTemp4 = XMVectorAdd(rTemp0, rTemp2);
+        XMVECTOR iTemp4 = XMVectorAdd(iTemp0, iTemp2);
+
+        XMVECTOR rTemp5 = XMVectorAdd(rTemp1, iTemp3);
+        XMVECTOR iTemp5 = XMVectorSubtract(iTemp1, rTemp3);
+
+        XMVECTOR rTemp6 = XMVectorSubtract(rTemp0, rTemp2);
+        XMVECTOR iTemp6 = XMVectorSubtract(iTemp0, iTemp2);
+
+        XMVECTOR rTemp7 = XMVectorSubtract(rTemp1, iTemp3);
+        XMVECTOR iTemp7 = XMVectorAdd(iTemp1, rTemp3);
+
+        // calculating Result
+        // vmulComplex(rTemp0, iTemp0, rTemp0, iTemp0, pUnityTableReal[0], pUnityTableImaginary[0]); // first one is always trivial
+        vmulComplex(rTemp5, iTemp5, pUnityTableReal[uStride],   pUnityTableImaginary[uStride]);
+        vmulComplex(rTemp6, iTemp6, pUnityTableReal[uStride*2], pUnityTableImaginary[uStride*2]);
+        vmulComplex(rTemp7, iTemp7, pUnityTableReal[uStride*3], pUnityTableImaginary[uStride*3]);
+
+        if (fLast)
+        {
+            ButterflyDIT4_1(rTemp4, iTemp4);
+            ButterflyDIT4_1(rTemp5, iTemp5);
+            ButterflyDIT4_1(rTemp6, iTemp6);
+            ButterflyDIT4_1(rTemp7, iTemp7);
+        }
+
+        r0 = rTemp4; i0 = iTemp4;
+        r1 = rTemp5; i1 = iTemp5;
+        r2 = rTemp6; i2 = iTemp6;
+        r3 = rTemp7; i3 = iTemp7;
+    }
+
+    //==================================================================================
+    // F-U-N-C-T-I-O-N-S
+    //==================================================================================
+
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  4-sample FFT.
+    //
+    // PARAMETERS:
+    //  pReal - [inout] real components, must have at least uCount elements
+    //  pImaginary - [inout] imaginary components, must have at least uCount elements
+    //  uCount - [in] number of FFT iterations
+    //----------------------------------------------------------------------------------
+    __forceinline void FFT4 (_Inout_updates_(uCount) XMVECTOR* __restrict pReal,
+                             _Inout_updates_(uCount) XMVECTOR* __restrict pImaginary,
+                             _In_ const size_t uCount=1)
+    {
+        assert(pReal);
+        assert(pImaginary);
+        assert((uintptr_t)pReal % 16 == 0);
+        assert((uintptr_t)pImaginary % 16 == 0);
+        assert(ISPOWEROF2(uCount));
+
+        for (size_t uIndex=0; uIndex < uCount; ++uIndex)
+        {
+            ButterflyDIT4_1(pReal[uIndex], pImaginary[uIndex]);
+        }
+    }
+
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  8-sample FFT.
+    //
+    // PARAMETERS:
+    //  pReal - [inout] real components, must have at least uCount*2 elements
+    //  pImaginary - [inout] imaginary components, must have at least uCount*2 elements
+    //  uCount - [in] number of FFT iterations
+    //----------------------------------------------------------------------------------
+    __forceinline void FFT8 (_Inout_updates_(uCount*2) XMVECTOR* __restrict pReal,
+                             _Inout_updates_(uCount*2) XMVECTOR* __restrict pImaginary,
+                             _In_ const size_t uCount=1)
+    {
+        using namespace DirectX;
+
+        assert(pReal);
+        assert(pImaginary);
+        assert((uintptr_t)pReal % 16 == 0);
+        assert((uintptr_t)pImaginary % 16 == 0);
+        assert(ISPOWEROF2(uCount));
+
+        static const XMVECTORF32 wr1 = {  1.0f,  0.70710677f,  0.0f, -0.70710677f };
+        static const XMVECTORF32 wi1 = {  0.0f, -0.70710677f, -1.0f, -0.70710677f };
+        static const XMVECTORF32 wr2 = { -1.0f, -0.70710677f,  0.0f,  0.70710677f };
+        static const XMVECTORF32 wi2 = {  0.0f,  0.70710677f,  1.0f,  0.70710677f };
+
+        for (size_t uIndex=0; uIndex < uCount; ++uIndex)
+        {
+            XMVECTOR* __restrict pR = pReal + uIndex*2;
+            XMVECTOR* __restrict pI = pImaginary + uIndex*2;
+
+            XMVECTOR oddsR  = XMVectorPermute<1,3,5,7>(pR[0], pR[1]);
+            XMVECTOR evensR = XMVectorPermute<0,2,4,6>(pR[0], pR[1]);
+            XMVECTOR oddsI  = XMVectorPermute<1,3,5,7>(pI[0], pI[1]);
+            XMVECTOR evensI = XMVectorPermute<0,2,4,6>(pI[0], pI[1]);
+            ButterflyDIT4_1(oddsR, oddsI);
+            ButterflyDIT4_1(evensR, evensI);
+
+            XMVECTOR r, i;
+            vmulComplex(r, i, oddsR, oddsI, wr1, wi1);
+            pR[0] = XMVectorAdd(evensR, r);
+            pI[0] = XMVectorAdd(evensI, i);
+
+            vmulComplex(r, i, oddsR, oddsI, wr2, wi2);
+            pR[1] = XMVectorAdd(evensR, r);
+            pI[1] = XMVectorAdd(evensI, i);
+        }
+    }
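Each 8-sample block occupies two XMVECTORs (samples 0-3 and 4-7), and the result comes out in bit-reversed order. A single-block usage sketch (buffer names illustrative) that restores natural frequency order with FFTUnswizzle, defined later in this header:

    #include "XDSP.h"

    void RunFFT8()
    {
        using namespace DirectX;
        XMVECTOR real[2] = { XMVectorSet(0.f, 1.f, 2.f, 3.f), XMVectorSet(4.f, 5.f, 6.f, 7.f) };
        XMVECTOR imag[2] = { XMVectorZero(), XMVectorZero() };
        XDSP::FFT8(real, imag);               // in-place, output bit-reversed
        XMVECTOR realOut[2], imagOut[2];
        XDSP::FFTUnswizzle(realOut, real, 3); // uLog2Length = 3 -> increasing frequency order
        XDSP::FFTUnswizzle(imagOut, imag, 3);
    }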
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  16-sample FFT.
+    //
+    // PARAMETERS:
+    //  pReal - [inout] real components, must have at least uCount*4 elements
+    //  pImaginary - [inout] imaginary components, must have at least uCount*4 elements
+    //  uCount - [in] number of FFT iterations
+    //----------------------------------------------------------------------------------
+    __forceinline void FFT16 (_Inout_updates_(uCount*4) XMVECTOR* __restrict pReal,
+                              _Inout_updates_(uCount*4) XMVECTOR* __restrict pImaginary,
+                              _In_ const size_t uCount=1)
+    {
+        using namespace DirectX;
+
+        assert(pReal);
+        assert(pImaginary);
+        assert((uintptr_t)pReal % 16 == 0);
+        assert((uintptr_t)pImaginary % 16 == 0);
+        assert(ISPOWEROF2(uCount));
+
+        static const XMVECTORF32 aUnityTableReal[4]      = { {  1.0f,  1.0f,         1.0f,             1.0f         },
+                                                             {  1.0f,  0.92387950f,  0.70710677f,      0.38268343f  },
+                                                             {  1.0f,  0.70710677f, -4.3711388e-008f, -0.70710677f  },
+                                                             {  1.0f,  0.38268343f, -0.70710677f,     -0.92387950f  } };
+        static const XMVECTORF32 aUnityTableImaginary[4] = { { -0.0f, -0.0f,         -0.0f,            -0.0f        },
+                                                             { -0.0f, -0.38268343f,  -0.70710677f,     -0.92387950f },
+                                                             { -0.0f, -0.70710677f,  -1.0f,            -0.70710677f },
+                                                             { -0.0f, -0.92387950f,  -0.70710677f,      0.38268343f } };
+
+        for (size_t uIndex=0; uIndex < uCount; ++uIndex)
+        {
+            ButterflyDIT4_4(pReal[uIndex*4],
+                            pReal[uIndex*4 + 1],
+                            pReal[uIndex*4 + 2],
+                            pReal[uIndex*4 + 3],
+                            pImaginary[uIndex*4],
+                            pImaginary[uIndex*4 + 1],
+                            pImaginary[uIndex*4 + 2],
+                            pImaginary[uIndex*4 + 3],
+                            reinterpret_cast<const XMVECTOR*>(aUnityTableReal),
+                            reinterpret_cast<const XMVECTOR*>(aUnityTableImaginary),
+                            1, true);
+        }
+    }
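The hard-coded tables above are just the 16-point twiddle factors cos(i*j*2*pi/16) and -sin(i*j*2*pi/16) for i, j in [0, 4). A quick way to see this (illustrative sketch):

    #include <math.h>
    #include <stdio.h>

    void PrintFFT16Twiddles()
    {
        const float flStep = 6.283185307f / 16.0f; // 2PI / 16, as in FFTInitializeUnityTable below
        for (int i = 0; i < 4; ++i)
        {
            for (int j = 0; j < 4; ++j)
                printf("(% .8f, % .8f) ", cosf(float(i)*float(j)*flStep), -sinf(float(i)*float(j)*flStep));
            printf("\n"); // row i reproduces aUnityTableReal[i] / aUnityTableImaginary[i]
        }
    }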
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  2^N-sample FFT.
+    //
+    // REMARKS:
+    //  For FFT lengths of 16 and below, call FFT16(), FFT8(), or FFT4() directly.
+    //
+    // PARAMETERS:
+    //  pReal - [inout] real components, must have at least (uLength*uCount)/4 elements
+    //  pImaginary - [inout] imaginary components, must have at least (uLength*uCount)/4 elements
+    //  pUnityTable - [in] unity table, must have at least uLength*uCount elements, see FFTInitializeUnityTable()
+    //  uLength - [in] FFT length in samples, must be a power of 2 > 16
+    //  uCount - [in] number of FFT iterations
+    //----------------------------------------------------------------------------------
+    inline void FFT (_Inout_updates_((uLength*uCount)/4) XMVECTOR* __restrict pReal,
+                     _Inout_updates_((uLength*uCount)/4) XMVECTOR* __restrict pImaginary,
+                     _In_reads_(uLength*uCount) const XMVECTOR* __restrict pUnityTable,
+                     _In_ const size_t uLength,
+                     _In_ const size_t uCount=1)
+    {
+        assert(pReal);
+        assert(pImaginary);
+        assert(pUnityTable);
+        assert((uintptr_t)pReal % 16 == 0);
+        assert((uintptr_t)pImaginary % 16 == 0);
+        assert((uintptr_t)pUnityTable % 16 == 0);
+        assert(uLength > 16);
+        _Analysis_assume_(uLength > 16);
+        assert(ISPOWEROF2(uLength));
+        assert(ISPOWEROF2(uCount));
+
+        const XMVECTOR* __restrict pUnityTableReal      = pUnityTable;
+        const XMVECTOR* __restrict pUnityTableImaginary = pUnityTable + (uLength>>2);
+        const size_t uTotal              = uCount * uLength;
+        const size_t uTotal_vectors      = uTotal >> 2;
+        const size_t uStage_vectors      = uLength >> 2;
+        const size_t uStage_vectors_mask = uStage_vectors - 1;
+        const size_t uStride             = uLength >> 4; // stride between butterfly elements
+        const size_t uStrideMask         = uStride - 1;
+        const size_t uStride2            = uStride * 2;
+        const size_t uStride3            = uStride * 3;
+        const size_t uStrideInvMask      = ~uStrideMask;
+
+        for (size_t uIndex=0; uIndex < (uTotal_vectors>>2); ++uIndex)
+        {
+            const size_t n = ((uIndex & uStrideInvMask) << 2) + (uIndex & uStrideMask);
+            ButterflyDIT4_4(pReal[n],
+                            pReal[n + uStride],
+                            pReal[n + uStride2],
+                            pReal[n + uStride3],
+                            pImaginary[n],
+                            pImaginary[n + uStride],
+                            pImaginary[n + uStride2],
+                            pImaginary[n + uStride3],
+                            pUnityTableReal + (n & uStage_vectors_mask),
+                            pUnityTableImaginary + (n & uStage_vectors_mask),
+                            uStride, false);
+        }
+
+        if (uLength > 16*4)
+        {
+            FFT(pReal, pImaginary, pUnityTable+(uLength>>1), uLength>>2, uCount*4);
+        }
+        else if (uLength == 16*4)
+        {
+            FFT16(pReal, pImaginary, uCount*4);
+        }
+        else if (uLength == 8*4)
+        {
+            FFT8(pReal, pImaginary, uCount*4);
+        }
+        else if (uLength == 4*4)
+        {
+            FFT4(pReal, pImaginary, uCount*4);
+        }
+    }
+
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  Initializes the unity roots lookup table used by the FFT functions.
+    //  Once initialized, the table need not be initialized again unless a
+    //  different FFT length is desired.
+    //
+    // REMARKS:
+    //  The unity tables of FFT length 16 and below are hard coded into the
+    //  respective FFT functions and so need not be initialized.
+    //
+    // PARAMETERS:
+    //  pUnityTable - [out] unity table, receives the unity roots lookup table, must have at least uLength elements
+    //  uLength - [in] FFT length in frames, must be a power of 2 > 16
+    //----------------------------------------------------------------------------------
+    inline void FFTInitializeUnityTable (_Out_writes_(uLength) XMVECTOR* __restrict pUnityTable, _In_ size_t uLength)
+    {
+        assert(pUnityTable);
+        assert(uLength > 16);
+        _Analysis_assume_(uLength > 16);
+        assert(ISPOWEROF2(uLength));
+
+        float* __restrict pfUnityTable = reinterpret_cast<float* __restrict>(pUnityTable);
+
+        // initialize unity table for recursive FFT lengths: uLength, uLength/4, uLength/16, ... > 16
+        do
+        {
+            float flStep = 6.283185307f / uLength; // 2PI / FFT length
+            uLength >>= 2;
+
+            // pUnityTable[0 to uLength*4-1] contains real components for the current FFT length
+            // pUnityTable[uLength*4 to uLength*8-1] contains imaginary components for the current FFT length
+            for (size_t i=0; i<4; ++i)
+            {
+                for (size_t j=0; j<uLength; ++j)
+                {
+                    size_t uIndex = (i*uLength) + j;
+                    pfUnityTable[uIndex]             =  cosf(float(i)*float(j)*flStep); // real component
+                    pfUnityTable[uIndex + uLength*4] = -sinf(float(i)*float(j)*flStep); // imaginary component
+                }
+            }
+            pfUnityTable += uLength*8;
+        }
+        while (uLength > 16);
+    }
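FFTInitializeUnityTable and FFT combine as follows for a single 64-point transform; the buffer sizes follow the parameter docs above (names illustrative, input loading elided):

    #include "XDSP.h"

    void RunFFT64()
    {
        using namespace DirectX;
        static XMVECTOR unityTable[64];     // uLength XMVECTOR elements
        static XMVECTOR real[16], imag[16]; // uLength/4 vectors each; XMVECTOR arrays are 16-byte aligned
        // ... load 64 samples into real[], zero imag[] ...
        XDSP::FFTInitializeUnityTable(unityTable, 64);
        XDSP::FFT(real, imag, unityTable, 64); // output is bit-reversed; see FFTUnswizzle below
    }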
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  The FFT functions generate output in bit-reversed order.
+    //  Use this function to re-arrange them into order of increasing frequency.
+    //
+    // PARAMETERS:
+    //  pOutput - [out] output buffer, receives samples in order of increasing frequency, cannot overlap pInput, must have at least (1<<uLog2Length)/4 elements
+    //  pInput - [in] input buffer, cannot overlap pOutput, must have at least (1<<uLog2Length)/4 elements
+    //  uLog2Length - [in] LOG (base 2) of FFT length in samples, must be >= 2
+    //----------------------------------------------------------------------------------
+    inline void FFTUnswizzle (_Out_writes_((1<<uLog2Length)/4) XMVECTOR* __restrict pOutput,
+                              _In_reads_((1<<uLog2Length)/4) const XMVECTOR* __restrict pInput,
+                              _In_ const size_t uLog2Length)
+    {
+        assert(pOutput);
+        assert(pInput);
+        assert(uLog2Length >= 2);
+        _Analysis_assume_(uLog2Length >= 2);
+
+        float* __restrict pfOutput      = (float* __restrict)pOutput;
+        const float* __restrict pfInput = (const float* __restrict)pInput;
+        const size_t uLength = size_t(1) << uLog2Length;
+
+        if ((uLog2Length & 0x1) == 0)
+        {
+            // even powers of two
+            for (size_t uIndex=0; uIndex < uLength; ++uIndex)
+            {
+                size_t n = uIndex;
+                n = ( (n & 0xcccccccc) >> 2 )  | ( (n & 0x33333333) << 2 );
+                n = ( (n & 0xf0f0f0f0) >> 4 )  | ( (n & 0x0f0f0f0f) << 4 );
+                n = ( (n & 0xff00ff00) >> 8 )  | ( (n & 0x00ff00ff) << 8 );
+                n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 );
+                n >>= (32 - uLog2Length);
+                pfOutput[n] = pfInput[uIndex];
+            }
+        }
+        else
+        {
+            // odd powers of two
+            for (size_t uIndex=0; uIndex < uLength; ++uIndex)
+            {
+                size_t n = (uIndex>>3);
+                n = ( (n & 0xcccccccc) >> 2 )  | ( (n & 0x33333333) << 2 );
+                n = ( (n & 0xf0f0f0f0) >> 4 )  | ( (n & 0x0f0f0f0f) << 4 );
+                n = ( (n & 0xff00ff00) >> 8 )  | ( (n & 0x00ff00ff) << 8 );
+                n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 );
+                n >>= (32 - (uLog2Length-3));
+                n |= ((uIndex & 0x7) << (uLog2Length - 3));
+                pfOutput[n] = pfInput[uIndex];
+            }
+        }
+    }
+
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  Convert complex components to polar form.
+    //
+    // PARAMETERS:
+    //  pOutput - [out] output buffer, receives samples in polar form, must have at least uLength/4 elements
+    //  pInputReal - [in] input buffer (real components), must have at least uLength/4 elements
+    //  pInputImaginary - [in] input buffer (imaginary components), must have at least uLength/4 elements
+    //  uLength - [in] FFT length in samples, must be a power of 2 >= 4
+    //----------------------------------------------------------------------------------
+#pragma warning(suppress: 6101)
+    inline void FFTPolar (_Out_writes_(uLength/4) XMVECTOR* __restrict pOutput,
+                          _In_reads_(uLength/4) const XMVECTOR* __restrict pInputReal,
+                          _In_reads_(uLength/4) const XMVECTOR* __restrict pInputImaginary,
+                          _In_ const size_t uLength)
+    {
+        using namespace DirectX;
+
+        assert(pOutput);
+        assert(pInputReal);
+        assert(pInputImaginary);
+        assert(uLength >= 4);
+        _Analysis_assume_(uLength >= 4);
+        assert(ISPOWEROF2(uLength));
+
+        float flOneOverLength = 1.0f / uLength;
+
+        // result = sqrtf((real/uLength)^2 + (imaginary/uLength)^2) * 2
+        XMVECTOR vOneOverLength = XMVectorReplicate( flOneOverLength );
+
+        for (size_t uIndex=0; uIndex < (uLength>>2); ++uIndex)
+        {
+            XMVECTOR vReal      = XMVectorMultiply(pInputReal[uIndex], vOneOverLength);
+            XMVECTOR vImaginary = XMVectorMultiply(pInputImaginary[uIndex], vOneOverLength);
+            XMVECTOR vRR        = XMVectorMultiply(vReal, vReal);
+            XMVECTOR vII        = XMVectorMultiply(vImaginary, vImaginary);
+            XMVECTOR vRRplusII  = XMVectorAdd(vRR, vII);
+            XMVECTOR vTotal     = XMVectorSqrt(vRRplusII);
+            pOutput[uIndex]     = XMVectorAdd(vTotal, vTotal);
+        }
+    }
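The final doubling in FFTPolar means a sinusoid's bin reads out its amplitude: a tone A*cos(2*pi*k*n/N) contributes magnitude A*N/2 at bins +/-k, so sqrt(r^2+i^2)/N * 2 = A. A 16-sample check (illustrative names, assumes XDSP.h is included):

    #include <math.h>
    #include <string.h>
    #include <stdio.h>
    #include "XDSP.h"

    void CheckPolarScaling()
    {
        using namespace DirectX;
        static XMVECTOR re[4], im[4], mag[4]; // 16 samples
        float* pf = reinterpret_cast<float*>(re);
        for (int n = 0; n < 16; ++n)
            pf[n] = 0.5f * cosf(6.283185307f * 2.0f * float(n) / 16.0f); // amplitude 0.5, bin 2
        memset(im, 0, sizeof(im));
        XDSP::FFT16(re, im);
        XDSP::FFTPolar(mag, re, im, 16);
        // Exactly two of the 16 magnitudes (bins +/-2, here in bit-reversed positions) read ~0.5;
        // the rest are ~0. The doubling makes the sinusoid's bin report its amplitude.
        for (int n = 0; n < 16; ++n)
            printf("%f\n", XMVectorGetByIndex(mag[n/4], size_t(n%4)));
    }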
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  Deinterleaves audio samples.
+    //
+    // REMARKS:
+    //  For example, audio of the form [LRLRLR] becomes [LLLRRR].
+    //
+    // PARAMETERS:
+    //  pOutput - [out] output buffer, receives samples in deinterleaved form, cannot overlap pInput, must have at least (uChannelCount*uFrameCount)/4 elements
+    //  pInput - [in] input buffer, cannot overlap pOutput, must have at least (uChannelCount*uFrameCount)/4 elements
+    //  uChannelCount - [in] number of channels, must be > 1
+    //  uFrameCount - [in] number of frames of valid data, must be > 0
+    //----------------------------------------------------------------------------------
+    inline void Deinterleave (_Out_writes_((uChannelCount*uFrameCount)/4) XMVECTOR* __restrict pOutput,
+                              _In_reads_((uChannelCount*uFrameCount)/4) const XMVECTOR* __restrict pInput,
+                              _In_ const size_t uChannelCount,
+                              _In_ const size_t uFrameCount)
+    {
+        assert(pOutput);
+        assert(pInput);
+        assert(uChannelCount > 1);
+        assert(uFrameCount > 0);
+
+        float* __restrict pfOutput      = reinterpret_cast<float* __restrict>(pOutput);
+        const float* __restrict pfInput = reinterpret_cast<const float* __restrict>(pInput);
+
+        for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+        {
+            for (size_t uFrame=0; uFrame < uFrameCount; ++uFrame)
+            {
+                pfOutput[uChannel * uFrameCount + uFrame] = pfInput[uFrame * uChannelCount + uChannel];
+            }
+        }
+    }
+
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  Interleaves audio samples.
+    //
+    // REMARKS:
+    //  For example, audio of the form [LLLRRR] becomes [LRLRLR].
+    //
+    // PARAMETERS:
+    //  pOutput - [out] output buffer, receives samples in interleaved form, cannot overlap pInput, must have at least (uChannelCount*uFrameCount)/4 elements
+    //  pInput - [in] input buffer, cannot overlap pOutput, must have at least (uChannelCount*uFrameCount)/4 elements
+    //  uChannelCount - [in] number of channels, must be > 1
+    //  uFrameCount - [in] number of frames of valid data, must be > 0
+    //----------------------------------------------------------------------------------
+    inline void Interleave (_Out_writes_((uChannelCount*uFrameCount)/4) XMVECTOR* __restrict pOutput,
+                            _In_reads_((uChannelCount*uFrameCount)/4) const XMVECTOR* __restrict pInput,
+                            _In_ const size_t uChannelCount,
+                            _In_ const size_t uFrameCount)
+    {
+        assert(pOutput);
+        assert(pInput);
+        assert(uChannelCount > 1);
+        assert(uFrameCount > 0);
+
+        float* __restrict pfOutput      = reinterpret_cast<float* __restrict>(pOutput);
+        const float* __restrict pfInput = reinterpret_cast<const float* __restrict>(pInput);
+
+        for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+        {
+            for (size_t uFrame=0; uFrame < uFrameCount; ++uFrame)
+            {
+                pfOutput[uFrame * uChannelCount + uChannel] = pfInput[uChannel * uFrameCount + uFrame];
+            }
+        }
+    }
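Both routines address their buffers as flat float arrays, so uChannelCount*uFrameCount must be a multiple of 4 to fill whole XMVECTORs. A stereo round trip (illustrative):

    #include "XDSP.h"

    void StereoRoundTrip()
    {
        using namespace DirectX;
        static XMVECTOR interleaved[4], split[4], restored[4]; // 8 frames * 2 channels = 16 floats
        // interleaved[] holds L0 R0 L1 R1 ... L7 R7
        XDSP::Deinterleave(split, interleaved, 2, 8); // -> L0..L7 R0..R7
        XDSP::Interleave(restored, split, 2, 8);      // -> back to L0 R0 ... L7 R7
    }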
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  This function applies a 2^N-sample FFT and unswizzles the result such
+    //  that the samples are in order of increasing frequency.
+    //  Audio is first deinterleaved if multichannel.
+    //
+    // PARAMETERS:
+    //  pReal - [inout] real components, must have at least (1<<uLog2Length)*uChannelCount/4 elements
+    //  pImaginary - [out] imaginary components, must have at least (1<<uLog2Length)*uChannelCount/4 elements
+    //  pUnityTable - [in] unity table, must have at least (1<<uLog2Length) elements, see FFTInitializeUnityTable()
+    //  uChannelCount - [in] number of channels, must be within [1, 6]
+    //  uLog2Length - [in] LOG (base 2) of FFT length in frames, must be within [2, 9]
+    //----------------------------------------------------------------------------------
+    inline void FFTInterleaved (_Inout_updates_(((1<<uLog2Length)*uChannelCount)/4) XMVECTOR* __restrict pReal,
+                                _Out_writes_(((1<<uLog2Length)*uChannelCount)/4) XMVECTOR* __restrict pImaginary,
+                                _In_reads_(1<<uLog2Length) const XMVECTOR* __restrict pUnityTable,
+                                _In_ const size_t uChannelCount,
+                                _In_ const size_t uLog2Length)
+    {
+        assert(pReal);
+        assert(pImaginary);
+        assert(pUnityTable);
+        assert((uintptr_t)pReal % 16 == 0);
+        assert((uintptr_t)pImaginary % 16 == 0);
+        assert((uintptr_t)pUnityTable % 16 == 0);
+        assert(uChannelCount > 0 && uChannelCount <= 6);
+        assert(uLog2Length >= 2 && uLog2Length <= 9);
+
+        XMVECTOR vRealTemp[768];
+        XMVECTOR vImaginaryTemp[768];
+        const size_t uLength = size_t(1) << uLog2Length;
+
+        if (uChannelCount > 1)
+        {
+            Deinterleave(vRealTemp, pReal, uChannelCount, uLength);
+        }
+        else
+        {
+            memcpy_s(vRealTemp, sizeof(vRealTemp), pReal, (uLength>>2)*sizeof(XMVECTOR));
+        }
+
+        memset( vImaginaryTemp, 0, (uChannelCount*(uLength>>2)) * sizeof(XMVECTOR) );
+
+        if (uLength > 16)
+        {
+            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)], pUnityTable, uLength);
+            }
+        }
+        else if (uLength == 16)
+        {
+            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT16(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
+            }
+        }
+        else if (uLength == 8)
+        {
+            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT8(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
+            }
+        }
+        else if (uLength == 4)
+        {
+            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT4(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
+            }
+        }
+
+        for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+        {
+            FFTUnswizzle(&pReal[uChannel*(uLength>>2)], &vRealTemp[uChannel*(uLength>>2)], uLog2Length);
+            FFTUnswizzle(&pImaginary[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)], uLog2Length);
+        }
+    }
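A typical forward analysis of interleaved stereo, with a unity table sized to the FFT length (names illustrative, sample loading elided):

    #include "XDSP.h"

    void AnalyzeStereo()
    {
        using namespace DirectX;
        const size_t log2Len = 8, len = size_t(1) << log2Len; // 256 frames
        static XMVECTOR unityTable[256];
        static XMVECTOR real[(256 * 2) / 4]; // (1<<uLog2Length)*uChannelCount/4 elements
        static XMVECTOR imag[(256 * 2) / 4];
        // real[] holds interleaved stereo samples on input
        XDSP::FFTInitializeUnityTable(unityTable, len);
        XDSP::FFTInterleaved(real, imag, unityTable, 2, log2Len);
        // real/imag now hold per-channel spectra in increasing-frequency order
    }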
+    //----------------------------------------------------------------------------------
+    // DESCRIPTION:
+    //  This function applies a 2^N-sample inverse FFT.
+    //  Audio is interleaved if multichannel.
+    //
+    // PARAMETERS:
+    //  pReal - [inout] real components, must have at least (1<<uLog2Length)*uChannelCount/4 elements
+    //  pImaginary - [in] imaginary components, must have at least (1<<uLog2Length)*uChannelCount/4 elements
+    //  pUnityTable - [in] unity table, must have at least (1<<uLog2Length) elements, see FFTInitializeUnityTable()
+    //  uChannelCount - [in] number of channels, must be > 0
+    //  uLog2Length - [in] LOG (base 2) of FFT length in frames, must be within [2, 9]
+    //----------------------------------------------------------------------------------
+    inline void IFFTDeinterleaved (_Inout_updates_(((1<<uLog2Length)*uChannelCount)/4) XMVECTOR* __restrict pReal,
+                                   _In_reads_(((1<<uLog2Length)*uChannelCount)/4) const XMVECTOR* __restrict pImaginary,
+                                   _In_reads_(1<<uLog2Length) const XMVECTOR* __restrict pUnityTable,
+                                   _In_ const size_t uChannelCount,
+                                   _In_ const size_t uLog2Length)
+    {
+        using namespace DirectX;
+
+        assert(pReal);
+        assert(pImaginary);
+        assert(pUnityTable);
+        assert((uintptr_t)pReal % 16 == 0);
+        assert((uintptr_t)pImaginary % 16 == 0);
+        assert((uintptr_t)pUnityTable % 16 == 0);
+        assert(uChannelCount > 0 && uChannelCount <= 6);
+        _Analysis_assume_(uChannelCount > 0 && uChannelCount <= 6);
+        assert(uLog2Length >= 2 && uLog2Length <= 9);
+        _Analysis_assume_(uLog2Length >= 2 && uLog2Length <= 9);
+
+        XMVECTOR vRealTemp[768]      = { 0 };
+        XMVECTOR vImaginaryTemp[768] = { 0 };
+
+        const size_t uLength = size_t(1) << uLog2Length;
+
+        const XMVECTOR vRnp = XMVectorReplicate(1.0f/uLength);
+        const XMVECTOR vRnm = XMVectorReplicate(-1.0f/uLength);
+        for (size_t u=0; u < uChannelCount*(uLength>>2); u++)
+        {
+            vRealTemp[u]      = XMVectorMultiply(pReal[u], vRnp);
+            vImaginaryTemp[u] = XMVectorMultiply(pImaginary[u], vRnm);
+        }
+
+        if (uLength > 16)
+        {
+            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)], pUnityTable, uLength);
+            }
+        }
+        else if (uLength == 16)
+        {
+            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT16(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
+            }
+        }
+        else if (uLength == 8)
+        {
+            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT8(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
+            }
+        }
+        else if (uLength == 4)
+        {
+            for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+            {
+                FFT4(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
+            }
+        }
+
+        for (size_t uChannel=0; uChannel < uChannelCount; ++uChannel)
+        {
+            FFTUnswizzle(&vImaginaryTemp[uChannel*(uLength>>2)], &vRealTemp[uChannel*(uLength>>2)], uLog2Length);
+        }
+
+        if (uChannelCount > 1)
+        {
+            Interleave(pReal, vImaginaryTemp, uChannelCount, uLength);
+        }
+        else
+        {
+            memcpy_s(pReal, uLength*uChannelCount*sizeof(float), vImaginaryTemp, (uLength>>2)*sizeof(XMVECTOR));
+        }
+    }
+
+}; // namespace XDSP
+
+#pragma warning(pop)
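The scaling loop at the top of IFFTDeinterleaved is the conjugation identity IFFT(X) = conj(FFT(conj(X)))/N in disguise: multiplying the imaginary parts by -1/N conjugates and pre-scales the spectrum, the forward FFT machinery does the rest, and only the (real) time-domain result is kept. Consequently, for a real input signal, FFTInterleaved followed by IFFTDeinterleaved should reproduce the input to within float rounding; a sketch (illustrative):

    #include "XDSP.h"

    void RoundTrip()
    {
        using namespace DirectX;
        const size_t log2Len = 8, len = size_t(1) << log2Len;
        static XMVECTOR unityTable[256];
        static XMVECTOR real[256 / 4], imag[256 / 4]; // mono
        // real[] holds 256 time-domain samples
        XDSP::FFTInitializeUnityTable(unityTable, len);
        XDSP::FFTInterleaved(real, imag, unityTable, 1, log2Len);   // forward
        XDSP::IFFTDeinterleaved(real, imag, unityTable, 1, log2Len); // inverse
        // real[] now holds the original samples again (within float rounding)
    }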